In [None]:
import json
import pandas as pd
import re 


In [None]:
#parse the exported records straight from the open file handle
with open('majorprogram3.json', encoding="utf8") as data_file:
    programs_j = json.load(data_file)

#show how many records the export contains
len(programs_j["records"])

In [None]:
#peek at the first record to see which fields it carries
print(programs_j["records"][0])

In [None]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that throws away the markup and keeps only the text.

    Feed it HTML with feed(), then read the accumulated text with get_data().
    Character references (e.g. '&amp;') are converted to their characters
    because convert_charrefs is enabled.
    """

    def __init__(self):
        #run the base-class constructor instead of only reset(): it sets up
        #parser state that reset() alone does not, and calls reset() itself
        super().__init__(convert_charrefs=True)
        #kept from the original code; not read anywhere in this class
        self.strict = False
        #text chunks collected by handle_data, in document order
        self.fed = []

    def handle_data(self, d):
        """Collect one chunk of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)

def strip_tags(html):
    """Return the text content of an HTML string with whitespace tidied.

    The markup is removed with MLStripper and the remaining text is passed
    through strip_extra_space.
    """
    stripper = MLStripper()
    stripper.feed(html)
    #feed() may keep incomplete data buffered (e.g. text ending in an
    #unterminated '&'); close() forces it to be processed so it is not lost
    stripper.close()
    text = stripper.get_data()
    #after stripping html, strip the extra white space junk
    return strip_extra_space(text)

def strip_extra_space(inText):
    """Collapse whitespace in inText while keeping paragraph breaks.

    - Every run of whitespace other than newlines becomes a single space
      (and is dropped entirely at the very start and end of the text).
    - A lone newline becomes a space.
    - A run of two or more newlines becomes one newline; if that newline is
      followed by spaces, the newline and the spaces become two newlines.

    Newlines are processed directly rather than being swapped for a '^'
    placeholder, so any '^' already present in the text is left untouched.
    """
    #collapse runs of non-newline whitespace; newlines stay part of the words
    words = [w for w in re.split(r'[^\S\n]+', inText) if w]
    s = ' '.join(words)

    #2+ newlines mark a paragraph break, a single newline is just a space
    s = re.sub(r'\n+', lambda m: '\n' if len(m.group()) > 1 else ' ', s)
    #replace newline and space with 2 newlines
    s = re.sub(r'\n +', '\n\n', s)

    return s

In [None]:
#try the html stripper on one record
#NOTE(review): this reads 'field_174_raw' while the export loop reads
#'field_174' -- presumably two forms of the same situation statement; confirm
first = programs_j["records"][0]
print(strip_tags(first['field_174_raw']))



In [None]:
def get_attribute(data, attribute, default_value):
    """Look up attribute in data, falling back to default_value.

    The fallback is used when the key is absent and also when the stored
    value is falsy (None, '', [], 0, ...).
    """
    value = data.get(attribute)
    if value:
        return value
    return default_value

In [None]:
#text fields that are exported twice: as stored (html) and with the html
#stripped; each entry is (record key, column name)
TEXT_FIELDS = [
    ('field_174', 'Situation Statement'),
    ('field_175', 'Target Audience'),
    ('field_176', 'Outcome'),
    ('field_178', 'Action Plan'),
    ('field_191', 'Professional Development'),
    ('field_179', 'Evaluation Plan'),
]


def _joined_identifiers(record, field):
    """Comma-join the 'identifier' of every entry in record[field].

    The field is expected to hold a list of dicts; a missing or empty field
    gives ' ', the placeholder this export uses for blanks.
    """
    entries = get_attribute(record, field, [])
    identifiers = [entry['identifier'] for entry in entries]
    return ','.join(identifiers) if identifiers else ' '


def _first_identifier(record, field):
    """Return the 'identifier' of the first entry in record[field], or ' '."""
    entries = get_attribute(record, field, [])
    return entries[0]['identifier'] if entries else ' '


#create a list to hold cleaned up stuff, one inner list per program
listOfLists = []
for p in programs_j["records"]:
    thisProgram = [
        p['id'],
        #the title (field 169)
        p['field_169'],
        #the author is field 170, and the name is buried in the raw identifier;
        #a record without an author gets a blank instead of stopping the export
        _first_identifier(p, 'field_170_raw'),
        #field 180 is the home office location, and can be multiple or blank
        _joined_identifiers(p, 'field_180_raw'),
        #field 232 is "Does this cover statewide work?"
        p['field_232'],
        #field 233 is "Does this cover multi-state work?"
        p['field_233'],
        #field 171 is institute and can be multiple or blank
        _joined_identifiers(p, 'field_171_raw'),
        #field 172 is program1 and can be multiple or blank
        _joined_identifiers(p, 'field_172_raw'),
        #field 173 is program2 and can be multiple
        _joined_identifiers(p, 'field_173_raw'),
    ]

    for field, _label in TEXT_FIELDS:
        html = get_attribute(p, field, ' ')
        #keep the original text and also provide it without html
        thisProgram.extend([html, strip_tags(html)])

    listOfLists.append(thisProgram)

#column names, in the same order the values were appended above
columns = ['ID', 'Title', 'Author', 'Home Office', 'Statewide Work',
           'Multi-State Work', 'Institute', 'Program Level 1', 'Program Level 2']
for _field, label in TEXT_FIELDS:
    columns.extend([label, label + ' Clean'])

#make a dataframe
df = pd.DataFrame(listOfLists, columns=columns)
df

In [None]:
#write the table as a pipe-delimited file; index is not disabled, so the
#DataFrame's row index is written too, as the first (unnamed) column
df.to_csv('PlansOfWork.csv', sep='|')
