In [1]:
import json
import pandas as pd
import re 


In [2]:
#read the exported records file and parse it straight from the file handle
with open('majorprogram3.json', encoding="utf8") as data_file:
    programs_j = json.load(data_file)

#show how many records were loaded
len(programs_j["records"])

414

In [3]:
#show the first record to see which fields each program carries
print(programs_j["records"][0])

{'id': '5e34a13407c4f40015253bb4', 'field_169': ' Asset mapping for community betterment (2020)', 'field_169_raw': ' Asset mapping for community betterment (2020)', 'field_170': '<span class="5c37a44afa4f340862abcf30">Barry Hottmann</span>', 'field_170_raw': [{'id': '5c37a44afa4f340862abcf30', 'identifier': 'Barry Hottmann'}], 'field_180': '<span class="5a61fa2c34767a37ac9fe96c">Iowa</span>', 'field_180_raw': [{'id': '5a61fa2c34767a37ac9fe96c', 'identifier': 'Iowa'}], 'field_171': '<span class="5a61fbeaf5cc965888d0800f">Community Development</span>', 'field_171_raw': [{'id': '5a61fbeaf5cc965888d0800f', 'identifier': 'Community Development'}], 'field_172': '<span class="5a61fc11ade5a26b5fc94b0f">Community Economic Development</span>', 'field_172_raw': [{'id': '5a61fc11ade5a26b5fc94b0f', 'identifier': 'Community Economic Development'}], 'field_173': '', 'field_173_raw': [], 'field_174': '<p class="MsoNormal">Social capital is the combination of networks, engagement,\nand willingness to i

In [4]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that throws away the markup and keeps only the text.

    Text chunks are accumulated in ``self.fed`` as the parser meets them;
    ``get_data()`` joins them into one string. The parser runs with
    ``convert_charrefs=True``, so character references such as ``&amp;``
    arrive already converted to their characters.
    """

    def __init__(self):
        #let HTMLParser set up all of its own state (it calls reset() itself);
        #calling reset() alone skips anything the base constructor initialises
        super().__init__(convert_charrefs=True)
        #legacy attribute, retained so existing attribute access keeps working
        self.strict = False
        #text chunks collected so far, in document order
        self.fed = []

    def handle_data(self, d):
        """Store one chunk of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return everything collected so far as a single string."""
        return ''.join(self.fed)

def strip_tags(html):
    """Return the text content of an html string with whitespace tidied.

    The markup is removed with MLStripper and the remaining text is then
    passed through strip_extra_space.

    html -- string that may contain html tags and character references
    returns -- the cleaned-up plain text
    """
    parser = MLStripper()
    parser.feed(html)
    #close() makes the parser hand over any text it is still holding back;
    #without it, input ending in an unterminated "&" (e.g. "R&D") is dropped
    parser.close()
    text = parser.get_data()
    #after stripping html, strip the extra white space junk
    return strip_extra_space(text)

def strip_extra_space(inText):
    """Collapse runs of whitespace while keeping paragraph breaks.

    Steps, in order:
      * every newline is swapped for a temporary marker
      * the remaining whitespace is collapsed to single spaces and trimmed
        from both ends
      * two or more consecutive markers become one newline
      * a lone marker becomes a space
      * a newline followed by spaces becomes two newlines

    The marker is a NUL character rather than '^', so carets that are part
    of the text (e.g. "x^2") are left alone. A NUL already present in the
    input would be treated like a newline.

    inText -- the string to tidy
    returns -- the tidied string
    """
    marker = '\x00'
    marker_pattern = re.escape(marker)

    #stand-in for new lines, so that split() below does not swallow them
    s = inText.replace('\n', marker)
    #get rid of extra white space
    s = ' '.join(s.split())

    #replace 2 or more markers with a new line again
    s = re.sub(marker_pattern + r'{2,}', '\n', s)
    #replace a single marker with a space
    s = s.replace(marker, ' ')
    #replace newline and spaces with 2 newlines
    s = re.sub(r'\n +', '\n\n', s)

    return s

In [5]:
#try the html stripping on the first record's field_174_raw value
first = programs_j["records"][0]
print(strip_tags(first['field_174_raw']))



Social capital is the combination of networks, engagement, and willingness to invest in community, thus allowing residents to work together for the community’s mutual benefit.  Though difficult to measure directly, secondary data can serve as a proxy and provide some insights. Of the four measures analyzed in the WIndicator – Social Capital in Wisconsin [1], the focus for this programming is the number of associations (networks are built through interaction) and the number of non-profits (a signal of engaged, mission-driven community members) within our communities.Associations provide members of the community the opportunity to interact with one another, building and strengthening both networks and, perhaps, trust within the county. In addition, the creation of these associations reflects the willingness and ability of people within the community to invest in themselves and the wider community. Like that of associations, non-profits are also opportunities to network and invest in a co

In [6]:
def get_attribute(data, attribute, default_value):
    """Look up `attribute` in `data`, substituting `default_value` for blanks.

    The default is returned both when the key is absent and when the stored
    value is falsy (empty string, empty list, None, 0, ...).
    """
    value = data.get(attribute)
    if value:
        return value
    return default_value

In [7]:
def _first_identifier(record, field):
    """Return the 'identifier' of the first entry of a raw connection field.

    Falls back to ' ' when the field is missing or empty, which is how the
    other connection columns represent a blank value.
    """
    entries = get_attribute(record, field, [])
    return entries[0]['identifier'] if entries else ' '


def _joined_identifiers(record, field):
    """Return all 'identifier' values of a raw connection field joined by commas.

    Returns ' ' when the field is missing or has no entries.
    """
    names = [entry['identifier'] for entry in get_attribute(record, field, [])]
    return ','.join(names) if names else ' '


def _html_and_clean(record, field):
    """Return [value, value without html] for a rich-text field (' ' if blank)."""
    html = get_attribute(record, field, ' ')
    return [html, strip_tags(html)]


#rich-text fields in output order; each one yields a raw column and a "Clean" column
_RICH_TEXT_FIELDS = [
    ('field_174', 'Situation Statement'),
    ('field_175', 'Target Audience'),
    ('field_176', 'Outcome'),
    ('field_178', 'Action Plan'),
    ('field_191', 'Professional Development'),
    ('field_179', 'Evaluation Plan'),
]

#create a list to hold cleaned up stuff
listOfLists = []
for p in programs_j["records"]:
    thisProgram = [
        p['id'],
        #the title (field 169)
        p['field_169'],
        #the author is field 170, and the name is buried in the raw identifier
        _first_identifier(p, 'field_170_raw'),
        #field 180 is the home office location, and can be multiple or blank
        _joined_identifiers(p, 'field_180_raw'),
        #field 232 is "Does this cover statewide work?"
        p['field_232'],
        #field 233 is "Does this cover multi-state work?"
        p['field_233'],
        #field 171 is institute and can be multiple or blank
        _joined_identifiers(p, 'field_171_raw'),
        #field 172 is program1 and can be multiple or blank
        _joined_identifiers(p, 'field_172_raw'),
        #field 173 is program2 and can be multiple
        _joined_identifiers(p, 'field_173_raw'),
    ]
    #each rich-text field is provided as-is and also without html
    for field, _label in _RICH_TEXT_FIELDS:
        thisProgram.extend(_html_and_clean(p, field))

    listOfLists.append(thisProgram)

#column names: the fixed columns, then a raw/"Clean" pair per rich-text field
columns = ['ID', 'Title', 'Author', 'Home Office', 'Statewide Work',
           'Multi-State Work', 'Institute', 'Program Level 1', 'Program Level 2']
for _field, label in _RICH_TEXT_FIELDS:
    columns.extend([label, label + ' Clean'])

#make a dataframe
df = pd.DataFrame(listOfLists, columns=columns)
df

Unnamed: 0,ID,Title,Author,Home Office,Statewide Work,Multi-State Work,Institute,Program Level 1,Program Level 2,Situation Statement,...,Target Audience,Target Audience Clean,Outcome,Outcome Clean,Action Plan,Action Plan Clean,Professional Development,Professional Development Clean,Evaluation Plan,Evaluation Plan Clean
0,5e34a13407c4f40015253bb4,Asset mapping for community betterment (2020),Barry Hottmann,Iowa,No,No,Community Development,Community Economic Development,,"<p class=""MsoNormal"">Social capital is the com...",...,<p>Community leaders interested in building ca...,Community leaders interested in building capac...,<p>When completed in full (which may take seve...,When completed in full (which may take several...,<p><b>Short-term (mid 2019 – late 2020)</b></p...,Short-term (mid 2019 – late 2020)Develop a cor...,<ul><li>Employ the resources of John McKnight ...,Employ the resources of John McKnight from the...,"<ul><li>The development of a list, and the she...","The development of a list, and the sheer numbe..."
1,5e2f1e9813223b00165f9fb5,Building the capacity of emerging leaders to ...,Jennifer Erickson,Sauk,No,No,Community Development,Organizational and Leadership Development,,<p>Communities are most effective for the citi...,...,<p>Sauk County Institute of\nLeadership (SCIL)...,Sauk County Institute of Leadership (SCIL) boa...,<p>SCIL participants will indicate\nthrough ev...,SCIL participants will indicate through evalua...,"<p><u>First Quarter</u></p>\n\n<p class=""MsoLi...","First Quarter\n· Develop January, February an...",<p>I try to take advantage of any opportunity ...,I try to take advantage of any opportunity tha...,"<p>· \nFollowing\neach SCIL session, pa...","· Following each SCIL session, participants c..."
2,5e349034eef79c00157721e8,"Educational Outreach in Efficient, Profitable...",Nick Baker,Rock,Yes,No,Agriculture,"Crops and Soils,Farm Management",,"<p><b> </b>There are over\n1,500 farms in Rock...",...,<p>The intended Audience will be any and all c...,The intended Audience will be any and all crop...,<p>Local grain producers and agricultural\npro...,Local grain producers and agricultural profess...,"<p>As technology increases, production systems...","As technology increases, production systems ch...",<p>Having a strong research background has aid...,Having a strong research background has aided ...,<p>Field based research programing will be eva...,Field based research programing will be evalua...
3,5e320b9c1595c1001516cea7,"""My Woods"" Pilot Program- Mell",Randy Mell,La Crosse,Yes,No,Natural Resources,Land and Water Program,Regional Natural Resources Education Program,<p>Leading the nation in paper production and ...,...,"<p class=""MsoNormal"" style=""margin-bottom:0in;...",Program Priority #1 & Target Audience(s): Agri...,<p><b>Program\nPriority #1 Outcomes:</b></p>\n...,Program Priority #1 Outcomes:\nLong Term Condi...,<p>Outcomes (short and medium term outcomes):<...,Outcomes (short and medium term outcomes):\nA ...,I will\nneed someone from the Natural Resource...,I will need someone from the Natural Resource ...,<p>The focus group\ninformation to be collecte...,The focus group information to be collected an...
4,5e33885907c4f4001523bf05,(2020) Behavioral Health Programmaing,Jennifer Park-Mroch,Administration/Operations (non-programming sta...,Yes,Yes,Health and Well-Being,Behavioral Health,,"<p>Youth Mental Health</p><span id=""docs-inter...",...,"<span id=""docs-internal-guid-2ff08735-7fff-2be...",Program PrioritiesYouth Mental Health TeamProv...,"<p>Program Priority #1 </p><span id=""docs-inte...","Program Priority #1 By the end of 2021, the Le...","<p>Program Priority #1</p><span id=""docs-inter...",Program Priority #1Continue to monitor how the...,<p>I would like to learn what other community ...,I would like to learn what other community bas...,<p>Administration Records will be used to dete...,Administration Records will be used to determi...
5,5e345c4ceef79c001576b770,2019 Year in Review,M Carol McCartney,UW Madison-Based,Yes,No,Natural Resources,Wisconsin Geological and Natural History Survey,,The Year in Review is an annual report of WGNH...,...,"UW-Madison Division of Extension management, l...","UW-Madison Division of Extension management, l...","""Identify yourself and stae your value"" Marv V...","""Identify yourself and stae your value"" Marv V...",Direct email distribution of link to online ve...,Direct email distribution of link to online ve...,<p>Esri has changed their sotry map platform a...,Esri has changed their sotry map platform and ...,<p>I would be able to use them and explain to ...,I would be able to use them and explain to WGN...
6,5e31e2e812eaa90015feea2b,2020 - Expanding Access,Anna DeMers,Washburn,No,No,Positive Youth Development,Community Youth Development,,"<p class=""MsoNormal"">Wisconsin’s racial minori...",...,<p>-all students in Washburn County</p><p>-adu...,-all students in Washburn County-adults that w...,"<p class=""MsoNormal""><u>Individual Level Outco...",Individual Level Outcomes\nAs a result of (or ...,"<p class=""MsoListParagraphCxSpFirst"" style=""te...",· Identify and reach Hispanic youth in Washbu...,"<p class=""MsoNormal"" style=""margin-bottom:12.0...",Gain a better understanding of the underserved...,"<p class=""MsoListParagraphCxSpFirst"" style=""te...","· Interviews with partners, including St. Cro..."
7,5e1655a2688b2e001872e4ef,2020 Capacity Building and Leadership with Wat...,Dale Mohr,Oconto,No,No,Community Development,Community Economic Development,,"<p style=""margin: 0in 0.25in 0.0001pt; backgro...",...,<p><b><u>Program Priority #1</u></b><b><u> Cou...,Program Priority #1 County Wide Lakes Manageme...,"<p class=""MsoNormal"" style=""margin-bottom: 0.0...",OutcomesProgram Priority #1 Outcomes: A develo...,<p><b>Action Plan\n(Response/Planned Activitie...,Action Plan (Response/Planned Activities)Progr...,<p>· \nIt is envisioned that\nPD would i...,· It is envisioned that PD would include any ...,"<p><b>Evaluation Plan</b></p><p class=""MsoNorm...",Evaluation PlanProgram Priority #1 Evaluation ...
8,5e16592d31f02f001a7ece5b,2020 Continue Multi-Year Program: Restoritive ...,Dale Mohr,Oconto,No,No,Community Development,Organizational and Leadership Development,,"<p class=""MsoNormal"" style=""margin: 0in 12.75p...",...,"<p class=""MsoNormal"" style=""margin-bottom: 0.0...",Target AudienceProgram Priority #1 TEEN COURT ...,"<p class=""MsoNormal"" style=""margin: 0in 2.9pt ...",Program Priority Outcomes: A well-established ...,"<p class=""MsoNormal"" style=""margin-bottom: 0.0...",Action Plan (Response/Planned Activities)Progr...,"<p>Seek out programs on\nConflict Resolution, ...","Seek out programs on Conflict Resolution, TEEN...","<p class=""MsoNormal"" style=""margin: 0in -6.25p...",Program Priority Evaluation Plan· Conduct a p...
9,5e336e6db2cef8001736007d,2020 Green Industry Programming,Lisa Johnson,Dane,Yes,No,Agriculture,Horticulture,,<p>Green Industry\n(commercial horticulture) p...,...,<p>The target\naudience encompasses a wide ran...,The target audience encompasses a wide range o...,"<p class=""MsoNormal"" style=""margin-bottom:12.0...",· Increase knowledge levels and meet needs suc...,"<p>· <span lang=""EN"">I will plan, coordinate, ...","· I will plan, coordinate, host and advertise ...",<p>I will work with Language Access and Suppor...,I will work with Language Access and Support t...,"<p>1. \n<span lang=""EN"">Local survey at\nt...",1. Local survey at the end of class for knowl...


In [8]:
#save the dataframe as a pipe-delimited text file
df.to_csv(path_or_buf='PlansOfWork.csv', sep='|')
