In [60]:
#File:getdata.ipynb
#Author: Rafer Cooley
#Desc:notebook for functions associated with loading raw data into pandas dataframes
from IPython.display import HTML
import pandas as pd
import os, sys
import base64 #for createdownloadlink

## Input file locations -- every path is built relative to data_folder
data_folder = '../new-data/'

# Census population and OJJDP demographic tables
census_folder = data_folder + 'census-info/'
population_data_file = census_folder + 'census-factfinder/census-cleaned.csv'
demographic_data_file = data_folder + 'ojjdp/population-data/combined.csv'

# DCI yearly exports
dci_data_folder = data_folder + 'DCI/all-combined/'
dci_index_crimes = dci_data_folder + 'index-offenses/'

# DFS exports (AFCARS, placements by county / by PLC, combined placements)
_dfs_folder = data_folder + 'DFS/'
dfs_afcars_data_file = _dfs_folder + 'DFS_12-16-AFCARS-CLEAN.csv'
dfs_county_data_file = _dfs_folder + 'DFS_12-16-Placements-ByCounty-CLEAN.csv'
dfs_plc_data_file = _dfs_folder + 'DFS_12-16-Placements-ByPLC-CLEAN.csv'
placements_folder = _dfs_folder + 'placements/'
placements_data_file = placements_folder + 'combined1.csv'

# School discipline, juvenile arrests (ORI) and court case counts
school_discipline_data_file = data_folder + 'school-discipline/SchoolDiscipline_2007-17_Combined-CSV2.csv'
ori_data_file = data_folder + 'juvenile-arrests/ori_juvenile_arrest_2010-2016_CLEAN.csv'
case_count_data_file = data_folder + 'case-counts/combined.csv'
## End of input file locations

class DataFunctions:
    """Loaders that read the project's raw CSV files into pandas DataFrames.

    The methods keep no instance state; each one reads from the module-level
    path constants (``dci_data_folder``, ``ori_data_file``, ...) at call time.
    """

    #found:https://stackoverflow.com/questions/31893930/download-csv-from-an-ipython-notebook
    def create_download_link(self, df, title="Download CSV file", filename="data.csv"):
        """Return an IPython ``HTML`` link that downloads ``df`` as a CSV file.

        ``df.to_csv()`` output (index included) is base64-encoded and embedded
        in a ``data:text/csv`` URI, so no file is written to disk.

        Args:
            df: DataFrame to export.
            title: Text shown for the link.
            filename: Value of the anchor's ``download`` attribute.
        """
        payload = base64.b64encode(df.to_csv().encode()).decode()
        html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
        return HTML(html.format(payload=payload, title=title, filename=filename))
    #end found

    ##Get Data Functs
    def getOverview(self):
        """Concatenate every CSV in the DCI ``overview/`` folder into one frame.

        The text before the first ``-`` in each file name is stored (as a
        string) in a ``Year`` column. Files that cannot be read are reported
        on stdout and skipped.

        Raises:
            ValueError: from ``pd.concat`` when no file could be loaded.
        """
        current_path = dci_data_folder + 'overview/'
        # os.walk yields (dirpath, dirnames, filenames); only the top-level
        # file names are wanted. Sorted so the row order is reproducible.
        year_files = sorted(next(os.walk(current_path))[2])
        gather_overview_frames = []
        for f in year_files:
            file_path = os.path.join(current_path, f)
            year = f.split('-')[0]
            try:
                df = pd.read_csv(file_path, index_col=0)
            except Exception as e:  # best effort: report the file and carry on
                print('cant load file: ' + file_path)
                print('[' + str(e) + ']')
                print('*****************************************************************************************')
                continue
            # assign() returns a new frame; the result must be kept or the
            # Year column is silently lost.
            gather_overview_frames.append(df.assign(Year=year))
        return pd.concat(gather_overview_frames)

    def getDFS(self):
        """Load the three DFS tables.

        Returns:
            Tuple ``(afcars, county, plc)``. ``afcars`` and ``county`` are
            indexed by their first two CSV columns, ``plc`` by its first.
        """
        afcars = pd.read_csv(dfs_afcars_data_file, index_col=[0, 1])
        county = pd.read_csv(dfs_county_data_file, index_col=[0, 1])
        plc = pd.read_csv(dfs_plc_data_file, index_col=0)
        return afcars, county, plc

    def getSchool(self):
        """Load school discipline data indexed by county and school year.

        ``county`` is the part of ``DISTRICT_NAME`` before the first ``#``
        with surrounding whitespace removed. The result is indexed and sorted
        by ``(county, Beginning Year, End Year)``.
        """
        df = pd.read_csv(school_discipline_data_file)
        # Strip before building the index: relabelling levels afterwards
        # relied on MultiIndex.set_levels(inplace=True), which newer pandas
        # no longer accepts, and on positional alignment of the labels.
        df['county'] = df['DISTRICT_NAME'].str.split('#').str[0].str.strip()
        index_levels = ['county', 'Beginning Year', 'End Year']
        df.set_index(index_levels, inplace=True)
        df.sort_index(level=index_levels, ascending=True, inplace=True)
        return df

    #fix this to dynamically load files
    #replaced with ori data DO NOT USE!
    def getIndexCrimes(self):
        """Load the 2016, 2015 and 2014 index-crime CSVs stacked in that order.

        Superseded by :meth:`getORIData`. Each row gets a ``year`` column
        holding the year as a string.
        """
        frames = []
        for year in ('2016', '2015', '2014'):
            frame = pd.read_csv(dci_index_crimes + year + '-index-cp-after.csv')
            frame['year'] = year
            frames.append(frame)
        return pd.concat(frames)

    def getORIData(self):
        """Load juvenile arrest counts with a per-row ``total`` column.

        The index is built from CSV columns 1 and 0 (in that order) and
        sorted. ``total`` is the row-wise sum of the numeric columns.
        """
        df = pd.read_csv(ori_data_file, index_col=[1, 0])
        df.sort_index(level=[0, 1], ascending=True, inplace=True)
        # numeric_only keeps the sum working if the file has any text column
        df['total'] = df.sum(axis=1, numeric_only=True)
        return df

    def getPopulationData(self):
        """Load the cleaned census population table indexed by its first column."""
        return pd.read_csv(population_data_file, index_col=[0])

    #function taken from countysnapshot, needs work
    def getStateTotalArrests(self, pop, juvenile_arrests=None):
        """Build a per-year table of arrests and population.

        Args:
            pop: DataFrame whose columns are year labels as strings;
                ``pop[str(year)].sum()`` is used as that year's population.
            juvenile_arrests: DataFrame with a two-level index whose second
                level is the year, and a ``total`` column. When omitted it is
                loaded with :meth:`getORIData` (the method used to depend on a
                notebook global of this name).

        Returns:
            DataFrame indexed by integer ``year`` with ``arrests`` and
            ``population`` columns.
        """
        if juvenile_arrests is None:
            juvenile_arrests = self.getORIData()

        total_arrests = {}
        for (_, year), group in juvenile_arrests.groupby(level=[0, 1]):
            key = str(year)
            # NOTE(review): only the first row of each (level 0, year) group
            # is counted, as before -- confirm groups never hold several rows.
            total_arrests[key] = total_arrests.get(key, 0) + group['total'].iloc[0]

        state_totals = [
            {'year': int(key), 'arrests': arrests, 'population': pop[key].sum()}
            for key, arrests in total_arrests.items()
        ]
        # Explicit columns keep set_index working when there are no rows.
        df = pd.DataFrame(state_totals, columns=['year', 'arrests', 'population'])
        df.set_index(['year'], inplace=True)
        return df

    #DFS placement data
    def getJudicialPlacementData(self):
        """Load placement counts summed per index pair, with a ``total`` column.

        The index is built from CSV columns 0 and 2. Rows sharing an index
        pair are added together and ``total`` is the row-wise sum of the
        resulting columns.
        """
        df = pd.read_csv(placements_data_file, index_col=[0, 2])
        # groupby returns its keys sorted, so no separate sort is needed;
        # numeric_only drops any text column instead of concatenating it.
        df = df.groupby(level=[0, 1]).sum(numeric_only=True)
        df['total'] = df.sum(axis=1)
        return df

    def getCourtCaseNumbersData(self):
        """Load court case counts indexed and sorted by the first two CSV columns."""
        df = pd.read_csv(case_count_data_file, index_col=[0, 1])
        df.sort_index(level=[0, 1], ascending=True, inplace=True)
        return df

    def getDemographicData(self):
        """Load demographic population data indexed by the first three CSV columns.

        The index is sorted on all three levels in order, which is what
        label-range slicing with ``pd.IndexSlice`` requires.
        """
        df = pd.read_csv(demographic_data_file, index_col=[0, 1, 2])
        df.sort_index(level=[0, 1, 2], ascending=True, inplace=True)
        return df
    
# Module-level instance of the loaders for interactive use
dfu = DataFunctions()
#print(dfu.getPopulationData())

In [61]:
# Preview the demographic table, then show one county's rows for a year range.
dfunct = DataFunctions()
# Use the instance created in this cell so the cell does not depend on `dfu`
# having been defined by an earlier cell.
df = dfunct.getDemographicData()
idx = pd.IndexSlice
print(df.head(1))
#display(HTML(demographic_data.loc[idx[county,year_tup[0]:year_tup[1]]].to_html()))
county = 'Albany'
year_tup = (2012,2013)
# Select every third-level entry for `county` between the two years (label
# slices are inclusive); transposed so the selected rows become columns.
df.loc[idx[county,year_tup[0]:year_tup[1],:],:].transpose()

# overview = dfunct.getOverview()
# dfs = dfunct.getDFS()#afcars, county, plc
# school = dfunct.getSchool()
# #school['County'] = school['DISTRICT_NAME'].str.split('#').str[0]
# index_crimes = dfunct.getIndexCrimes()#DO NOT USE THIS! replaced by ori data
# juvenile_arrests = dfunct.getORIData()


                       White  Black  American Indian  Asian  Total
County Year Age Range                                             
Albany 1990 0            386      5                3     12    406




County,Albany,Albany,Albany,Albany,Albany,Albany,Albany,Albany,Albany,Albany,Albany,Albany,Albany,Albany,Albany,Albany,Albany,Albany,Albany,Albany,Albany
Year,2012,2012,2012,2012,2012,2012,2012,2012,2012,2012,...,2013,2013,2013,2013,2013,2013,2013,2013,2013,2013
Age Range,0,1,10,11,12,13,14,15,16,17,...,21 to 24,25 & over,3,4,5,6,7,8,9,Total
White,410,379,275,299,254,251,276,300,282,307,...,6760,18722,353,363,342,338,330,298,281,34861
Black,10,15,12,8,5,7,11,6,6,19,...,194,486,17,16,21,11,6,8,12,965
American Indian,7,2,7,4,6,3,2,4,4,4,...,110,233,4,8,9,7,5,6,7,473
Asian,25,15,11,11,5,6,11,14,10,7,...,219,763,5,11,9,13,8,7,7,1337
Total,452,411,305,322,270,267,300,324,302,337,...,7283,20204,379,398,381,369,349,319,307,37636


In [62]:
#set(school.index.levels[0])

In [63]:
#pd.core.strings.str_strip(school[0])
# school_indx = [x.strip() for x in school.index.get_level_values(0).unique()]
# school.index.set_levels(school_indx, level='county', inplace=True)
# school

In [64]:
# dfunct = DataFunctions()
# idx = pd.IndexSlice
# import numpy as np

# juvenile_arrests = dfunct.getORIData()
# pop = dfunct.getPopulationData()
# print(pop.head(3))

# print('*************')
# county = 'Albany'
# year_tup = (2014,2016)
# county_population = pop.loc[idx[county],:]
# print(juvenile_arrests.head(3))
# county_arrests = juvenile_arrests.loc[idx[county,year_tup[0]:year_tup[1]], :]
# print('*************')
# juvenile_arrests['total'] = juvenile_arrests.sum(axis=1)

# print('*************COUNTY ARRESTS')
# print(county_arrests)

# pops = []
# for row,new_df in juvenile_arrests.groupby(level=[0,1]):
#     #print(new_df)
#     print(type(row))
#     print(row[0])
#     #print(pop[row[0]][pop.get_loc(row[1])])
#     print(pop.loc[row[0],str(row[1])])
#     pops.append(pop.loc[row[0],str(row[1])])
# juvenile_arrests['population']=pops
# juvenile_arrests['rate'] = juvenile_arrests['total']/juvenile_arrests['population']
# print(juvenile_arrests.head(3))
