In [1]:


# functions used throughout analysis

from __future__ import division
from IPython.core.display import HTML
import numpy as np
import pandas as pd
from collections import Counter
from scipy.stats import mode

# Set pandas' maximum displayed column width option to 1.
# NOTE(review): a width of 1 looks unusually small — confirm this is the intended
# value for the pandas version in use.
pd.set_option('display.max_colwidth', 1)



def loadFiles(src_dir, fnName):
    """Read a comma-delimited file from a directory into a DataFrame.

    Parameters
    ----------
    src_dir : str
        Directory containing the file; a trailing path separator is optional.
    fnName : str
        File name inside ``src_dir``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    # Local import so this block does not depend on the notebook's import cell.
    import os

    # os.path.join inserts the separator only when src_dir does not already end
    # with one, so both '/data' and '/data/' resolve to the same file.
    fn = os.path.join(src_dir, fnName)
    return pd.read_csv(fn, delimiter=',')

def getDfColumns(df):
    """Return the column labels of ``df`` as a plain Python list."""
    labels = df.columns.values
    return [label for label in labels]

def columnString(dfList):
    """Join the strings in ``dfList`` with commas (no surrounding spaces)."""
    separator = ','
    joined = separator.join(dfList)
    return joined

def findSingleCells (df):
    """Summarise how many cells of a count table hold exactly one observation.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of cell counts, e.g. the result of ``pd.crosstab``.

    Returns
    -------
    tuple
        ``(single, kplus, totalcells, percentsingle, minCell, maxCell,
        meanCell, medianCell)`` where

        * ``single`` is the number of cells equal to 1,
        * ``kplus`` is ``totalcells - single``, i.e. every cell that is not
          exactly 1 (cells equal to 0 are included),
        * ``totalcells`` is the number of cells in the table,
        * ``percentsingle`` is ``single`` as a percentage of ``totalcells``,
        * the remaining four values are the min, max, mean and median of all
          cells.

        For a table with no cells the counts are 0, the percentage is 0.0 and
        the four statistics are NaN.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available in
    # both old and current pandas releases.
    crossMatrix = df.values
    totalcells = int(crossMatrix.size)
    single = int((crossMatrix == 1).sum())
    kplus = totalcells - single

    if totalcells == 0:
        # Nothing to divide by and numpy's reductions reject empty input.
        nan = float('nan')
        return single, kplus, totalcells, 0.0, nan, nan, nan, nan

    # float() keeps this a true division even without the __future__ import.
    percentsingle = (float(single) / totalcells) * 100
    minCell = np.amin(crossMatrix)
    maxCell = np.amax(crossMatrix)
    meanCell = np.mean(crossMatrix)
    medianCell = np.median(crossMatrix)
    return single, kplus, totalcells, percentsingle, minCell, maxCell, meanCell, medianCell

def getCrossTab(grpField, fieldList, margin=False):
    """Cross-tabulate ``grpField`` (rows) against ``fieldList`` (columns).

    ``margin`` is forwarded to ``pd.crosstab`` as its ``margins`` argument.
    """
    return pd.crosstab(index=grpField, columns=fieldList, margins=margin)

def getSingleCellInfo( datasetName, crossTab, grpFieldList,  fieldListString):
    """Build an HTML summary of the single-count cells in a crosstab.

    Parameters
    ----------
    datasetName : str
        Dataset label shown in the heading and in each summary line.
    crossTab : pandas.DataFrame
        Table of counts passed to ``findSingleCells``.
    grpFieldList : str
        Label of the row (group-by) field; used only in the report text.
    fieldListString : str
        Label of the column fields; used only in the report text.

    Returns
    -------
    IPython HTML display object
        Heading, total / single / percentage lines, and the min, max, mean and
        median cell values.
    """
    # findSingleCells also returns the count of non-single cells, which this
    # report does not show.
    single, _kplus, totalcells, percentsingle, minCell, maxCell, meanCell, medianCell = findSingleCells(crossTab)

    # Shared "<dataset>- <fields> By <group>: " tail of the three summary lines.
    scope = datasetName + "- " + fieldListString + " By " + grpFieldList + ": "

    # <br/> replaces the original "</br>" end tags (br is a void element and has
    # no end tag), and the opening <div> is now closed.
    parts = [
        "<div> *********************** <br/>",
        "<h3>" + datasetName + ":<br/>" + grpFieldList + " By " + fieldListString + "</h3>",
        "<p><b> Total number of cells in " + scope + str(totalcells) + "<br/>",
        "Number of Single Cells in " + scope + str(single) + "<br/>",
        "Percentage of Single cells in " + scope + str(percentsingle) + "%" + "</b><br/></p>",
        "<br/> Min cell value in crossTabs: " + str(minCell) + "<br/>",
        "Max cell value in crossTabs: " + str(maxCell) + "<br/>",
        "Mean cell value in crossTabs: " + str(meanCell) + "<br/>",
        "Median cell value in crossTabs: " + str(medianCell) + "<br/>",
        "</div>",
    ]
    return HTML("".join(parts))
    
    
def makeOutput(datasetName, src_dir, fnName):
    """Load a dataset and build an HTML banner listing its columns.

    Parameters
    ----------
    datasetName : str
        Label shown after "Dataset: " in the banner.
    src_dir : str
        Directory passed to ``loadFiles``.
    fnName : str
        File name passed to ``loadFiles``.

    Returns
    -------
    tuple
        ``(df, titleStuff)``: the loaded DataFrame and an IPython HTML object
        showing the dataset name and its comma-separated column names.
    """
    df = loadFiles(src_dir, fnName)
    df_column_names = getDfColumns(df)
    # The original emitted "Columns: :" (doubled colon) and "</br>" end tags;
    # br is a void element, so it is written as <br/> here.
    titleStuff = (
        "<H1>**************************************<br/>"
        + "Dataset: " + datasetName + "<br/>"
        + "<p> Columns:</p>"
        + "<p>" + ", ".join(df_column_names) + "</p></H1>"
    )
    return df, HTML(titleStuff)

def makeCrossTabInfo( grpField, grpFieldList, fieldList, fieldListString,  datasetName):
    """Build a crosstab and its single-cell HTML report in one call.

    ``grpField`` and ``fieldList`` are the data passed to ``getCrossTab``;
    ``grpFieldList``, ``fieldListString`` and ``datasetName`` are the text
    labels passed to ``getSingleCellInfo``.

    Returns ``(crosstab, report)``.
    """
    table = getCrossTab(grpField, fieldList)
    report = getSingleCellInfo(datasetName, table, grpFieldList, fieldListString)
    return table, report



In [2]:

# Dataset: Juvenile Hall admissions. Load the CSV and build the HTML title banner.
datasetName = "Juvenile Probation Juvenile Hall Admissions"
# NOTE(review): absolute path on the authoring machine; adjust to run elsewhere.
src_dir = '/home/ubuntu/workspace/source_data/'
fnName = 'Juvenile_Probation_Juvenile_Hall_Admissions.csv'
        
# Label of the row (group-by) field; later cells pass it into the report text.
grpFieldList = 'Detention Year'
df, titleStuff = makeOutput(datasetName, src_dir , fnName )
# Bare last expression so the notebook renders the banner.
titleStuff

In [3]:
# Cross-tabulate Detention Year (rows) against Ethnicity x Gender (columns).
# Relies on df, grpFieldList and datasetName assigned in the dataset-loading cell.
grpField = [ df['Detention Year']]
# Text label for the column fields; only used in the report wording.
fieldListString = 'Ethnicity,Gender'
fieldList = [ df['Ethnicity'],df['Gender']]
crossBy, single_cell_info = makeCrossTabInfo( grpField, grpFieldList, fieldList, fieldListString,  datasetName)
# Render the single-cell summary.
single_cell_info

In [4]:
# Display the crosstab produced by the most recent makeCrossTabInfo call.
crossBy

Ethnicity,african american,african american,asian,asian,hispanic,hispanic,other,other,white,white
Gender,female,male,female,male,female,male,female,male,female,male
Detention Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2011,144,502,15,98,52,209,1,8,13,31
2012,122,389,17,64,32,186,0,17,7,36
2013,108,351,18,77,40,147,5,6,10,37
2014,102,317,5,59,32,154,0,4,6,19
2015,94,319,7,51,39,126,0,8,11,28
2016,54,184,5,19,20,49,0,8,5,6


In [5]:
# Cross-tabulate Detention Year (rows) against Ethnicity x Gender x Age At Detention
# (columns). Reuses df, grpFieldList and datasetName from the dataset-loading cell.
grpField = [ df['Detention Year']]
# Text label for the column fields; only used in the report wording.
fieldListString = 'Ethnicity,Gender,Age At Detention'
fieldList = [ df['Ethnicity'],df['Gender'],df['Age At Detention']]
crossBy, single_cell_info = makeCrossTabInfo( grpField, grpFieldList, fieldList, fieldListString,  datasetName)
# Render the single-cell summary.
single_cell_info

In [6]:
# Display the crosstab produced by the most recent makeCrossTabInfo call.
crossBy

Ethnicity,african american,african american,african american,african american,african american,african american,asian,asian,asian,asian,...,hispanic,hispanic,other,other,other,white,white,white,white,white
Gender,female,female,female,male,male,male,female,female,female,male,...,male,male,female,male,male,female,female,female,male,male
Age At Detention,13-17,18+,<=12,13-17,18+,<=12,13-17,18+,<=12,13-17,...,18+,<=12,13-17,13-17,18+,13-17,18+,<=12,13-17,18+
Detention Year,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2011,134,7,3,479,14,9,14,0,1,93,...,26,3,1,8,0,12,1,0,30,1
2012,107,10,5,356,22,11,17,0,0,61,...,29,2,0,16,1,6,0,1,31,5
2013,95,11,2,303,23,25,18,0,0,68,...,12,1,5,6,0,9,1,0,37,0
2014,88,12,2,283,21,13,4,1,0,56,...,24,0,0,4,0,6,0,0,16,3
2015,86,8,0,277,28,14,6,1,0,48,...,19,2,0,6,2,10,1,0,27,1
2016,53,0,1,162,17,5,5,0,0,19,...,11,1,0,5,3,4,1,0,5,1


In [7]:

# Dataset: Juvenile Hall daily population. Load the CSV and build the HTML title
# banner. This rebinds df, datasetName and grpFieldList for the cells that follow.
datasetName = "Juvenile_Probation_Juvenile_Hall_Daily_Population"
# NOTE(review): absolute path on the authoring machine; adjust to run elsewhere.
src_dir = '/home/ubuntu/workspace/source_data/'
fnName = 'Juvenile_Probation_Juvenile_Hall_Daily_Population.csv'
        
# Label of the row (group-by) field; later cells pass it into the report text.
grpFieldList = 'Ethnicity'
df, titleStuff = makeOutput(datasetName, src_dir , fnName )
# Bare last expression so the notebook renders the banner.
titleStuff

In [8]:
# Cross-tabulate Ethnicity (rows) against Detention Year x Release Year (columns).
# Relies on df, grpFieldList and datasetName assigned in the dataset-loading cell.
grpField = [ df['Ethnicity']]
# Text label for the column fields; only used in the report wording.
fieldListString = 'Detention Year, Release Year'
fieldList = [ df['Detention Year'],df['Release Year']]
crossBy, single_cell_info = makeCrossTabInfo( grpField, grpFieldList, fieldList, fieldListString,  datasetName)
# Render the single-cell summary.
single_cell_info

In [9]:
# Display the crosstab produced by the most recent makeCrossTabInfo call.
crossBy

Detention Year,2009,2010,2011,2011,2011,2012,2012,2012,2013,2013,2014,2014,2015,2015,2016
Release Year,2011,2011,2011,2012,2013,2012,2013,2014,2013,2014,2014,2015,2015,2016,2016
Ethnicity,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
african american,0,42,598,47,1,472,38,1,431,28,381,38,392,21,238
asian,0,1,58,2,0,27,2,0,46,2,29,2,22,2,2
hispanic,1,31,245,14,2,199,18,1,181,6,171,15,151,14,69
other,0,6,57,5,0,59,10,0,53,5,36,1,37,5,30
white,0,2,40,4,0,40,3,0,42,5,24,1,36,3,11


In [10]:

# Dataset: Juvenile Hall releases. Load the CSV and build the HTML title banner.
# This rebinds df, datasetName and grpFieldList for the cells that follow.
datasetName = "Juvenile_Probation_Juvenile_Hall_Releases"
# NOTE(review): absolute path on the authoring machine; adjust to run elsewhere.
src_dir = '/home/ubuntu/workspace/source_data/'
fnName = 'Juvenile_Probation_Juvenile_Hall_Releases.csv'
        
# Label of the row (group-by) field; later cells pass it into the report text.
grpFieldList = 'Detention Year'
df, titleStuff = makeOutput(datasetName, src_dir , fnName )
# Bare last expression so the notebook renders the banner.
titleStuff

In [11]:
# Cross-tabulate Detention Year (rows) against Age At Release x Length of Stay In
# Juvenile Hall Custody (columns). Relies on df, grpFieldList and datasetName
# assigned in the dataset-loading cell.
grpField = [ df['Detention Year']]
# Text label for the column fields; only used in the report wording.
fieldListString = 'Age At Release, Length of Stay In Juvenile Hall Custody'
fieldList = [ df['Age At Release'],df['Length of Stay In Juvenile Hall Custody']]
crossBy, single_cell_info = makeCrossTabInfo( grpField, grpFieldList, fieldList, fieldListString,  datasetName)
# Render the single-cell summary.
single_cell_info

In [12]:
# Display the crosstab produced by the most recent makeCrossTabInfo call.
crossBy

Age At Release,13-17,13-17,13-17,13-17,13-17,13-17,13-17,13-17,13-17,13-17,...,<=12,<=12,<=12,<=12,<=12,<=12,<=12,<=12,<=12,<=12
Length of Stay In Juvenile Hall Custody,1,2,3,4,5,6,7,8,9,10,...,27,28,29,31,40,41,47,52,56,71
Detention Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2010,0,0,0,0,0,0,1,0,1,3,...,0,0,0,0,0,0,0,0,0,0
2011,31,55,89,35,43,37,25,45,39,22,...,0,0,0,0,0,0,0,0,1,0
2012,22,28,65,30,30,37,31,29,31,15,...,0,1,0,1,0,0,0,0,0,0
2013,24,43,52,38,35,24,18,14,19,18,...,1,0,1,0,0,0,1,1,0,1
2014,14,37,52,25,23,24,10,13,8,6,...,0,0,0,0,1,0,0,1,0,0
2015,21,31,65,32,27,14,16,20,19,14,...,0,0,0,0,0,1,0,0,0,0
2016,4,24,49,10,19,12,6,2,7,10,...,0,0,0,0,0,0,0,0,0,0


In [13]:

# Dataset: Log Cabin Ranch admissions. Load the CSV and build the HTML title banner.
# This rebinds df, datasetName and grpFieldList for the cells that follow.
datasetName = "Juvenile_Probation_Log_Cabin_Ranch_Admissions"
# NOTE(review): absolute path on the authoring machine; adjust to run elsewhere.
src_dir = '/home/ubuntu/workspace/source_data/'
fnName = 'Juvenile_Probation_Log_Cabin_Ranch_Admissions.csv'
        
# Label of the row (group-by) field; later cells pass it into the report text.
grpFieldList = 'Log Cabin Ranch Entry Year'
df, titleStuff = makeOutput(datasetName, src_dir , fnName )
# Bare last expression so the notebook renders the banner.
titleStuff

In [14]:
# Cross-tabulate Log Cabin Ranch Entry Year (rows) against Age at Log Cabin Entry
# Date x Ethnicity (columns). Relies on df, grpFieldList and datasetName assigned
# in the dataset-loading cell.
grpField = [ df['Log Cabin Ranch Entry Year']]
# Text label for the column fields; only used in the report wording.
fieldListString = 'Age at Log Cabin Entry Date, Ethnicity'
fieldList = [ df['Age at Log Cabin Entry Date'],df['Ethnicity']]
crossBy, single_cell_info = makeCrossTabInfo( grpField, grpFieldList, fieldList, fieldListString,  datasetName)
# Render the single-cell summary.
single_cell_info

In [15]:
# Display the crosstab produced by the most recent makeCrossTabInfo call.
crossBy

Age at Log Cabin Entry Date,13-17,13-17,13-17,13-17,13-17,18+,18+,18+,18+
Ethnicity,african american,asian,hispanic,other,white,african american,asian,hispanic,other
Log Cabin Ranch Entry Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
2011,17,7,3,0,1,1,0,0,0
2012,24,3,2,0,1,1,0,1,0
2013,11,3,6,0,0,0,1,0,1
2014,7,2,3,0,0,2,0,1,0
2015,8,2,5,0,0,4,3,4,0
2016,2,2,0,1,0,1,0,0,0


In [16]:

# Dataset: Log Cabin Ranch releases. Load the CSV and build the HTML title banner.
# This rebinds df, datasetName and grpFieldList for the cells that follow.
datasetName = "Juvenile_Probation_Log_Cabin_Ranch_Releases"
# NOTE(review): absolute path on the authoring machine; adjust to run elsewhere.
src_dir = '/home/ubuntu/workspace/source_data/'
fnName = 'Juvenile_Probation_Log_Cabin_Ranch_Releases.csv'
        
# Label of the row (group-by) field; later cells pass it into the report text.
grpFieldList = 'Length of Stay In Log Cabin Ranch Placement'
df, titleStuff = makeOutput(datasetName, src_dir , fnName )
# Bare last expression so the notebook renders the banner.
titleStuff

In [17]:
# Cross-tabulate Length of Stay In Log Cabin Ranch Placement (rows) against Log
# Cabin Ranch Entry Year x Age of Youth when released from LCR (columns). Relies on
# df, grpFieldList and datasetName assigned in the dataset-loading cell.
grpField = [ df['Length of Stay In Log Cabin Ranch Placement']]
# Text label for the column fields; only used in the report wording.
fieldListString = 'Log Cabin Ranch Entry Year, Age of Youth when released from LCR'
fieldList = [ df['Log Cabin Ranch Entry Year'],df['Age of Youth when released from LCR']]
crossBy, single_cell_info = makeCrossTabInfo( grpField, grpFieldList, fieldList, fieldListString,  datasetName)
# Render the single-cell summary.
single_cell_info

In [18]:
# Display the crosstab produced by the most recent makeCrossTabInfo call.
crossBy

Log Cabin Ranch Entry Year,2009,2010,2010,2011,2011,2012,2012,2013,2013,2014,2014,2015,2015,2016
Age of Youth when released from LCR,18+,13-17,18+,13-17,18+,13-17,18+,13-17,18+,13-17,18+,13-17,18+,13-17
Length of Stay In Log Cabin Ranch Placement,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
10,0,0,0,0,0,0,0,1,0,0,0,0,0,0
15,0,0,0,0,0,0,0,0,0,0,0,0,1,0
46,0,0,0,0,0,0,0,0,0,0,0,0,1,0
49,0,0,1,0,0,0,0,0,0,0,0,0,0,0
57,0,0,0,0,0,1,0,0,0,0,0,0,0,0
58,0,0,0,0,0,0,0,0,0,0,0,0,0,1
64,0,0,0,1,0,0,0,0,0,0,0,0,0,0
86,0,0,0,0,0,0,0,0,0,0,0,1,0,0
92,0,0,0,1,0,0,0,0,0,0,0,0,0,0
94,0,0,0,0,0,1,0,0,0,0,0,0,0,0
