In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats
from scipy.stats import ttest_ind, chisquare, normaltest
import patsy

In [None]:
# Read the CrimesSmall CSV from the user's home data directory into a DataFrame.
crimes = pd.read_csv(filepath_or_buffer='~/data/CrimesSmall.csv')

In [None]:
# Index the rows by the 'ID' column (set_index moves it out of the columns),
# then discard the columns the analysis does not use so the frame stays small.
# NOTE: this cell is not re-runnable, because 'ID' is no longer a column afterwards.
crimes = (
    crimes
    .set_index('ID')
    .drop(columns=[
        'Beat', 'FBI Code', 'Updated On', 'Case Number', 'Block', 'IUCR',
        'Description', 'Ward', 'X Coordinate', 'Y Coordinate',
        'Latitude', 'Longitude',
    ])
)




In [None]:
# Distinct 'Primary Type' labels as a plain Python list, in order of first appearance.
crime_types = pd.unique(crimes['Primary Type']).tolist()

### Begin cleaning data

1. Remove NaN values
2. Standardize the crime type: lowercase it and merge the non-criminal variants into a single label
3. Standardize the location description

There are actually almost no NaN values to remove! Government data is reliable, I guess. The only NaN values are in the location fields, but we still know the district, so as long as we don't end up needing the location later on, this is fine.

In [None]:
# Gather every row that has a missing value in at least one column.
to_drop = crimes.loc[crimes.isna().any(axis='columns')]
# Bare reference; the trailing semicolon keeps the notebook from printing the frame.
to_drop;

In [None]:
def standardize_primary_type(string):
    """Normalize a raw 'Primary Type' label into a short lowercase category.

    Parameters
    ----------
    string : str or any
        The raw label. Non-string values (e.g. NaN or None for a missing
        entry) are returned unchanged instead of raising.

    Returns
    -------
    str or any
        'non-criminal' for any of the three non-criminal spellings,
        'other' for 'OTHER OFFENSE', 'sexual assault' for
        'CRIM SEXUAL ASSAULT', otherwise the label lowercased.
    """
    # Missing values have no .lower(); pass them through untouched, the same
    # way standardize_location treats non-string input.
    if not isinstance(string, str):
        return string
    # compile all non-criminal offenses into one label
    if string in ('NON-CRIMINAL (SUBJECT SPECIFIED)', 'NON - CRIMINAL', 'NON-CRIMINAL'):
        return 'non-criminal'
    if string == 'OTHER OFFENSE':
        return 'other'
    # rename "crim sexual assault" to just "sexual assault" to make it easier to read
    if string == 'CRIM SEXUAL ASSAULT':
        return 'sexual assault'
    # everything else is simply lowercased so later code never needs upper case
    return string.lower()
        

In [None]:
def standardize_location(string):
    """Normalize a raw location description.

    Parameters
    ----------
    string : str or any
        The raw description. Non-string values (e.g. NaN for a missing
        entry) are returned unchanged.

    Returns
    -------
    str or any
        'airport' when the lowercased text contains 'airport', otherwise
        the lowercased text; non-strings come back as-is.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which would otherwise skip cleaning.
    if not isinstance(string, str):
        return string
    lowered = string.lower()
    # Collapse every airport-related description into one category.
    return 'airport' if 'airport' in lowered else lowered

In [None]:
# Overwrite the raw crime labels with their cleaned form, and add a new
# 'location' column holding the cleaned location descriptions.
crimes['Primary Type'] = crimes['Primary Type'].map(standardize_primary_type)
crimes['location'] = crimes['Location Description'].map(standardize_location)

In [None]:
# Drop the raw location description (superseded by the cleaned 'location' column)
# and the community area. errors='ignore' makes the cell safe to re-run:
# columns that are already gone are skipped instead of raising KeyError.
crimes = crimes.drop(columns=['Location Description', 'Community Area'], errors='ignore')

In [None]:
# Relabel the 'Location' column as 'Coordinates'.
crimes = crimes.rename({'Location': 'Coordinates'}, axis='columns')

In [None]:
# Display the three most frequent crime categories together with their counts.
crimes['Primary Type'].value_counts().nlargest(n=3)

In [None]:
# `districts` is not defined in any earlier cell, so derive it from the data
# here; otherwise this cell raises NameError on a fresh kernel.
districts = crimes['District'].dropna().unique().tolist()

# Map each district value to the rows recorded in that district.
# NOTE: each entry is a snapshot taken now; columns added to or renamed in
# `crimes` by later cells will not appear in these cached frames.
district_cache = {
    district: crimes[crimes['District'] == district]
    for district in districts
}

In [None]:
# Shorten 'Primary Type' to 'Type' so the column has no space in its name.
crimes = crimes.rename({'Primary Type': 'Type'}, axis='columns')

In [None]:
def arrest_to_int(string):
    """Convert an arrest flag to an integer.

    Returns 1 when the value compares equal to True, 0 when it compares
    equal to False, and None for anything else (e.g. a missing value).
    """
    if string == True:
        return 1
    # Anything that is neither True-like nor False-like maps to None.
    return 0 if string == False else None
# Add a numeric version of the arrest flag, computed element by element.
crimes['ArrestInts'] = crimes['Arrest'].map(arrest_to_int)

In [None]:

# Count each crime type once and reuse the result for both axes.
type_counts = crimes.Type.value_counts()
plt.bar(type_counts.index, type_counts)
# Rotate the category labels so the long crime-type names do not overlap.
plt.xticks(rotation=90)
plt.xlabel('Crime type')
plt.ylabel('Number of reported crimes')
plt.title('Reported crimes by type');

In [None]:
# Show how many rows fall under each crime type.
crimes['Type'].value_counts()

In [None]:
# Boolean numpy mask flagging rows whose crime type is one of four selected categories.
ind = crimes.Type.isin(['battery', 'theft', 'narcotics', 'criminal damage']).values


In [None]:
# Store the district as text rather than a number.
# NOTE(review): presumably so the model formula treats it as a category — confirm.
crimes = crimes.astype({'District': str})

In [None]:
# Binomial GLM of the arrest indicator on crime type and district, fitted on
# every row. To fit only the rows selected by `ind`, pass
# data=crimes.loc[ind, :] instead.
binomial_model = smf.glm(
    formula='ArrestInts ~ Type + District',
    data=crimes,
    family=sm.families.Binomial(),
)
binomial_results = binomial_model.fit()
binomial_results.summary()