In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import seaborn as sns
import statsmodels.api as sm

from sklearn import linear_model, preprocessing
from sklearn.model_selection import cross_val_score

import IPython.display as dsp

In [3]:
#import data
crime = pd.read_excel('data/Table_8_Offenses_Known_to_Law_Enforcement_by_State_by_City_2013.xls')

#Set column names to correct column and drop
crime.columns = crime.iloc[2]
crime.drop(2, inplace = True)

#Drop rows not associated with a city, this catches all footnotes
crime.dropna(subset = ["City"], inplace = True)
lst = crime.columns.get_loc('Arson3')
crime = crime.iloc[:,:lst + 1]

#Combine Legacy and Revised Rape definitions.
crime['Rape\n(legacy\ndefinition)2'].replace(np.nan, 0, inplace = True)
crime['Rape\n(revised\ndefinition)1'].replace(np.nan, 0, inplace = True)
crime['Rape'] = crime['Rape\n(legacy\ndefinition)2'] + crime['Rape\n(revised\ndefinition)1']

#Drop legacy and revised columns
crime.drop(['Rape\n(legacy\ndefinition)2', 'Rape\n(revised\ndefinition)1'], axis = 1, inplace = True)

#Rename verbose columns
crime.columns = crime.columns.str.replace('\n','_')
crime.rename(columns = {'Murder and_nonnegligent_manslaughter':'Murder', 'Larceny-_theft' : 'Larceny_theft',
                       'Arson3': 'Arson'}, inplace = True)

#Reset index
crime.index = range(len(crime))

#Check final results
print(crime.columns)
crime.head(5)

Index(['State', 'City', 'Population', 'Violent_crime', 'Murder', 'Robbery',
       'Aggravated_assault', 'Property_crime', 'Burglary', 'Larceny_theft',
       'Motor_vehicle_theft', 'Arson', 'Rape'],
      dtype='object', name=2)


2,State,City,Population,Violent_crime,Murder,Robbery,Aggravated_assault,Property_crime,Burglary,Larceny_theft,Motor_vehicle_theft,Arson,Rape
0,ALABAMA,Abbeville,2645,11,1,2,7,63,21,39,3,,1
1,,Adamsville,4481,19,1,7,11,321,58,252,11,,0
2,,Addison,744,1,0,0,0,25,6,17,2,,1
3,,Alabaster,31170,44,0,11,31,640,70,544,26,,2
4,,Alexander City,14692,119,2,12,89,661,121,510,30,,16


In [4]:
#Fill in all missing values of state

#Create a pandas series to make things easier
states = crime.State
states = states.fillna('null') #Because for some reason, all NaNs are floats

#Sanity check
#print(states[0], states[1])
#print(type(states[1]))

#Loop through every entry
i = 0

for state in states:
    #print('Testing ', state)
    
    #Condition if not null, set name
    if state != 'null':
        #print('State name is now {}'.format(states[i]))
        #print(i)
        state_name = states[i]
    
    #If null, replace with last state name.
    else:
        #print('{} replaced with {}'.format(states[i], states[i - 1]))
        states[i] = states[i - 1]
    
    #advance index
    i += 1


#Sanity check 2
#print(states[0], states[1])
#print(states.unique())

crime['State'] = states

crime.head()

2,State,City,Population,Violent_crime,Murder,Robbery,Aggravated_assault,Property_crime,Burglary,Larceny_theft,Motor_vehicle_theft,Arson,Rape
0,ALABAMA,Abbeville,2645,11,1,2,7,63,21,39,3,,1
1,ALABAMA,Adamsville,4481,19,1,7,11,321,58,252,11,,0
2,ALABAMA,Addison,744,1,0,0,0,25,6,17,2,,1
3,ALABAMA,Alabaster,31170,44,0,11,31,640,70,544,26,,2
4,ALABAMA,Alexander City,14692,119,2,12,89,661,121,510,30,,16


In [5]:
#Replace all remaining nan with 0
crime = crime.fillna(0)
crime.head()

2,State,City,Population,Violent_crime,Murder,Robbery,Aggravated_assault,Property_crime,Burglary,Larceny_theft,Motor_vehicle_theft,Arson,Rape
0,ALABAMA,Abbeville,2645,11,1,2,7,63,21,39,3,0,1
1,ALABAMA,Adamsville,4481,19,1,7,11,321,58,252,11,0,0
2,ALABAMA,Addison,744,1,0,0,0,25,6,17,2,0,1
3,ALABAMA,Alabaster,31170,44,0,11,31,640,70,544,26,0,2
4,ALABAMA,Alexander City,14692,119,2,12,89,661,121,510,30,0,16


In [6]:
#Select features
X = crime[crime.columns[3:]]
X['intercept'] = 1 #statsmodel requires a constant intercept of 1
print(X.columns)

#Binary outcome, is the population greater than 10000
Y = np.where(crime.Population >= 10000, 1, 0)
print(Y[:10])

Index(['Violent_crime', 'Murder', 'Robbery', 'Aggravated_assault',
       'Property_crime', 'Burglary', 'Larceny_theft', 'Motor_vehicle_theft',
       'Arson', 'Rape', 'intercept'],
      dtype='object', name=2)
[0 0 0 1 1 0 0 1 0 0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
logit = sm.Logit(Y, X)
results = logit.fit()

print(results.summary())

Optimization terminated successfully.
         Current function value: 0.284978
         Iterations 17
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 9292
Model:                          Logit   Df Residuals:                     9281
Method:                           MLE   Df Model:                           10
Date:                Fri, 05 Apr 2019   Pseudo R-squ.:                  0.5637
Time:                        16:47:52   Log-Likelihood:                -2648.0
converged:                       True   LL-Null:                       -6069.8
                                        LLR p-value:                     0.000
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Violent_crime           0.0415      0.112      0.371      0.711      -0.178       0.261
M

In [13]:
#Predict over the entire dataset w/o cross-validation
prediction = np.where(results.predict(X) < 0.5, 0, 1)
pred_table = pd.crosstab(Y, prediction)
print('Confusion Matrix\n')
print(pred_table)

print('\nPercent accuracy:\n')
print((pred_table.iloc[0,0] + pred_table.iloc[1,1]) / 
      (pred_table.sum().sum()))

Confusion Matrix

col_0     0     1
row_0            
0      5633   317
1       793  2549

Percent accuracy:

0.8805424020662936


__Model Analysis__

Statsmodel's logistic regression classifier does not do too bad a job. All of the coefficients are quite small, so it remains to be seen if LASSO and Ridge regression will really have that big of an impact.

In [21]:
#Create Ridge and LASSO regression
ridge = linear_model.LogisticRegression('l1')
lasso = linear_model.LogisticRegression('l2')

ridge.fit(X, Y)
lasso.fit(X, Y)

ridge_pred = np.where(ridge.predict(X) < 0.5, 0, 1)
lasso_pred = np.where(lasso.predict(X) < 0.5, 0, 1)

#RidgeStats
ridge_pred_table = pd.crosstab(Y, ridge_pred)

print('Ridge Prediction Table:\n', ridge_pred_table)
print((ridge_pred_table.iloc[0,0] + ridge_pred_table.iloc[1,1]) / (ridge_pred_table.sum().sum()))

#LASSOStats
lasso_pred_table = pd.crosstab(Y, lasso_pred)

print('\n\nLASSO Prediction Table:\n', lasso_pred_table)
print((lasso_pred_table.iloc[0,0] + lasso_pred_table.iloc[1,1]) / (lasso_pred_table.sum().sum()))



Ridge Prediction Table:
 col_0     0     1
row_0            
0      5837   113
1      1312  2030
0.8466422729229445


LASSO Prediction Table:
 col_0     0     1
row_0            
0      5636   314
1       794  2548
0.8807576409814895




__Analysis__

The LASSO classifier seems to be more accurate than the ridge classifier.
