# Predicting Crime in US Communites
## Team member: Bibata Rabba Idi, Fatima Javid, JianHui (Jake) Li 

In [45]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [2]:
# Load data
df = pd.read_csv('crimedata.csv')
df.head()

Unnamed: 0,communityName,state,countyCode,communityCode,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,...,burglaries,burglPerPop,larcenies,larcPerPop,autoTheft,autoTheftPerPop,arsons,arsonsPerPop,ViolentCrimesPerPop,nonViolPerPop
0,BerkeleyHeightstownship,NJ,39.0,5320.0,11980,3.1,1.37,91.78,6.5,1.88,...,14.0,114.85,138.0,1132.08,16.0,131.26,2.0,16.41,41.02,1394.59
1,Marpletownship,PA,45.0,47616.0,23123,2.82,0.8,95.57,3.44,0.85,...,57.0,242.37,376.0,1598.78,26.0,110.55,1.0,4.25,127.56,1955.95
2,Tigardcity,OR,,,29344,2.43,0.74,94.33,3.43,2.35,...,274.0,758.14,1797.0,4972.19,136.0,376.3,22.0,60.87,218.59,6167.51
3,Gloversvillecity,NY,35.0,29443.0,16656,2.4,1.7,97.35,0.5,0.7,...,225.0,1301.78,716.0,4142.56,47.0,271.93,,,306.64,
4,Bemidjicity,MN,7.0,5068.0,11245,2.76,0.53,89.16,1.17,0.52,...,91.0,728.93,1060.0,8490.87,91.0,728.93,5.0,40.05,,9988.79


In [3]:
# Shape of dataframe
df.shape

(2215, 146)

In [4]:
# Number of nulls from each column
df.isnull().sum()

communityName             0
state                     0
countyCode             1221
communityCode          1224
population                0
                       ... 
autoTheftPerPop           3
arsons                   91
arsonsPerPop             91
ViolentCrimesPerPop     221
nonViolPerPop            97
Length: 146, dtype: int64

In [5]:
# Total number of nulls
df.isnull().sum().sum()

44592

In [6]:
# Make a list of columns that contain nulls
cols_with_nulls = df.columns[df.isna().any()].tolist()
print(cols_with_nulls)

['countyCode', 'communityCode', 'OtherPerCap', 'LemasSwornFT', 'LemasSwFTPerPop', 'LemasSwFTFieldOps', 'LemasSwFTFieldPerPop', 'LemasTotalReq', 'LemasTotReqPerPop', 'PolicReqPerOffic', 'PolicPerPop', 'RacialMatchCommPol', 'PctPolicWhite', 'PctPolicBlack', 'PctPolicHisp', 'PctPolicAsian', 'PctPolicMinor', 'OfficAssgnDrugUnits', 'NumKindsDrugsSeiz', 'PolicAveOTWorked', 'PolicCars', 'PolicOperBudg', 'LemasPctPolicOnPatr', 'LemasGangUnitDeploy', 'PolicBudgPerPop', 'rapes', 'rapesPerPop', 'robberies', 'robbbPerPop', 'assaults', 'assaultPerPop', 'burglaries', 'burglPerPop', 'larcenies', 'larcPerPop', 'autoTheft', 'autoTheftPerPop', 'arsons', 'arsonsPerPop', 'ViolentCrimesPerPop', 'nonViolPerPop']


In [7]:
# Show number of nulls for each column that contain nulls
df[cols_with_nulls].isnull().sum()

countyCode              1221
communityCode           1224
OtherPerCap                1
LemasSwornFT            1872
LemasSwFTPerPop         1872
LemasSwFTFieldOps       1872
LemasSwFTFieldPerPop    1872
LemasTotalReq           1872
LemasTotReqPerPop       1872
PolicReqPerOffic        1872
PolicPerPop             1872
RacialMatchCommPol      1872
PctPolicWhite           1872
PctPolicBlack           1872
PctPolicHisp            1872
PctPolicAsian           1872
PctPolicMinor           1872
OfficAssgnDrugUnits     1872
NumKindsDrugsSeiz       1872
PolicAveOTWorked        1872
PolicCars               1872
PolicOperBudg           1872
LemasPctPolicOnPatr     1872
LemasGangUnitDeploy     1872
PolicBudgPerPop         1872
rapes                    208
rapesPerPop              208
robberies                  1
robbbPerPop                1
assaults                  13
assaultPerPop             13
burglaries                 3
burglPerPop                3
larcenies                  3
larcPerPop    

In [8]:
# Drop columns that are mostly nulls
del cols_with_nulls[25:]
del cols_with_nulls[2]

print("Columns before dropping:", df.shape[1])
df.drop(cols_with_nulls, axis=1, inplace=True)
print("Columns after dropping:", df.shape[1])

Columns before dropping: 146
Columns after dropping: 122


In [9]:
# Make a list of columns that contain nulls again to make sure
cols_with_nulls = df.columns[df.isna().any()].tolist()
print(cols_with_nulls)

df[cols_with_nulls].isnull().sum()

['OtherPerCap', 'rapes', 'rapesPerPop', 'robberies', 'robbbPerPop', 'assaults', 'assaultPerPop', 'burglaries', 'burglPerPop', 'larcenies', 'larcPerPop', 'autoTheft', 'autoTheftPerPop', 'arsons', 'arsonsPerPop', 'ViolentCrimesPerPop', 'nonViolPerPop']


OtherPerCap              1
rapes                  208
rapesPerPop            208
robberies                1
robbbPerPop              1
assaults                13
assaultPerPop           13
burglaries               3
burglPerPop              3
larcenies                3
larcPerPop               3
autoTheft                3
autoTheftPerPop          3
arsons                  91
arsonsPerPop            91
ViolentCrimesPerPop    221
nonViolPerPop           97
dtype: int64

In [10]:
df.head(10)

Unnamed: 0,communityName,state,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,...,burglaries,burglPerPop,larcenies,larcPerPop,autoTheft,autoTheftPerPop,arsons,arsonsPerPop,ViolentCrimesPerPop,nonViolPerPop
0,BerkeleyHeightstownship,NJ,11980,3.1,1.37,91.78,6.5,1.88,12.47,21.44,...,14.0,114.85,138.0,1132.08,16.0,131.26,2.0,16.41,41.02,1394.59
1,Marpletownship,PA,23123,2.82,0.8,95.57,3.44,0.85,11.01,21.3,...,57.0,242.37,376.0,1598.78,26.0,110.55,1.0,4.25,127.56,1955.95
2,Tigardcity,OR,29344,2.43,0.74,94.33,3.43,2.35,11.36,25.88,...,274.0,758.14,1797.0,4972.19,136.0,376.3,22.0,60.87,218.59,6167.51
3,Gloversvillecity,NY,16656,2.4,1.7,97.35,0.5,0.7,12.55,25.2,...,225.0,1301.78,716.0,4142.56,47.0,271.93,,,306.64,
4,Bemidjicity,MN,11245,2.76,0.53,89.16,1.17,0.52,24.46,40.53,...,91.0,728.93,1060.0,8490.87,91.0,728.93,5.0,40.05,,9988.79
5,Springfieldcity,MO,140494,2.45,2.51,95.65,0.9,0.95,18.09,32.89,...,2094.0,1386.46,7690.0,5091.64,454.0,300.6,134.0,88.72,442.95,6867.42
6,Norwoodtown,MA,28700,2.6,1.6,96.57,1.47,1.1,11.17,27.41,...,110.0,372.09,288.0,974.19,144.0,487.1,17.0,57.5,226.63,1890.88
7,Andersoncity,IN,59459,2.45,14.2,84.87,0.4,0.63,15.31,27.93,...,608.0,997.6,2250.0,3691.79,125.0,205.1,9.0,14.77,439.73,4909.26
8,Fargocity,ND,74111,2.46,0.35,97.11,1.25,0.73,16.64,35.16,...,425.0,532.66,3149.0,3946.71,206.0,258.18,8.0,10.03,115.31,4747.58
9,Wacocity,TX,103590,2.62,23.14,67.6,0.92,16.35,19.88,34.55,...,2397.0,2221.81,6121.0,5673.63,1070.0,991.8,18.0,16.68,1544.24,8903.93


In [11]:
# Fill in NaN with -999
df[cols_with_nulls] = df[cols_with_nulls].fillna(-999)

In [12]:
df[cols_with_nulls].isnull().sum()

OtherPerCap            0
rapes                  0
rapesPerPop            0
robberies              0
robbbPerPop            0
assaults               0
assaultPerPop          0
burglaries             0
burglPerPop            0
larcenies              0
larcPerPop             0
autoTheft              0
autoTheftPerPop        0
arsons                 0
arsonsPerPop           0
ViolentCrimesPerPop    0
nonViolPerPop          0
dtype: int64

In [13]:
# Check for duplicates
df.duplicated().sum()

0

In [14]:
print(df.columns.tolist())

['communityName', 'state', 'population', 'householdsize', 'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp', 'agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', 'numbUrban', 'pctUrban', 'medIncome', 'pctWWage', 'pctWFarmSelf', 'pctWInvInc', 'pctWSocSec', 'pctWPubAsst', 'pctWRetire', 'medFamInc', 'perCapInc', 'whitePerCap', 'blackPerCap', 'indianPerCap', 'AsianPerCap', 'OtherPerCap', 'HispPerCap', 'NumUnderPov', 'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore', 'PctUnemployed', 'PctEmploy', 'PctEmplManu', 'PctEmplProfServ', 'PctOccupManu', 'PctOccupMgmtProf', 'MalePctDivorce', 'MalePctNevMarr', 'FemalePctDiv', 'TotalPctDiv', 'PersPerFam', 'PctFam2Par', 'PctKids2Par', 'PctYoungKids2Par', 'PctTeen2Par', 'PctWorkMomYoungKids', 'PctWorkMom', 'NumKidsBornNeverMar', 'PctKidsBornNeverMar', 'NumImmig', 'PctImmigRecent', 'PctImmigRec5', 'PctImmigRec8', 'PctImmigRec10', 'PctRecentImmig', 'PctRecImmig5', 'PctRecImmig8', 'PctRecImmig10', 'PctSpeakEnglOnly', 'Pct

In [15]:
# Create a column for the sum of all crimes
df["total_crime"] = df["murders"] + df["rapes"] + df["robberies"] + df["assaults"] + df["burglaries"] + df["larcenies"] + df["autoTheft"] + df["arsons"]

In [40]:
df.head(10)

Unnamed: 0,communityName,state,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,...,burglPerPop,larcenies,larcPerPop,autoTheft,autoTheftPerPop,arsons,arsonsPerPop,ViolentCrimesPerPop,nonViolPerPop,total_crime
0,BerkeleyHeightstownship,NJ,11980,3.1,1.37,91.78,6.5,1.88,12.47,21.44,...,114.85,138.0,1132.08,16.0,131.26,2.0,16.41,41.02,1394.59,175.0
1,Marpletownship,PA,23123,2.82,0.8,95.57,3.44,0.85,11.01,21.3,...,242.37,376.0,1598.78,26.0,110.55,1.0,4.25,127.56,1955.95,490.0
2,Tigardcity,OR,29344,2.43,0.74,94.33,3.43,2.35,11.36,25.88,...,758.14,1797.0,4972.19,136.0,376.3,22.0,60.87,218.59,6167.51,2308.0
3,Gloversvillecity,NY,16656,2.4,1.7,97.35,0.5,0.7,12.55,25.2,...,1301.78,716.0,4142.56,47.0,271.93,-999.0,-999.0,306.64,-999.0,42.0
4,Bemidjicity,MN,11245,2.76,0.53,89.16,1.17,0.52,24.46,40.53,...,728.93,1060.0,8490.87,91.0,728.93,5.0,40.05,-999.0,9988.79,266.0
5,Springfieldcity,MO,140494,2.45,2.51,95.65,0.9,0.95,18.09,32.89,...,1386.46,7690.0,5091.64,454.0,300.6,134.0,88.72,442.95,6867.42,11041.0
6,Norwoodtown,MA,28700,2.6,1.6,96.57,1.47,1.1,11.17,27.41,...,372.09,288.0,974.19,144.0,487.1,17.0,57.5,226.63,1890.88,626.0
7,Andersoncity,IN,59459,2.45,14.2,84.87,0.4,0.63,15.31,27.93,...,997.6,2250.0,3691.79,125.0,205.1,9.0,14.77,439.73,4909.26,3260.0
8,Fargocity,ND,74111,2.46,0.35,97.11,1.25,0.73,16.64,35.16,...,532.66,3149.0,3946.71,206.0,258.18,8.0,10.03,115.31,4747.58,3880.0
9,Wacocity,TX,103590,2.62,23.14,67.6,0.92,16.35,19.88,34.55,...,2221.81,6121.0,5673.63,1070.0,991.8,18.0,16.68,1544.24,8903.93,11272.0


In [17]:
selected_features = df.columns.tolist()
print(selected_features)

['communityName', 'state', 'population', 'householdsize', 'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp', 'agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', 'numbUrban', 'pctUrban', 'medIncome', 'pctWWage', 'pctWFarmSelf', 'pctWInvInc', 'pctWSocSec', 'pctWPubAsst', 'pctWRetire', 'medFamInc', 'perCapInc', 'whitePerCap', 'blackPerCap', 'indianPerCap', 'AsianPerCap', 'OtherPerCap', 'HispPerCap', 'NumUnderPov', 'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore', 'PctUnemployed', 'PctEmploy', 'PctEmplManu', 'PctEmplProfServ', 'PctOccupManu', 'PctOccupMgmtProf', 'MalePctDivorce', 'MalePctNevMarr', 'FemalePctDiv', 'TotalPctDiv', 'PersPerFam', 'PctFam2Par', 'PctKids2Par', 'PctYoungKids2Par', 'PctTeen2Par', 'PctWorkMomYoungKids', 'PctWorkMom', 'NumKidsBornNeverMar', 'PctKidsBornNeverMar', 'NumImmig', 'PctImmigRecent', 'PctImmigRec5', 'PctImmigRec8', 'PctImmigRec10', 'PctRecentImmig', 'PctRecImmig5', 'PctRecImmig8', 'PctRecImmig10', 'PctSpeakEnglOnly', 'Pct

In [18]:
del selected_features[104:]
del selected_features[0:2]
del selected_features[2:6]
del selected_features[17:23]

In [19]:
print(selected_features)

['population', 'householdsize', 'agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', 'numbUrban', 'pctUrban', 'medIncome', 'pctWWage', 'pctWFarmSelf', 'pctWInvInc', 'pctWSocSec', 'pctWPubAsst', 'pctWRetire', 'medFamInc', 'perCapInc', 'NumUnderPov', 'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore', 'PctUnemployed', 'PctEmploy', 'PctEmplManu', 'PctEmplProfServ', 'PctOccupManu', 'PctOccupMgmtProf', 'MalePctDivorce', 'MalePctNevMarr', 'FemalePctDiv', 'TotalPctDiv', 'PersPerFam', 'PctFam2Par', 'PctKids2Par', 'PctYoungKids2Par', 'PctTeen2Par', 'PctWorkMomYoungKids', 'PctWorkMom', 'NumKidsBornNeverMar', 'PctKidsBornNeverMar', 'NumImmig', 'PctImmigRecent', 'PctImmigRec5', 'PctImmigRec8', 'PctImmigRec10', 'PctRecentImmig', 'PctRecImmig5', 'PctRecImmig8', 'PctRecImmig10', 'PctSpeakEnglOnly', 'PctNotSpeakEnglWell', 'PctLargHouseFam', 'PctLargHouseOccup', 'PersPerOccupHous', 'PersPerOwnOccHous', 'PersPerRentOccHous', 'PctPersOwnOccup', 'PctPersDenseHous', 'PctHousLess3BR', 

In [20]:
len(selected_features)

92

In [21]:
X = df[selected_features]
y = df["total_crime"]

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [23]:
model = RandomForestRegressor(random_state=2)
model.fit(X_train, y_train)

In [24]:
y_pred = model.predict(X_test)

In [25]:
r_squared = metrics.r2_score(y_test, y_pred)
print('R-Squared Score:', r_squared)

mae = metrics.mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

R-Squared Score: 0.917152075421758
Mean Absolute Error: 819.2516930022573


In [26]:
feature_imp = pd.Series(model.feature_importances_, index=selected_features).sort_values(ascending=False)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(feature_imp)

numbUrban                0.240836
NumUnderPov              0.234937
population               0.149663
NumKidsBornNeverMar      0.112768
NumStreet                0.055227
NumImmig                 0.050250
NumInShelters            0.040661
HousVacant               0.030032
PctUsePubTrans           0.007150
PctLess9thGrade          0.004702
PopDens                  0.004642
TotalPctDiv              0.003828
PctSameState85           0.003152
PctNotSpeakEnglWell      0.003034
PctEmploy                0.002373
PctPersOwnOccup          0.002158
PctWOFullPlumb           0.002119
PctOccupMgmtProf         0.002118
PersPerFam               0.001985
PersPerOwnOccHous        0.001943
PctHousLess3BR           0.001909
PctRecentImmig           0.001909
agePct12t29              0.001809
PctWorkMomYoungKids      0.001737
PctLargHouseFam          0.001677
PersPerRentOccHous       0.001656
MedRent                  0.001608
PctBSorMore              0.001585
agePct16t24              0.001520
PctEmplManu   

In [None]:
list_of_features = feature_imp.index.tolist()

In [47]:
selected_features = list_of_features[:11]

In [48]:
X = df[selected_features]
y = df["total_crime"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
model = RandomForestRegressor(random_state=1)
#model.fit(X_train, y_train)
#y_pred = model.predict(X_test)

In [49]:
params = {
    'criterion' : ['squared_error', 'absolute_error'],
    'max_depth': [2, 5, 7], 
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 10, 20],
         }

grid_search_cv =  GridSearchCV( 
    estimator = RandomForestRegressor(), 
    param_grid = params, 
    scoring = 'neg_mean_absolute_error')

grid_search_cv.fit(X_train, y_train)

print(grid_search_cv.best_params_)

model = grid_search_cv.best_estimator_

print(model)

{'criterion': 'absolute_error', 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2}
RandomForestRegressor(criterion='absolute_error', max_depth=7)


In [53]:
y_pred = model.predict(X_test)

r_squared = metrics.r2_score(y_test, y_pred)
print('R-Squared Score:', r_squared)

mae = metrics.mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

R-Squared Score: 0.9450268203785896
Mean Absolute Error: 809.8956772009029


In [54]:
feature_imp = pd.Series(model.feature_importances_,index=selected_features).sort_values(ascending=False)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(feature_imp)

NumUnderPov            0.270335
population             0.249141
numbUrban              0.226576
NumKidsBornNeverMar    0.068923
HousVacant             0.051574
NumImmig               0.036785
NumInShelters          0.033048
NumStreet              0.023686
PopDens                0.018467
PctLess9thGrade        0.013213
PctUsePubTrans         0.008253
dtype: float64
