# Predicting the non-violent crime rate (total number of non-violent crimes per 100K population)

In [1]:
#installing dependencies 

!pip install Seaborn
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt




# Data Loading and Preprocessing 

Loading data and cleaning it

In [2]:
#Google drive mount and importing data 
from google.colab import drive
drive.mount('/content/drive')

# load dataset
df = pd.read_csv('/content/drive/MyDrive/1.ASP2022/crime_ml.csv')

# drop unnecessary columns with string values 

df.drop(columns = ['communityname', 'state'], inplace=True, errors='raise')

#resetting indexes 
df = df.reset_index()


#finding out the number of null values in the dataset: I can't use mean or median here to replace them because of their nature  
print(df.isnull().sum())

#fill NaN values with 0 to let the machine know that these are missing values anyway 
df.fillna(0, inplace=True)

#checking again if missin values are removed 
print(df.isnull().sum())

# summarize shape
print(df.shape)
# summarize first few lines
df.head()
#checks
df.describe()


  


Mounted at /content/drive
index                     0
countycode             1221
communitycode          1224
fold                      0
population                0
                       ... 
autotheftperpop           3
arsons                   91
arsonsperpop             91
violentcrimesperpop     221
nonviolperpop            97
Length: 146, dtype: int64
index                  0
countycode             0
communitycode          0
fold                   0
population             0
                      ..
autotheftperpop        0
arsons                 0
arsonsperpop           0
violentcrimesperpop    0
nonviolperpop          0
Length: 146, dtype: int64
(2215, 146)


Unnamed: 0,index,countycode,communitycode,fold,population,householdsize,racepctblack,racepctwhite,racepctasian,racepcthisp,...,burglaries,burglperpop,larcenies,larcperpop,autotheft,autotheftperpop,arsons,arsonsperpop,violentcrimesperpop,nonviolperpop
count,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,...,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0
mean,1107.0,29.432957,20226.802709,5.494357,53117.98,2.707327,9.335102,83.979819,2.670203,7.950176,...,760.205869,1032.030523,2134.734086,3368.41078,515.992777,473.323689,29.637923,30.832695,530.304005,4693.298483
std,639.559744,85.392208,28188.955806,2.872924,204620.3,0.33412,14.247156,16.41908,4.473843,14.589832,...,3109.719923,763.783732,7595.829308,1904.074527,3256.011544,504.625282,176.491345,38.952603,609.438219,2861.188528
min,0.0,0.0,0.0,1.0,10005.0,1.6,0.0,2.68,0.03,0.12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,553.5,0.0,0.0,3.0,14366.0,2.5,0.86,76.319999,0.62,0.93,...,94.5,510.06999,391.5,2035.6,30.0,156.55,1.0,6.46,113.13,2725.99
50%,1107.0,0.0,0.0,5.0,22792.0,2.66,2.87,90.349998,1.23,2.18,...,205.0,821.65997,745.0,3077.53,75.0,301.54999,5.0,19.75,316.54999,4277.3101
75%,1660.5,23.0,41705.0,8.0,43024.0,2.85,11.145,96.225002,2.67,7.81,...,507.5,1348.67,1673.0,4334.67505,231.5,589.225005,15.0,41.435,722.0,6100.8699
max,2214.0,840.0,94597.0,10.0,7322564.0,5.28,96.669998,99.629997,57.459999,95.290001,...,99207.0,11881.02,235132.0,25910.551,112464.0,4968.5898,5119.0,436.37,4877.0601,27119.76


Creating Arrays for the Features and the Response Variable

In [3]:
#creating an object of the target variable 
target_column= ['nonviolperpop']

#take all other features and exclude the target variable
predictors = list(set(list(df.columns))-set(target_column))

print(predictors)

#normalising the predictors because they might have significant different values
#here, normalization is done via scaling of the predictors between 0 and 1. 
df[predictors] = df[predictors]/df[predictors].max()

#finding out again if the number of null values in the dataset: I can't use mean or median here to replace them because of their nature  
print(df.isnull().sum())

#Remove missing values 
#df.fillna(0, inplace=True)

#checking again 
#print(df.isnull().sum())

#displaying the summary of the normalized data
df.describe()


['lemastotreqperpop', 'persperfam', 'pctkidsbornnevermar', 'pctyoungkids2par', 'pctpolicwhite', 'arsonsperpop', 'arsons', 'robberies', 'medowncostpctinc', 'communitycode', 'lemasgangunitdeploy', 'pcthousownocc', 'agepct12t21', 'violentcrimesperpop', 'pctlarghouseoccup', 'pctwsocsec', 'otherpercap', 'malepctnevmarr', 'racialmatchcommpol', 'landarea', 'numburban', 'rentmedian', 'assaultperpop', 'blackpercap', 'malepctdivorce', 'burglperpop', 'pctsamecity85', 'pctworkmom', 'assaults', 'pctfam2par', 'pctpersownoccup', 'pctemplmanu', 'policoperbudg', 'pctsamehouse85', 'percapinc', 'pcthousoccup', 'lemasswftfieldperpop', 'racepcthisp', 'burglaries', 'lemaspctpoliconpatr', 'pcthousnophone', 'lemasswornft', 'lemasswftperpop', 'policreqperoffic', 'medfaminc', 'pctspeakenglonly', 'persperoccuphous', 'ownoccmedval', 'autotheft', 'numinshelters', 'agepct65up', 'femalepctdiv', 'hisppercap', 'pctworkmomyoungkids', 'pctrecentimmig', 'pctvacantboarded', 'pctpolicblack', 'pctpolicminor', 'pctlarghousef

Unnamed: 0,index,countycode,communitycode,fold,population,householdsize,racepctblack,racepctwhite,racepctasian,racepcthisp,...,burglaries,burglperpop,larcenies,larcperpop,autotheft,autotheftperpop,arsons,arsonsperpop,violentcrimesperpop,nonviolperpop
count,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,...,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0
mean,0.5,0.035039,0.213821,0.549436,0.007254,0.512751,0.096567,0.842917,0.046471,0.083431,...,0.007663,0.086864,0.009079,0.130002,0.004588,0.095263,0.00579,0.070657,0.108734,4693.298483
std,0.288871,0.101657,0.29799,0.287292,0.027944,0.06328,0.147379,0.164801,0.07786,0.15311,...,0.031346,0.064286,0.032305,0.073486,0.028952,0.101563,0.034478,0.089265,0.12496,2861.188528
min,0.0,0.0,0.0,0.1,0.001366,0.30303,0.0,0.0269,0.000522,0.001259,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.25,0.0,0.0,0.3,0.001962,0.473485,0.008896,0.766034,0.01079,0.00976,...,0.000953,0.042931,0.001665,0.078563,0.000267,0.031508,0.000195,0.014804,0.023196,2725.99
50%,0.5,0.0,0.0,0.5,0.003113,0.503788,0.029689,0.906855,0.021406,0.022878,...,0.002066,0.069157,0.003168,0.118775,0.000667,0.060691,0.000977,0.04526,0.064906,4277.3101
75%,0.75,0.027381,0.44087,0.8,0.005876,0.539773,0.115289,0.965824,0.046467,0.08196,...,0.005116,0.113515,0.007115,0.167294,0.002058,0.11859,0.00293,0.094954,0.14804,6100.8699
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,27119.76


K Fold analysis: splitting data into a training set and a testing set 

In [4]:
#splitting data 

X = df[predictors].values
y = df[target_column].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40)

print(X_train.shape); print(X_test.shape)


(1550, 145)
(665, 145)


# The models

# LASSO ( Least Absolute Shrinkage and Selection Operator)

In [5]:
### ON THE TRAINING SET##

#Setting the alpha value 
model_lasso = Lasso(alpha=0.01)

#fitting the model to the training data
model_lasso.fit(X_train, y_train) 

#the actual prediction process 
pred_train_lasso= model_lasso.predict(X_train)

#evaluation metrics: R-sqaured and RMSE of course!
print('RMSE for training:', np.sqrt(mean_squared_error(y_train,pred_train_lasso)))
print('R-Squared for training', r2_score(y_train, pred_train_lasso))

### ON THE TESTING SET##

#the actual prediction process 
pred_test_lasso= model_lasso.predict(X_test)

#evaluation metrics: R-sqaured and RMSE of course!
print('RMSE for testing:',np.sqrt(mean_squared_error(y_test,pred_test_lasso))) 
print('R-Squared for testing', r2_score(y_test, pred_test_lasso))


RMSE for training: 946.5026853670738
R-Squared for training 0.8856709777649749
RMSE for testing: 1273.6227599179683
R-Squared for testing 0.8194703995089188


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


# Ridge Regression 

In [6]:
### ON THE TRAINING SET##

#Setting the alpha value 
rr = Ridge(alpha=0.01)

#fitting the model to the training data
rr.fit(X_train, y_train) 

#the actual prediction process 
pred_train_rr= rr.predict(X_train)


#evaluation metrics: R-sqaured and RMSE of course!
print( 'RMSE for training:', np.sqrt(mean_squared_error(y_train,pred_train_rr)))
print( 'R-Squared for training', r2_score(y_train, pred_train_rr))


### ON THE TESTING SET##

#the actual prediction process 
pred_test_rr= rr.predict(X_test)


#evaluation metrics: R-sqaured and RMSE of course!
print('RMSE for testing:',np.sqrt(mean_squared_error(y_test,pred_test_rr))) 
print('R-Squared for testing', r2_score(y_test, pred_test_rr))

RMSE for training: 953.4764907319002
R-Squared for training 0.8839800250950716
RMSE for testing: 1187.4263073582624
R-Squared for testing 0.8430793381520818


# ElasticNet Regression 

In [7]:
#Check above comments in other methods, it's pretty the same

#For the training set 

model_enet = ElasticNet(alpha = 0.01)
model_enet.fit(X_train, y_train) 
pred_train_enet= model_enet.predict(X_train)
print('RMSE for training:', np.sqrt(mean_squared_error(y_train,pred_train_enet)))
print('R-Squared for training:', r2_score(y_train, pred_train_enet))

# For the testing set 

pred_test_enet= model_enet.predict(X_test)
print('RMSE for testing:', np.sqrt(mean_squared_error(y_test,pred_test_enet)))
print('R-Squared for testing:', r2_score(y_test, pred_test_enet))



RMSE for training: 1387.5969024573192
R-Squared for training: 0.7542806081382679
RMSE for testing: 1585.8672342961765
R-Squared for testing: 0.7201015775630772


# Using K-fold regularisation parameters for each model

LASSO

In [8]:
from sklearn.linear_model import LassoCV

# Lasso with 5 fold cross-validation
model = LassoCV(cv=5, random_state=0, max_iter=10000)

# Fit model
model.fit(X_train, y_train)

LassoCV(cv=5, max_iter=10000, random_state=0)

# Here, this will show best value of penalization chosen by cross validation:

model.alpha_

  y = column_or_1d(y, warn=True)


2.1805591756714717

In [9]:
#Now predicting with the best alpha 

# Set best alpha
lasso_best = Lasso(alpha=model.alpha_)
lasso_best.fit(X_train, y_train)

#Show model coefficients and names:

#print(list(zip(lasso_best.coef_, X)))

#the actual prediction 
print('R squared training set', round(lasso_best.score(X_train, y_train)*100, 2))
print('R squared test set', round(lasso_best.score(X_test, y_test)*100, 2))

R squared training set 86.61
R squared test set 86.15


Ridge Regression

In [10]:
from sklearn.linear_model import RidgeCV

# Ridge with 5 fold cross-validation
model = RidgeCV(cv=5)

# Fit model
model.fit(X_train, y_train)

RidgeCV(cv=5)

# Here, this will show best value of penalization chosen by cross validation:

model.alpha_

0.1

In [11]:
#Now predicting with the best alpha 

# Set best alpha
ridge_best = Ridge(alpha=model.alpha_)
ridge_best.fit(X_train, y_train)

#Show model coefficients and names:

#print(list(zip(lasso_best.coef_, X)))

#the actual prediction 
print('R squared training set', round(ridge_best.score(X_train, y_train)*100, 2))
print('R squared test set', round(ridge_best.score(X_test, y_test)*100, 2))

R squared training set 87.9
R squared test set 85.61


Elastic Net Regression

In [12]:
from sklearn.linear_model import ElasticNetCV

# Lasso with 5 fold cross-validation
model = ElasticNetCV(cv=5, random_state=0, max_iter=10000)

# Fit model
model.fit(X_train, y_train)

ElasticNetCV(cv=5, max_iter=10000, random_state=0)

# Here, this will show best value of penalization chosen by cross validation:

model.alpha_

  y = column_or_1d(y, warn=True)


0.5014226332213989

In [13]:
#Now predicting with the best alpha 

# Set best alpha
ElasticNet_best = ElasticNet(alpha=model.alpha_)
ElasticNet_best.fit(X_train, y_train)

#Show model coefficients and names:

#print(list(zip(lasso_best.coef_, X)))

#the actual prediction 
print('R squared training set', round(ElasticNet_best.score(X_train, y_train)*100, 2))
print('R squared test set', round(ElasticNet_best.score(X_test, y_test)*100, 2))

R squared training set 38.31
R squared test set 33.92
