# DSO 510 PROJECT TEAM 3

1. Chinmayi Bengaluru Prakash
2. Daniel Strangio
3. Hemanth Mallagatta Ravishankar
4. Naveen Kumar Manjunatha
5. Sravanthi Kuchibhotla
6. Vicky Choi

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
import matplotlib
%matplotlib inline
import matplotlib.colors as mcolors
import warnings
import math
warnings.filterwarnings('ignore')

import plotly.express as px

In [None]:
#Reading the dataset
# Load the crime-incident CSV export into a DataFrame and preview its first rows.
# The path is relative, so the file must sit in the notebook's working directory.
df_raw = pd.read_csv("Crime Data_Nov2018_Present.csv")
df_raw.head()

In [None]:
# Number of rows per value of the Year column
df_raw.Year.value_counts()

In [None]:
# Drop every row that has a missing value in any column.
# NOTE: df_raw is rebound here, so the unfiltered data is only recoverable by re-reading the CSV.
df_raw = df_raw.dropna()

In [None]:
#Filtering for the required columns
columns = ['ID','Case Number','Date','Block','Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic', 'Year'
          ,'Latitude', 'Longitude']
# .copy() makes df an independent frame. Later cells add columns to df
# (Frequency, Severity, dayofweek, ...); writing to a bare selection of df_raw
# is what triggers pandas' SettingWithCopyWarning and can leave df unmodified.
df = df_raw[columns].copy()

In [None]:
#Distribution of Crime Incidents by Crime Type

# Share of all incidents per crime type, in percent (2 decimals), smallest first
df1 = (df["Primary Type"].value_counts(normalize=True) * 100).round(2).sort_values()

# Green -> yellow -> red ramp; each bar is coloured by its share relative to the largest share
cmap = mcolors.LinearSegmentedColormap.from_list("", ["green", "yellow", "red"])
bar_colors = cmap(df1.values / df1.values.max())

plt.figure(figsize=(10, 10))
plt.title('Distribution of Crime Incidents by Type', fontweight='bold')
plt.ylabel('Crime Type', fontweight='bold')
plt.xlabel('Percentage of Total', fontweight='bold')
plt.barh(df1.index, df1, color=bar_colors)

In [None]:
#Based on the results categorizing the Crime type into Frequency buckets

c_high = ['THEFT','BATTERY','CRIMINAL DAMAGE','ASSAULT','DECEPTIVE PRACTICE','OTHER OFFENSE']
c_mid = ['MOTOR VEHICLE THEFT','NARCOTICS','BURGLARY','ROBBERY','WEAPONS VIOLATION','CRIMINAL TRESPASS']
c_combined = c_high + c_mid

# Every crime type present in the data that is not in the high/mid lists is low-frequency
c_all = [crime for crime in df["Primary Type"].unique() if crime not in c_combined]
c_low = c_all

In [None]:
# Creating Frequency Column based on Crime type

# One lookup from crime type to bucket. Later updates win, so the order
# low -> mid -> high keeps the original precedence (High was applied first).
frequency_by_type = {}
frequency_by_type.update(dict.fromkeys(c_low, "Low"))
frequency_by_type.update(dict.fromkeys(c_mid, "Mid"))
frequency_by_type.update(dict.fromkeys(c_high, "High"))

# A single replace assigned back to the column. The previous
# df['Frequency'].replace(..., inplace=True) loop is chained in-place mutation,
# which pandas warns about and which is not guaranteed to write through to df.
df['Frequency'] = df['Primary Type'].replace(frequency_by_type)

In [None]:
#Creating the Severity variable based on the level of Punishment defined by the Law
#L1-2 = low severity, L3-4 = medium severity, L5+ = high severity
#https://pap.georgia.gov/sites/pap.georgia.gov/files/CSL-s_Post_1-1-2006_considerations.pdf

sev_High = [
    "HUMAN TRAFFICKING", "KIDNAPPING", "ROBBERY", "PROSTITUTION",
    "CRIM SEXUAL ASSAULT", "CRIMINAL SEXUAL ASSAULT", "SEX OFFENSE",
    "OFFENSE INVOLVING CHILDREN", "ASSAULT", "BATTERY",
]
sev_Mid = ["NARCOTICS", "BURGLARY", "WEAPONS VIOLATION", "HOMICIDE", "ARSON"]
sev_combined = sev_High + sev_Mid

# Every crime type present in the data that is not listed above is low severity
sev_all = [crime for crime in df["Primary Type"].unique() if crime not in sev_combined]
sev_Low = sev_all

# Start the Severity column as a copy of the crime type; the next cell maps it to buckets
df['Severity'] = df['Primary Type']


In [None]:
# Map each crime type to its severity bucket.
# Later updates win, so low -> mid -> high keeps the original precedence.
severity_by_type = {}
severity_by_type.update(dict.fromkeys(sev_Low, "Low"))
severity_by_type.update(dict.fromkeys(sev_Mid, "Mid"))
severity_by_type.update(dict.fromkeys(sev_High, "High"))

# Assign the result back instead of chained df['Severity'].replace(..., inplace=True),
# which is not guaranteed to write through to df. Deriving from 'Primary Type'
# also makes this cell safe to re-run.
df['Severity'] = df['Primary Type'].replace(severity_by_type)

In [None]:
#Adding Day of the week column to the dataset
# errors='coerce' turns unparseable date strings into NaT instead of raising
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Weekday name via strftime("%A"), e.g. "Monday"
df['dayofweek'] = df['Date'].dt.strftime("%A")

In [None]:
#Creating Day-Night Variable

df['Date'] = pd.to_datetime(df['Date'])
timestamps = df['Date'].dt
df['Time'] = timestamps.time
df['Hour'] = timestamps.hour
# Hours 6 through 18 (both inclusive) are labelled 'Day'; everything else is 'Night'
df['DayNight'] = ['Day' if 6 <= hour <= 18 else 'Night' for hour in df['Hour']]

## EDA and Data Visualization

In [None]:
# Columns available after the feature-engineering cells above
df.columns

In [None]:
#Crimes per day of the week

# Count incidents (rows) per weekday. The previous version summed the boolean
# 'Arrest' column, which plots the number of ARRESTS while the title and axis
# label say "Crime Incidents".
graph_data = df.groupby('dayofweek')['ID'].count().reset_index(name='Incidents')

fig = px.bar(graph_data, x='dayofweek', y='Incidents',
             hover_data=['dayofweek', 'Incidents'],
             labels=dict(dayofweek="Day of Week", Incidents="Number of Crime Incidents"),
             height=400)

# Black frame around the plotting area
fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=True)

fig.update_layout(
    title={
        'text': "Weekly Number of Crime Incidents",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})

fig.show()





In [None]:
# Arrests per day of the week

# Incident count for every (weekday, arrest outcome) pair, largest first
outcome_counts = df.groupby(['dayofweek', 'Arrest'])['ID'].count()
graph_data1 = outcome_counts.sort_values(ascending=False).reset_index()

fig = px.bar(
    graph_data1,
    x='dayofweek',
    y='ID',
    color='Arrest',
    hover_data=['dayofweek', 'Arrest'],
    labels=dict(dayofweek="Day of Week", ID="Number of Arrests"),
    height=400,
    title='Weekly Number of Arrests',
)

# Black frame around the plotting area, applied to both axes
axis_frame = dict(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_xaxes(**axis_frame)
fig.update_yaxes(**axis_frame)

# Centre the title
title_layout = {
    'text': "Weekly Number of Arrests",
    'y': 0.9,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
}
fig.update_layout(title=title_layout)

fig.show()

In [None]:
# Severity vs. Frequency
# Stacked count of incidents per Frequency bucket, coloured by Severity bucket

sns.histplot(binwidth=0.5, x="Frequency", hue="Severity", data=df, stat="count", multiple="stack")

In [None]:
# Day of the month for each incident.
# NOTE: the next cell repeats both of these statements before plotting.
df['Day']=df['Date'].dt.day

# Sum of the 'Arrest' column per day of the month
graph_data3 = pd.DataFrame(df.groupby('Day')['Arrest'].sum()).reset_index()

In [None]:
# Crimes per day of the month

df['Day'] = df['Date'].dt.day

# Count incidents (rows) per day of the month. The previous version summed the
# boolean 'Arrest' column, i.e. it plotted arrests under a "Crime Incidents" label.
graph_data3 = df.groupby('Day')['ID'].count().reset_index(name='Incidents')

fig = px.bar(graph_data3, x='Day', y='Incidents',
             hover_data=['Day', 'Incidents'],
             color='Incidents',
             labels=dict(Day="Day of Month", Incidents="Daily Number of Crime Incidents"),
             height=400)

# Black frame around the plotting area
fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=True)

fig.update_layout(
    title={
        'text': "Daily Number of Crime Incidents",
        'y':0.9,
        'x':0.4,
        'xanchor': 'center',
        'yanchor': 'top'})

fig.show()

In [None]:
# Number of Arrests by Frequency Type

# Rows: Frequency bucket, columns: Arrest outcome, cells: incident counts
arrests_by_frequency = pd.crosstab(df.Frequency, df.Arrest)
ax = arrests_by_frequency.plot(kind='bar')
ax.set_title('Number of Arrests for Frequency Type')
ax.set_xlabel('Frequency Type')
ax.set_ylabel('Number of Arrests')
plt.show()

In [None]:
# Arrests vs. Time of Day
# Incident counts per Arrest outcome, with separate bars for Day and Night

sns.catplot(x = 'Arrest', kind ='count', hue='DayNight', data = df, color='orange')

In [None]:
# Arrest vs. Domestic or Non- Domestic

from statsmodels.graphics.mosaicplot import mosaic

# Work on an independent copy so relabelling does not touch df; the previous
# in-place replace on an uncopied selection is chained mutation that pandas
# warns about and may not apply.
df2 = df[['Domestic', 'Arrest']].copy()
df2["Domestic"] = df2["Domestic"].replace({True: "Domestic", False: "Non-Domestic"})
df2["Arrest"] = df2["Arrest"].replace({True: "Arrest", False: "No Arrest"})

# Tile colours keyed by (Domestic label, Arrest label): red = arrest, aqua = no arrest
props = {
    ('Domestic', 'Arrest'): {'facecolor': 'red', 'edgecolor': 'white'},
    ('Domestic', 'No Arrest'): {'facecolor': 'xkcd:aqua', 'edgecolor': 'white'},
    ('Non-Domestic', 'Arrest'): {'facecolor': 'red', 'edgecolor': 'white'},
    ('Non-Domestic', 'No Arrest'): {'facecolor': 'xkcd:aqua', 'edgecolor': 'white'},
}

mosaic(df2, ['Domestic', 'Arrest'], title = 'Does arrest depend on type of incident?', properties=props)

plt.show()

In [None]:
# Derive calendar helpers on the raw frame
df_raw['Date'] = pd.to_datetime(df_raw.Date)
df_raw['Date-Month'] = df_raw['Date'].dt.to_period('M')
df_raw['Month'] = df_raw.Date.dt.month

# Case count per (month, arrest outcome), divided by 3.
# NOTE(review): the divisor 3 is presumably the number of years in the data — confirm,
# since months covered by a different number of years would be mis-averaged.
monthly_counts = df_raw.groupby(['Month', 'Arrest'])['ID'].count() / 3
g2_data = monthly_counts.reset_index()

# Month number -> three-letter label
sorterIndex = dict(zip(
    range(1, 13),
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
))
g2_data['month_name'] = g2_data['Month'].map(sorterIndex)

g2_data = g2_data.sort_values('Month')
g2_data.head()

In [None]:
# Bar chart of g2_data: the 'ID' value per month, coloured by arrest outcome
fig = px.bar(g2_data, x='month_name', y='ID', color = 'Arrest',
             hover_data=['month_name', 'ID'],
             labels={'month_name':'Month of Year','ID':'Average # Cases'}, 
             title= "Average # Cases by month, split by arrest",
             height=400)
fig.show()

In [None]:
# Date, Date-Month (monthly period) and Month (1-12) columns on the raw frame.
# NOTE: these three statements also appear in the cell that builds g2_data.
df_raw['Date'] = pd.to_datetime(df_raw.Date)
df_raw['Date-Month'] = df_raw['Date'].dt.to_period('M')
df_raw['Month'] = df_raw.Date.dt.month

In [None]:
# Number of incidents per (Year, Month, Date-Month), stored in a 'Total Count' column
df_line = df_raw.groupby(['Year','Month','Date-Month'])[['ID']].count().reset_index().rename(columns = {'ID': 'Total Count'})

In [None]:
#Crimes by Month YoY
# One line per Year: month on the x-axis, incident count on the y-axis
fig = px.line(df_line, x='Month', y="Total Count", color = 'Year', title='Crimes by Month YoY')
fig.show()

In [None]:
# Incident count per (weekday, arrest outcome), stored in a 'Total Count' column.
# NOTE: this rebinds graph_data, which an earlier cell used for a different table.
graph_data = pd.DataFrame(df.groupby(['dayofweek','Arrest'])['ID'].count()).reset_index().rename(columns = {'ID': 'Total Count'})
graph_data

In [None]:
# Weekday name -> sort position, with Sunday as 0 through Saturday as 6
sorterIndex = {
    day: position
    for position, day in enumerate(
        ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
    )
}

In [None]:

# Order the rows Sunday..Saturday using the sorterIndex lookup
graph_data['Day_id'] = graph_data['dayofweek'].map(sorterIndex)
graph_data = graph_data.sort_values('Day_id')

# Cases per weekday, coloured by arrest outcome
fig = px.bar(
    graph_data,
    x='dayofweek',
    y='Total Count',
    color='Arrest',
    hover_data=['dayofweek', 'Total Count'],
    labels={'dayofweek': 'Day of Week', 'Total Count': '# Cases'},
    title='# Cases by Day of the Week',
    height=400,
)
fig.show()

In [None]:
!pip install folium

In [None]:
import folium
from folium import plugins
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Base map centred on latitude 41.8781, longitude -87.6298 at zoom level 11
m = folium.Map([41.8781, -87.6298], zoom_start=11)
m

In [None]:
# convert to (n, 2) nd-array format for heatmap
stationArr = df[['Latitude', 'Longitude']].to_numpy()

# plot heatmap
# add_child is the supported method for attaching a layer; the older
# add_children alias is deprecated and missing from recent folium/branca releases.
m.add_child(plugins.HeatMap(stationArr, radius=15))
m

In [None]:
# Preview the engineered frame before hypothesis testing
df.head()

## Hypothesis Testing

Hypothesis: Incidents that occur during the night are more likely to lead to an arrest than incidents that occur during the day.

Let p1 be the proportion of arrests during night \
Let p2 be the proportion of arrests during day 

Null Hypothesis Ho: The proportion of arrests during the night is less than or equal to the proportion of arrests during the day, p1 <= p2 \
Alternative Hypothesis Ha: The proportion of arrests during the night is greater than the proportion of arrests during the day, p1 > p2

In [None]:
# Split the incidents by the DayNight label
is_night = df["DayNight"] == 'Night'
is_day = df["DayNight"] == 'Day'

df_night = df[is_night]
df_day = df[is_day]

In [None]:
# Proportion of incidents that ended in an arrest, for night and for day.
# The mask is built from the same frame it is applied to; the previous version
# indexed df_night / df_day with a boolean Series built from the full df, which
# relies on index re-alignment and triggers a pandas warning.
# The proportions are kept at full precision (they feed the z-statistic below)
# and are rounded only for display.
p1 = (df_night['Arrest'] == True).sum() / df_night['Arrest'].count()
p2 = (df_day['Arrest'] == True).sum() / df_day['Arrest'].count()

print("p1:", round(p1, 5))
print("p2:", round(p2, 5))

In [None]:
# Sample sizes: number of non-null 'Arrest' values in the night and day subsets
n1 = df_night['Arrest'].count()
n2 = df_day['Arrest'].count()

print("n1:",n1)
print("n2:",n2)

In [None]:
# Two-proportion z statistic: difference in proportions divided by the
# unpooled standard error sqrt(p1(1-p1)/n1 + p2(1-p2)/n2)
z_obs = (p1 - p2) / math.sqrt(((p2*(1-p2))/n2) + ((p1*(1-p1))/n1))
print("z_obs: ",z_obs)

In [None]:
import scipy.stats as st


In [None]:
# Right-tailed p-value P(Z > z_obs).
# sf (survival function) is 1 - cdf computed directly; for a large z_obs,
# 1 - cdf(z) cancels to exactly 0.0 while sf keeps the small tail probability.
p_value = st.norm.sf(z_obs)
print("p_value: ",p_value)

At the 5% significance level (95% confidence), the p-value is less than 0.05, so we reject the null hypothesis and conclude that the proportion of incidents leading to an arrest is higher during the night.

### Hypothesis Testing for Arrests vs. Domestic/ Non-Domestic

Hypothesis: Probability of an arrest is higher in non-domestic incidents compared to domestic incidents

Let p1 be the proportion of arrests in non-domestic incidents
<br>
Let p2 be the proportion of arrests in domestic incidents

Null Hypothesis: There is no difference between the proportion of arrests in domestic and non-domestic incidents ( p1-p2 ) = 0
<br>
Alternate Hypothesis: The proportion of arrests in non-domestic incidents is greater than domestic incidents ( p1-p2 ) > 0




In [None]:
# Arrest labels for the non-domestic and the domestic incidents
non_domestic_arrest = df2.loc[df2["Domestic"] == "Non-Domestic", 'Arrest']
domestic_arrest = df2.loc[df2["Domestic"] == "Domestic", 'Arrest']

# Sample sizes
n1 = non_domestic_arrest.count()
n2 = domestic_arrest.count()

# Share of each group labelled 'Arrest'
p1 = (non_domestic_arrest == 'Arrest').sum() / n1
p2 = (domestic_arrest == 'Arrest').sum() / n2
print(f'Proportion of arrests in non-domestic incidents; p1 = {p1}')

In [None]:
# Two-proportion z statistic with the unpooled standard error
z = (p1 - p2)/math.sqrt(((p1*(1-p1))/n1) + ((p2*(1-p2))/n2))

In [None]:
# Right-tailed p-value P(Z > z). sf is 1 - cdf computed directly, so a very
# small tail probability is not lost to cancellation (1 - cdf can return exactly 0.0).
p_value = st.norm.sf(z)

In [None]:
# Report the p-value of the domestic vs. non-domestic test
print(f'p-value: {p_value}')

Since the p-value is almost 0, we can reject the null hypothesis and conclude that the probability of an arrest is greater in non-domestic incidents than in domestic incidents.

## Model to predict whether the criminal incident will lead to an arrest or not:

In [None]:
#Creating Dummies for Categorical variables

# Assign the relabelled column back instead of chained
# df['Domestic'].replace(..., inplace=True), which pandas warns about and which
# is not guaranteed to write through to df.
df['Domestic'] = df['Domestic'].replace({True: "Domestic", False: 'Non Domestic'})

# dtype=int keeps the indicator columns numeric. Newer pandas defaults to bool
# dummies, which statsmodels rejects once they are mixed with numeric columns.
df_model = pd.get_dummies(df, columns= ['DayNight','Domestic','dayofweek','Frequency','Severity'], dtype=int)
df_model.columns

In [None]:
# Columns removed before modelling: identifiers and raw fields, plus one
# indicator per dummy group so that group has a reference level.
# NOTE(review): the 'Day' column added in the EDA section is not in this list,
# so it stays in df_model as a feature — confirm that is intended.
model_drop_columns = [
    'ID', 'Case Number', 'Date', 'Block', 'Primary Type', 'Description',
    'Location Description', 'Year', 'Latitude', 'Longitude',
    'Time', 'Hour',
    'DayNight_Day', 'Domestic_Non Domestic', 'dayofweek_Sunday',
    'Frequency_High', 'Severity_High',
]
df_model = df_model.drop(columns=model_drop_columns)
df_model.columns

In [None]:
#Defining Target variable as 1, 0
# Cast True/False to 1/0 and assign back. The previous
# df_model['Arrest'].replace(..., inplace=True) is chained in-place mutation,
# which is not guaranteed to write through to df_model.
df_model['Arrest'] = df_model['Arrest'].astype(int)

# data_final is another name for the same frame, not a copy
data_final = df_model

In [None]:
!pip install imbalanced-learn

In [None]:
# Over-sampling using SMOTE
# With the training data created, the minority Arrest class is up-sampled using the SMOTE algorithm (Synthetic Minority Oversampling Technique).
# At a high level, SMOTE:
# Works by creating synthetic samples from the minority class instead of creating copies.
# Randomly chooses one of the k-nearest-neighbors and uses it to create a similar, but randomly tweaked, new observation.
# NOTE(review): which Arrest value is the minority class is not visible here — confirm from the counts printed below.

# Features: every column except 'Arrest'; target: a one-column frame holding 'Arrest'
X = data_final.loc[:, data_final.columns != 'Arrest']
y = data_final.loc[:, data_final.columns == 'Arrest']

from imblearn.over_sampling import SMOTE
# NOTE: the name `os` would shadow the standard-library module if that were imported in this notebook
os = SMOTE(random_state=0)
# 70/30 split; only the training part is resampled below
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns

os_data_X,os_data_y=os.fit_resample(X_train, y_train)

os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['Arrest'])

# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Number of no Arrest in oversampled data",len(os_data_y[os_data_y['Arrest']==0]))
print("Number of Arrests",len(os_data_y[os_data_y['Arrest']==1]))
print("Proportion of no Arrest data in oversampled data is ",len(os_data_y[os_data_y['Arrest']==0])/len(os_data_X))
print("Proportion of Arrest data in oversampled data is ",len(os_data_y[os_data_y['Arrest']==1])/len(os_data_X))

Now we have perfectly balanced data! Note that we over-sampled only the training data: because none of the information in the test data is used to create synthetic observations, no information bleeds from the test data into model training.

### Recursive Feature Elimination

Recursive Feature Elimination (RFE) is based on the idea of repeatedly constructing a model, choosing either the best or the worst performing feature, setting that feature aside, and then repeating the process with the rest of the features. This process is applied until all features in the dataset are exhausted. The goal of RFE is to select features by recursively considering smaller and smaller sets of features.

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Rank the features by recursively fitting a logistic regression on the
# oversampled training data and eliminating the weakest feature each round.
logreg = LogisticRegression()
# n_features_to_select must be passed by keyword: current scikit-learn rejects
# the positional form RFE(logreg, 20).
# NOTE(review): 20 looks larger than the number of columns left in os_data_X,
# in which case nothing is eliminated — confirm the intended value.
rfe = RFE(logreg, n_features_to_select=20)
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
print(rfe.support_)   # True for each feature that was kept
print(rfe.ranking_)   # 1 for kept features, higher numbers were eliminated earlier

The RFE has helped us select the following features used in the model

In [None]:
#Building the Logistic Regression Model to predict the Arrest happening

# Use the SMOTE-oversampled training data as the modelling set
X=os_data_X
y=os_data_y['Arrest']

import statsmodels.api as sm
# NOTE(review): no constant column is added to X here, so this Logit has no intercept term — confirm that is intended.
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

The p-values for most of the variables are smaller than 0.05

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Train on the whole SMOTE-balanced training set (X, y as set in the Logit cell
# above) and evaluate on the ORIGINAL hold-out (X_test, y_test) from the split
# made before oversampling. Re-splitting the oversampled data, as was done
# before, puts synthetic SMOTE rows into the "test" set and discards the real
# hold-out, so the reported scores do not reflect unseen, real data.
X_train, y_train = X, y
# The hold-out target is a one-column frame; flatten it to 1-D for the metrics below
y_test = np.ravel(y_test)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
#Predicting the test set results and calculating the accuracy

# Hard 0/1 class predictions for the test features
y_pred = logreg.predict(X_test)
# logreg.score gives the mean accuracy on (X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

In [None]:
#Confusion Matrix
from sklearn.metrics import confusion_matrix

# Store the result under its own name. Assigning it to `confusion_matrix`
# overwrote the imported function, which then breaks any later call to it.
# Rows are true classes and columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

The result is telling us that we have 103344+54997 correct predictions and 15510+63954 incorrect predictions

The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier to not label a sample as positive if it is negative.

The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.

The F-beta score can be interpreted as a weighted harmonic mean of the precision and recall, where an F-beta score reaches its best value at 1 and worst score at 0.

The F-beta score weights the recall more than the precision by a factor of beta. beta = 1.0 means recall and precision are equally important.

The support is the number of occurrences of each class in y_test.

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall, F1 and support for the test predictions
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Predicted probability of the positive class (Arrest = 1)
y_score = logreg.predict_proba(X_test)[:, 1]

# The AUC is computed from the same scores used to draw the curve. Passing the
# hard 0/1 predictions, as before, gives the area of a single-threshold curve
# and does not match the plotted ROC.
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # diagonal = random classifier
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()


The receiver operating characteristic (ROC) curve is another common tool used with binary classifiers. The dotted line represents the ROC curve of a purely random classifier; a good classifier stays as far away from that line as possible (toward the top-left corner).

## Model to predict the number of crime incidents per month

In [None]:
# Preview the incident-level frame that the monthly aggregation below is built from
df.head()

In [None]:
#Adding month column to dataset

df['Month'] = df['Date'].dt.month

df.head()

# Number of incidents for each combination of DayNight, Domestic, Frequency,
# Severity, Year and Month, stored in a 'Total_count' column
df_criminal_incidents = df.groupby(['DayNight', 'Domestic', 'Frequency', 'Severity', 'Year', 'Month'])['ID'].count().reset_index().rename(columns={'ID':'Total_count'})
df_criminal_incidents

In [None]:
# Checking for null values
# info() prints its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
df_criminal_incidents.info()

# Checking for outliers
print(df_criminal_incidents.describe())

In [None]:
#Creating dummy variables for categorical variables

# dtype=int keeps the indicator columns numeric. Newer pandas defaults to bool
# dummies, which statsmodels OLS rejects once a float constant column is added.
df_model = pd.get_dummies(df_criminal_incidents, columns= ['DayNight','Domestic','Frequency','Severity'], dtype=int)

In [None]:
# Drop Year and Month plus one indicator per dummy group (that group's reference level);
# what remains is Total_count and the kept dummy columns.
data_final = df_model.drop(['Year', 'Month', 'Domestic_Non Domestic', 'Frequency_High', 'Severity_High', 'DayNight_Day'] ,axis = 1)
data_final.head()

In [None]:
# Building a linear model


from sklearn.model_selection import train_test_split

# 70/30 train/test split. random_state=100 makes the split repeatable;
# np.random.seed(0) additionally seeds NumPy's global generator.
np.random.seed(0)
df_train, df_test = train_test_split(data_final, train_size = 0.7, test_size = 0.3, random_state = 100)

In [None]:
# Dividing the training data set into X and Y
# Select and drop instead of df_train.pop(...): pop removes the column from
# df_train in place, so re-running this cell raised a KeyError.
y_train = df_train['Total_count']
X_train = df_train.drop(columns='Total_count')

#Build a linear model

import statsmodels.api as sm
# statsmodels OLS has no implicit intercept; add the constant column explicitly
X_train_lm = sm.add_constant(X_train)

lr_1 = sm.OLS(y_train, X_train_lm).fit()

lr_1.summary()

In [None]:
# Checking for the VIF values of the variables.
from statsmodels.stats.outliers_influence import variance_inflation_factor

# One VIF per feature column of X_train, in column order
vif_values = [
    variance_inflation_factor(X_train.values, column_position)
    for column_position in range(X_train.shape[1])
]

# Table of feature names and their VIFs (2 decimals), highest VIF first
vif = pd.DataFrame({'Features': X_train.columns, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

Variance Inflation Factor or VIF is a quantitative value that says how much the feature variables are correlated with each other. It is an extremely important parameter to test our linear model.

In [None]:
import seaborn as sns
y_train_predicted = lr_1.predict(X_train_lm)
# Plot the histogram of the error terms (training residuals)
fig = plt.figure()
# histplot with a KDE overlay on a density scale replaces the deprecated
# sns.distplot; histplot is already used earlier in this notebook.
sns.histplot((y_train - y_train_predicted), bins = 20, kde = True, stat = "density")
fig.suptitle('Error Terms', fontsize = 20)                  # Plot heading 
plt.xlabel('Errors', fontsize = 18)                         

In [None]:
# Preview the training frame as left by the cells above
df_train.head()

In [None]:
# Testing on Test data set
from sklearn.model_selection import train_test_split

# Repeat the split with the same arguments (random_state = 100) as the training
# cell above to obtain df_test for evaluation.
np.random.seed(0)
df_train, df_test = train_test_split(data_final, train_size = 0.7, test_size = 0.3, random_state = 100)

In [None]:
# Separate the target and the features of the test split. Select and drop
# instead of pop, so df_test is not mutated and the cell can be re-run.
y_test = df_test['Total_count']
X_test = df_test.drop(columns='Total_count')

# Now let's use our model to make predictions.

# Align the test columns with the columns the model was trained on
X_test_new = X_test[X_train.columns]

# Adding a constant variable to the ALIGNED frame. The previous version passed
# X_test here, which discarded the column alignment done on the line above.
X_test_new = sm.add_constant(X_test_new)

# Making predictions
y_pred = lr_1.predict(X_test_new)

In [None]:
from sklearn.metrics import r2_score
# R² of the predictions against y_test
r2_score(y_true = y_test, y_pred = y_pred)

The R² value for the test data = 0.5907543414367202, which is pretty similar to the train data.

## Model to predict the number of arrests per month

In [None]:
# Number of incidents with Arrest == True for each combination of DayNight, Domestic,
# Frequency, Severity, Year and Month, stored in a 'Total_arrests' column.
# Combinations with no arrest rows do not appear in the result.
df_arrests = df[df['Arrest'] == True].groupby(['DayNight', 'Domestic', 'Frequency', 'Severity', 'Year', 'Month'])['ID'].count().reset_index().rename(columns={'ID':'Total_arrests'})
df_arrests.head()

In [None]:
# Preview the incident counts that are joined onto the arrest counts below
df_criminal_incidents.head()

In [None]:
# Join arrest counts with incident counts on the grouping keys.
# how='inner' keeps only combinations present in both frames, so combinations
# missing from df_arrests are dropped rather than kept with a zero.
df_arrest_model = df_arrests.merge(df_criminal_incidents, how='inner', on=['DayNight', 'Domestic', 'Frequency', 'Severity', 'Year', 'Month'])

In [None]:
# One indicator column per level of each categorical variable.
# dtype=int keeps the indicators numeric. Newer pandas defaults to bool dummies,
# which statsmodels OLS rejects once a float constant column is added.
df_model = pd.get_dummies(df_arrest_model, columns= ['DayNight','Domestic','Frequency','Severity'], dtype=int)
df_model.head()

In [None]:
# Pairwise correlations between the columns of df_model
df_model.corr()

In [None]:
# Drop Year and Month plus one indicator per dummy group.
# NOTE: this drops DayNight_Night (keeping DayNight_Day), whereas the
# incident-count model earlier in the notebook dropped DayNight_Day.
data_final = df_model.drop(['Year', 'Month', 'Domestic_Non Domestic', 'Frequency_High', 'Severity_High', 'DayNight_Night'] ,axis = 1)
data_final.head()

In [None]:

from sklearn.model_selection import train_test_split

# 70/30 train/test split of the arrest data. random_state=100 makes the split
# repeatable; np.random.seed(0) additionally seeds NumPy's global generator.
np.random.seed(0)
df_train, df_test = train_test_split(data_final, train_size = 0.7, test_size = 0.3, random_state = 100)

In [None]:
# Dividing the training data set into X and Y
# Select and drop instead of df_train.pop(...): pop removes the column from
# df_train in place, so re-running this cell raised a KeyError.
y_train = df_train['Total_arrests']
X_train = df_train.drop(columns='Total_arrests')

#Build a linear model

import statsmodels.api as sm
# statsmodels OLS has no implicit intercept; add the constant column explicitly
X_train_lm = sm.add_constant(X_train)

lr_1 = sm.OLS(y_train, X_train_lm).fit()

lr_1.summary()

In [None]:
# Remove DayNight_Day as well. DayNight_Night was already dropped when data_final
# was built, so data_final_1 has no DayNight indicator left.
data_final_1 = data_final.drop(['DayNight_Day'] ,axis = 1)
data_final_1.head()

In [None]:

from sklearn.model_selection import train_test_split

# 70/30 train/test split of the reduced feature set (data_final_1).
# random_state=100 makes the split repeatable; np.random.seed(0) additionally
# seeds NumPy's global generator.
np.random.seed(0)
df_train, df_test = train_test_split(data_final_1, train_size = 0.7, test_size = 0.3, random_state = 100)

In [None]:
# Dividing the training data set into X and Y
# Select and drop instead of df_train.pop(...): pop removes the column from
# df_train in place, so re-running this cell raised a KeyError.
y_train = df_train['Total_arrests']
X_train = df_train.drop(columns='Total_arrests')

#Build a linear model

import statsmodels.api as sm
# statsmodels OLS has no implicit intercept; add the constant column explicitly
X_train_lm = sm.add_constant(X_train)

lr_1 = sm.OLS(y_train, X_train_lm).fit()

lr_1.summary()

In [None]:
# Checking for the VIF values of the variables.
from statsmodels.stats.outliers_influence import variance_inflation_factor

# One VIF per feature column of X_train, in column order
vif_values = [
    variance_inflation_factor(X_train.values, column_position)
    for column_position in range(X_train.shape[1])
]

# Table of feature names and their VIFs (2 decimals), highest VIF first
vif = pd.DataFrame({'Features': X_train.columns, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
import seaborn as sns
y_train_predicted = lr_1.predict(X_train_lm)
# Plot the histogram of the error terms (training residuals)
fig = plt.figure()
# histplot with a KDE overlay on a density scale replaces the deprecated
# sns.distplot; histplot is already used earlier in this notebook.
sns.histplot((y_train - y_train_predicted), bins = 20, kde = True, stat = "density")
fig.suptitle('Error Terms', fontsize = 20)                  # Plot heading 
plt.xlabel('Errors', fontsize = 18)                         

In [None]:

from sklearn.model_selection import train_test_split

# Repeat the split of data_final_1 with the same arguments (random_state = 100)
# as the training cell above to obtain the split for evaluation.
np.random.seed(0)
df_train, df_test = train_test_split(data_final_1, train_size = 0.7, test_size = 0.3, random_state = 100)

In [None]:
# Dividing the TEST data set into X and Y.
# The previous version took both from df_train, so the "test" R² below was
# actually computed on the rows the model had been fitted to.
y_test = df_test['Total_arrests']
X_test = df_test.drop(columns='Total_arrests')

# Now let's use our model to make predictions.

# Align the test columns with the columns the model was trained on
X_test_new = X_test[X_train.columns]

# Adding a constant variable to the ALIGNED frame. The previous version passed
# X_test here, which discarded the column alignment done on the line above.
X_test_new = sm.add_constant(X_test_new)

# Making predictions
y_pred = lr_1.predict(X_test_new)

In [None]:
from sklearn.metrics import r2_score
# R² of the predictions against y_test
r2_score(y_true = y_test, y_pred = y_pred)

The R² value for the test data = 0.5019859072012338, which is pretty similar to the train data.