In [29]:
import pandas as pd
import numpy as np
import folium
from folium import plugins
from folium.plugins import MarkerCluster
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pickle


In [17]:
df = pd.read_csv('train.csv')
df["Dates"] = pd.to_datetime(df["Dates"], format="%Y-%m-%d %H:%M:%S")
df["Hour"] = df["Dates"].map(lambda x: x.hour)
df["Month"] = df["Dates"].map(lambda x: x.month)
df["Year"] = df["Dates"].map(lambda x: x.year)
df.drop("Address", axis=1, inplace=True)
df.drop("Descript", axis=1, inplace=True)
df.drop("Resolution", axis=1, inplace=True)
df.drop("Dates", axis=1, inplace=True)
df.drop("X", axis=1, inplace=True)
df.drop("Y", axis=1, inplace=True)
df


Unnamed: 0,Category,DayOfWeek,PdDistrict,Hour,Month,Year
0,WARRANTS,Wednesday,NORTHERN,23,5,2015
1,OTHER OFFENSES,Wednesday,NORTHERN,23,5,2015
2,OTHER OFFENSES,Wednesday,NORTHERN,23,5,2015
3,LARCENY/THEFT,Wednesday,NORTHERN,23,5,2015
4,LARCENY/THEFT,Wednesday,PARK,23,5,2015
...,...,...,...,...,...,...
878044,ROBBERY,Monday,TARAVAL,0,1,2003
878045,LARCENY/THEFT,Monday,INGLESIDE,0,1,2003
878046,LARCENY/THEFT,Monday,SOUTHERN,0,1,2003
878047,VANDALISM,Monday,SOUTHERN,0,1,2003


In [18]:
df.isnull().sum()

Category      0
DayOfWeek     0
PdDistrict    0
Hour          0
Month         0
Year          0
dtype: int64

In [19]:
def encode_labels(dfx):
    factor = pd.factorize(dfx['Category'])
    return factor

factor = encode_labels(df)
Ytrain = factor[0]
labels= list(factor[1])
labels

['WARRANTS',
 'OTHER OFFENSES',
 'LARCENY/THEFT',
 'VEHICLE THEFT',
 'VANDALISM',
 'NON-CRIMINAL',
 'ROBBERY',
 'ASSAULT',
 'WEAPON LAWS',
 'BURGLARY',
 'SUSPICIOUS OCC',
 'DRUNKENNESS',
 'FORGERY/COUNTERFEITING',
 'DRUG/NARCOTIC',
 'STOLEN PROPERTY',
 'SECONDARY CODES',
 'TRESPASS',
 'MISSING PERSON',
 'FRAUD',
 'KIDNAPPING',
 'RUNAWAY',
 'DRIVING UNDER THE INFLUENCE',
 'SEX OFFENSES FORCIBLE',
 'PROSTITUTION',
 'DISORDERLY CONDUCT',
 'ARSON',
 'FAMILY OFFENSES',
 'LIQUOR LAWS',
 'BRIBERY',
 'EMBEZZLEMENT',
 'SUICIDE',
 'LOITERING',
 'SEX OFFENSES NON FORCIBLE',
 'EXTORTION',
 'GAMBLING',
 'BAD CHECKS',
 'TREA',
 'RECOVERED VEHICLE',
 'PORNOGRAPHY/OBSCENE MAT']

In [20]:
df.Category = list(factor[0])
df

Unnamed: 0,Category,DayOfWeek,PdDistrict,Hour,Month,Year
0,0,Wednesday,NORTHERN,23,5,2015
1,1,Wednesday,NORTHERN,23,5,2015
2,1,Wednesday,NORTHERN,23,5,2015
3,2,Wednesday,NORTHERN,23,5,2015
4,2,Wednesday,PARK,23,5,2015
...,...,...,...,...,...,...
878044,6,Monday,TARAVAL,0,1,2003
878045,2,Monday,INGLESIDE,0,1,2003
878046,2,Monday,SOUTHERN,0,1,2003
878047,4,Monday,SOUTHERN,0,1,2003


In [21]:
Xtrain = pd.get_dummies(df[['DayOfWeek', 'PdDistrict']])
Xtrain


Unnamed: 0,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
878044,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
878045,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
878046,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
878047,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [23]:
Xtrain = Xtrain.join(df.Hour).join(df.Month).join(df.Year).join(df.Category)
Xtrain.columns

Index(['DayOfWeek_Friday', 'DayOfWeek_Monday', 'DayOfWeek_Saturday',
       'DayOfWeek_Sunday', 'DayOfWeek_Thursday', 'DayOfWeek_Tuesday',
       'DayOfWeek_Wednesday', 'PdDistrict_BAYVIEW', 'PdDistrict_CENTRAL',
       'PdDistrict_INGLESIDE', 'PdDistrict_MISSION', 'PdDistrict_NORTHERN',
       'PdDistrict_PARK', 'PdDistrict_RICHMOND', 'PdDistrict_SOUTHERN',
       'PdDistrict_TARAVAL', 'PdDistrict_TENDERLOIN', 'Hour', 'Month', 'Year',
       'Category'],
      dtype='object')

In [24]:
x_train, x_test, y_train, y_test = train_test_split(Xtrain, Ytrain, test_size=0.33, random_state=42)

x_train, x_test, y_train, y_test

(        DayOfWeek_Friday  DayOfWeek_Monday  DayOfWeek_Saturday  \
 617510                 0                 0                   0   
 451491                 0                 0                   0   
 354809                 0                 0                   0   
 329161                 1                 0                   0   
 798989                 0                 0                   1   
 ...                  ...               ...                 ...   
 259178                 0                 1                   0   
 365838                 1                 0                   0   
 131932                 0                 0                   1   
 671155                 1                 0                   0   
 121958                 0                 1                   0   
 
         DayOfWeek_Sunday  DayOfWeek_Thursday  DayOfWeek_Tuesday  \
 617510                 0                   1                  0   
 451491                 0                   0             

In [25]:
rf = RandomForestClassifier(n_estimators=200, random_state=0, max_features=4)
rf.fit(x_train,y_train)
p_tr = rf.predict(Xtrain)
print("Score train -- ", round(rf.score(x_train, y_train) * 100, 2), " %")


Score train --  100.0  %


In [26]:
p_test = rf.predict(x_test)
print("Score test -- ", round(rf.score(x_test, y_test) * 100, 2), " %")

Score test --  94.65  %


In [31]:
filename = 'finalized_model.sav'
pickle.dump(rf, open(filename, 'wb'))