In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the dataset path
# used by this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import and Data Preparation

In [None]:
# CSV file that the notebook loads with pd.read_csv; located on the mounted shared drive.
path = "/content/drive/Shareddrives/DATA 245 - ML/cleaned_SF_crime_report.csv"

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import math

from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from pandas.io.formats.info import DataFrameTableBuilderVerbose

from sklearn.preprocessing import LabelEncoder


In [None]:
# Show every column whenever a frame is displayed.
pd.set_option('display.max_columns', None)

# Read the CSV and keep only its last 100,000 rows (all columns).
df = pd.read_csv(path).iloc[-100000:]

In [None]:
# Drop the identifier, description and report-related columns that the rest of
# the notebook does not use. The frame is modified in place.
df.drop(
    labels=[
        'Incident_Year',
        'Row_ID',
        'Incident_ID',
        'Incident_Number',
        'Report_Type_Description',
        'Filed_Online',
        'Incident_Code',
        'Incident_Subcategory',
        'Incident_Description',
        'Resolution',
        'Report_Datetime',
    ],
    axis='columns',
    inplace=True,
)
df

Unnamed: 0,Incident_Date,Incident_Time,Incident_Day_of_Week,Report_Type_Code,Incident_Category,CNN,Police_District,Analysis_Neighborhood,Supervisor_District,Latitude,Longitude
454712,2020/01/13,10:20,Monday,II,Larceny Theft,24904000.0,Central,Tenderloin,3.0,37.786410,-122.408036
454713,2019/07/20,23:30,Saturday,II,Larceny Theft,25793000.0,Mission,Castro/Upper Market,8.0,37.762551,-122.434062
454714,2020/01/19,05:00,Sunday,II,Larceny Theft,21876000.0,Ingleside,Noe Valley,8.0,37.742078,-122.426846
454715,2019/12/26,16:30,Thursday,II,Prostitution,20650000.0,Bayview,Portola,9.0,37.728835,-122.404100
454716,2019/09/30,07:00,Monday,II,Malicious Mischief,24142000.0,Mission,Mission,9.0,37.765183,-122.417487
...,...,...,...,...,...,...,...,...,...,...,...
554707,2021/01/15,20:00,Friday,II,Robbery,24944000.0,Tenderloin,Tenderloin,6.0,37.785893,-122.412148
554708,2020/08/01,17:34,Saturday,II,Larceny Theft,27500000.0,Richmond,Outer Richmond,1.0,37.780476,-122.476169
554709,2020/06/01,01:55,Monday,II,Burglary,23610000.0,Bayview,Potrero Hill,10.0,37.754000,-122.389860
554710,2020/10/24,17:00,Saturday,II,Larceny Theft,26042000.0,Northern,Western Addition,5.0,37.777490,-122.433219


In [None]:
# Separate the date and time strings into integer parts.
# Incident_Date is sliced by position as 'YYYY/MM/DD'.
df['Year'] = df['Incident_Date'].map(lambda date_text: int(date_text[:4]))
df['Month'] = df['Incident_Date'].map(lambda date_text: int(date_text[5:7]))
df['Day'] = df['Incident_Date'].map(lambda date_text: int(date_text[8:10]))

# Incident_Time is sliced by position as 'HH:MM'.
df['Hour'] = df['Incident_Time'].map(lambda time_text: int(time_text[:2]))
df['Min'] = df['Incident_Time'].map(lambda time_text: int(time_text[3:5]))
df

Unnamed: 0,Incident_Date,Incident_Time,Incident_Day_of_Week,Report_Type_Code,Incident_Category,CNN,Police_District,Analysis_Neighborhood,Supervisor_District,Latitude,Longitude,Year,Month,Day,Hour,Min
454712,2020/01/13,10:20,Monday,II,Larceny Theft,24904000.0,Central,Tenderloin,3.0,37.786410,-122.408036,2020,1,13,10,20
454713,2019/07/20,23:30,Saturday,II,Larceny Theft,25793000.0,Mission,Castro/Upper Market,8.0,37.762551,-122.434062,2019,7,20,23,30
454714,2020/01/19,05:00,Sunday,II,Larceny Theft,21876000.0,Ingleside,Noe Valley,8.0,37.742078,-122.426846,2020,1,19,5,0
454715,2019/12/26,16:30,Thursday,II,Prostitution,20650000.0,Bayview,Portola,9.0,37.728835,-122.404100,2019,12,26,16,30
454716,2019/09/30,07:00,Monday,II,Malicious Mischief,24142000.0,Mission,Mission,9.0,37.765183,-122.417487,2019,9,30,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
554707,2021/01/15,20:00,Friday,II,Robbery,24944000.0,Tenderloin,Tenderloin,6.0,37.785893,-122.412148,2021,1,15,20,0
554708,2020/08/01,17:34,Saturday,II,Larceny Theft,27500000.0,Richmond,Outer Richmond,1.0,37.780476,-122.476169,2020,8,1,17,34
554709,2020/06/01,01:55,Monday,II,Burglary,23610000.0,Bayview,Potrero Hill,10.0,37.754000,-122.389860,2020,6,1,1,55
554710,2020/10/24,17:00,Saturday,II,Larceny Theft,26042000.0,Northern,Western Addition,5.0,37.777490,-122.433219,2020,10,24,17,0


In [None]:
def time_of_day(hour):
    """Name the part of the day that an hour value falls into.

    Bands: [0, 4) 'Late Night', [4, 6) 'Dawn', [6, 12) 'Morning',
    [12, 16) 'Afternoon', [16, 20) 'Evening', [20, 23] 'Night'.

    Returns None when the value lies outside every band.
    """
    if 0 <= hour < 4:
        return 'Late Night'
    if 4 <= hour < 6:
        return 'Dawn'
    if 6 <= hour < 12:
        return 'Morning'
    if 12 <= hour < 16:
        return 'Afternoon'
    if 16 <= hour < 20:
        return 'Evening'
    # The last band is closed on the right: 23 is included, anything above is not.
    if 20 <= hour <= 23:
        return 'Night'
    return None

In [None]:
def business_hours(hours):
    """Return True when the hour is between 8 and 18, both ends included."""
    return hours >= 8 and hours <= 18

In [None]:
def third_of_month(day):
    """Label which third of the month a day number belongs to.

    Days 1-9 map to '1st_third', 10-19 to '2nd_third' and 20-31 to
    '3rd_third'. Any other value yields None.
    """
    if 1 <= day < 10:
        return '1st_third'
    if 10 <= day < 20:
        return '2nd_third'
    if 20 <= day <= 31:
        return '3rd_third'
    return None

In [None]:
def weekend(weekday):
    """Return True for the day names "Saturday" and "Sunday", False otherwise.

    The comparison is an exact, case-sensitive match on the day name.
    """
    return weekday in ("Saturday", "Sunday")

In [None]:
def quarter_of_year(month):
    """Label the quarter that a month number (1-12) falls into.

    Months 1-3 map to '1st_quarter', 4-6 to '2nd_quarter', 7-9 to
    '3rd_quarter' and 10-12 to '4th_quarter'. Values outside 1-12 yield None.
    """
    if 1 <= month <= 3:
        return '1st_quarter'
    if 3 < month <= 6:
        return '2nd_quarter'
    if 6 < month <= 9:
        return '3rd_quarter'
    if 9 < month <= 12:
        return '4th_quarter'
    return None

In [None]:

# Derive the categorical time features from the integer date/time parts and
# the weekday name, using the helper functions defined in the cells above.
df['Time_of_Day'] = df['Hour'].map(time_of_day)
df['Business Hours'] = df['Hour'].map(business_hours)
df['Third_of_Month'] = df['Day'].map(third_of_month)
df['Weekend'] = df['Incident_Day_of_Week'].map(weekend)
df['Quarter_of_Year'] = df['Month'].map(quarter_of_year)

# Incident_Date holds date-only text written with slashes (e.g. 2020/01/13),
# so the format string has to describe exactly that; a dash-separated format
# with a time part does not match the data.
df["Incident_Date"] = pd.to_datetime(df["Incident_Date"], format="%Y/%m/%d")

# Flag incidents that fall on a US federal holiday within the observed date range.
cal = calendar()
holidays = cal.holidays(start=df['Incident_Date'].min(), end=df['Incident_Date'].max())
# normalize() sets the time part to midnight while keeping the datetime dtype,
# so the values compare directly against the holiday timestamps.
df['Holiday'] = df['Incident_Date'].dt.normalize().isin(holidays)
df

Unnamed: 0,Incident_Date,Incident_Time,Incident_Day_of_Week,Report_Type_Code,Incident_Category,CNN,Police_District,Analysis_Neighborhood,Supervisor_District,Latitude,Longitude,Year,Month,Day,Hour,Min,Time_of_Day,Business Hours,Third_of_Month,Weekend,Quarter_of_Year,Holiday
454712,2020-01-13,10:20,Monday,II,Larceny Theft,24904000.0,Central,Tenderloin,3.0,37.786410,-122.408036,2020,1,13,10,20,Morning,True,2nd_third,False,1st_quarter,False
454713,2019-07-20,23:30,Saturday,II,Larceny Theft,25793000.0,Mission,Castro/Upper Market,8.0,37.762551,-122.434062,2019,7,20,23,30,Night,False,3rd_third,True,3rd_quarter,False
454714,2020-01-19,05:00,Sunday,II,Larceny Theft,21876000.0,Ingleside,Noe Valley,8.0,37.742078,-122.426846,2020,1,19,5,0,Dawn,False,2nd_third,True,1st_quarter,False
454715,2019-12-26,16:30,Thursday,II,Prostitution,20650000.0,Bayview,Portola,9.0,37.728835,-122.404100,2019,12,26,16,30,Evening,True,3rd_third,False,4th_quarter,False
454716,2019-09-30,07:00,Monday,II,Malicious Mischief,24142000.0,Mission,Mission,9.0,37.765183,-122.417487,2019,9,30,7,0,Morning,False,3rd_third,False,3rd_quarter,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
554707,2021-01-15,20:00,Friday,II,Robbery,24944000.0,Tenderloin,Tenderloin,6.0,37.785893,-122.412148,2021,1,15,20,0,Night,False,2nd_third,False,1st_quarter,False
554708,2020-08-01,17:34,Saturday,II,Larceny Theft,27500000.0,Richmond,Outer Richmond,1.0,37.780476,-122.476169,2020,8,1,17,34,Evening,True,1st_third,True,3rd_quarter,False
554709,2020-06-01,01:55,Monday,II,Burglary,23610000.0,Bayview,Potrero Hill,10.0,37.754000,-122.389860,2020,6,1,1,55,Late Night,False,1st_third,False,2nd_quarter,False
554710,2020-10-24,17:00,Saturday,II,Larceny Theft,26042000.0,Northern,Western Addition,5.0,37.777490,-122.433219,2020,10,24,17,0,Evening,True,3rd_third,True,4th_quarter,False


In [None]:
# Remove the raw date/time columns and their integer parts, together with CNN
# and Police_District. The frame is modified in place.
df.drop(
    labels=[
        'Year',
        'Month',
        'Day',
        'Hour',
        'Min',
        'Incident_Date',
        'Incident_Time',
        'CNN',
        'Police_District',
    ],
    axis='columns',
    inplace=True,
)
df

Unnamed: 0,Incident_Day_of_Week,Report_Type_Code,Incident_Category,Analysis_Neighborhood,Supervisor_District,Latitude,Longitude,Time_of_Day,Business Hours,Third_of_Month,Weekend,Quarter_of_Year,Holiday
454712,Monday,II,Larceny Theft,Tenderloin,3.0,37.786410,-122.408036,Morning,True,2nd_third,False,1st_quarter,False
454713,Saturday,II,Larceny Theft,Castro/Upper Market,8.0,37.762551,-122.434062,Night,False,3rd_third,True,3rd_quarter,False
454714,Sunday,II,Larceny Theft,Noe Valley,8.0,37.742078,-122.426846,Dawn,False,2nd_third,True,1st_quarter,False
454715,Thursday,II,Prostitution,Portola,9.0,37.728835,-122.404100,Evening,True,3rd_third,False,4th_quarter,False
454716,Monday,II,Malicious Mischief,Mission,9.0,37.765183,-122.417487,Morning,False,3rd_third,False,3rd_quarter,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
554707,Friday,II,Robbery,Tenderloin,6.0,37.785893,-122.412148,Night,False,2nd_third,False,1st_quarter,False
554708,Saturday,II,Larceny Theft,Outer Richmond,1.0,37.780476,-122.476169,Evening,True,1st_third,True,3rd_quarter,False
554709,Monday,II,Burglary,Potrero Hill,10.0,37.754000,-122.389860,Late Night,False,1st_third,False,2nd_quarter,False
554710,Saturday,II,Larceny Theft,Western Addition,5.0,37.777490,-122.433219,Evening,True,3rd_third,True,4th_quarter,False


In [None]:
# Reduce the categories: keep only the incident categories listed below.
# reset_index(drop=True) renumbers the rows from 0 and returns a new frame, so
# no temporary 'index' column is created and the filtered selection is never
# mutated in place.
df = df.loc[
    df['Incident_Category'].isin([
        'Larceny Theft',
        'Other Miscellaneous',
        'Malicious Mischief',
        'Assault',
        'Non-Criminal',
        'Burglary',
        'Motor Vehicle Theft',
        'Warrant',
        'Fraud',
        'Recovered Vehicle',
        'Lost Property',
    ])
].reset_index(drop=True)
df

Unnamed: 0,Incident_Day_of_Week,Report_Type_Code,Incident_Category,Analysis_Neighborhood,Supervisor_District,Latitude,Longitude,Time_of_Day,Business Hours,Third_of_Month,Weekend,Quarter_of_Year,Holiday
0,Monday,II,Malicious Mischief,Mission,9.0,37.765183,-122.417487,Morning,False,3rd_third,False,3rd_quarter,False
1,Tuesday,II,Malicious Mischief,Financial District/South Beach,3.0,37.786615,-122.406399,Evening,True,3rd_third,False,2nd_quarter,False
2,Friday,II,Non-Criminal,Russian Hill,3.0,37.795027,-122.421583,Night,False,1st_third,False,1st_quarter,False
3,Monday,VS,Motor Vehicle Theft,Mission,9.0,37.761836,-122.419359,Night,False,1st_third,False,4th_quarter,False
4,Tuesday,VS,Recovered Vehicle,Bayview Hunters Point,10.0,37.718487,-122.393545,Evening,True,3rd_third,False,3rd_quarter,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
53698,Thursday,II,Assault,Inner Sunset,7.0,37.750820,-122.468674,Afternoon,True,1st_third,False,4th_quarter,False
53699,Wednesday,II,Other Miscellaneous,Financial District/South Beach,6.0,37.784044,-122.403712,Afternoon,True,1st_third,False,3rd_quarter,False
53700,Tuesday,II,Assault,Marina,2.0,37.799788,-122.429290,Night,False,2nd_third,False,4th_quarter,False
53701,Saturday,II,Other Miscellaneous,Inner Richmond,1.0,37.777136,-122.466154,Evening,False,2nd_third,True,3rd_quarter,False


In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, not
# only the ones raised in this cell - consider narrowing or removing it.
warnings.filterwarnings("ignore")

# Columns to replace with integer codes. 'Time_of_Day' is kept last so that
# `enc` is still bound to the Time_of_Day encoder once the cell has run.
# NOTE(review): 'Report_Type_Code' is not encoded here, yet the recorded
# output of the next cell shows integer codes for it - that output appears to
# come from an earlier run; confirm which behaviour is intended.
columns_to_encode = [
    'Analysis_Neighborhood',
    'Business Hours',
    'Holiday',
    'Incident_Day_of_Week',
    'Quarter_of_Year',
    'Supervisor_District',
    'Third_of_Month',
    'Weekend',
    'Incident_Category',
    'Time_of_Day',
]

# One fitted encoder per column, so every integer code can be mapped back to
# its original label with label_encoders[column].inverse_transform(...).
label_encoders = {}
for column in columns_to_encode:
    enc = LabelEncoder()
    df[column] = enc.fit_transform(df[column])
    label_encoders[column] = enc

In [None]:
# Display the frame after the label-encoding cell.
df

Unnamed: 0,Incident_Day_of_Week,Report_Type_Code,Incident_Category,Analysis_Neighborhood,Supervisor_District,Latitude,Longitude,Time_of_Day,Business Hours,Third_of_Month,Weekend,Quarter_of_Year,Holiday
0,1,0,4,18,8,37.765183,-122.417487,4,0,2,0,2,0
1,5,0,4,5,2,37.786615,-122.406399,2,1,2,0,1,0
2,0,0,6,31,2,37.795027,-122.421583,5,0,0,0,0,0
3,1,3,5,18,8,37.761836,-122.419359,5,0,0,0,3,0
4,5,3,8,0,9,37.718487,-122.393545,2,1,2,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
53698,4,0,0,11,6,37.750820,-122.468674,0,1,0,0,3,0
53699,6,0,7,5,5,37.784044,-122.403712,0,1,0,0,2,0
53700,5,0,0,16,1,37.799788,-122.429290,5,0,1,0,3,0
53701,2,0,7,10,0,37.777136,-122.466154,2,0,1,1,2,0


# Preprocessing and Modelling

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, matthews_corrcoef

In [None]:
# Split the data: Report_Type_Code is the target, every other column is a feature.
y = df['Report_Type_Code']
X = df.drop(columns=['Report_Type_Code'])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train.shape, X_test.shape

((37592, 12), (16111, 12))

In [None]:
# # Scaling the data

# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [None]:
# Random Forest

# random_state fixes the forest's internal randomness so the printed scores are
# reproducible between runs (same seed as the train/test split).
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# scikit-learn metrics are documented as metric(y_true, y_pred).
print('RF', accuracy_score(y_test, y_pred), matthews_corrcoef(y_test, y_pred))

RF 0.8724473961889392 0.7109260308633679


In [None]:
# KNN

# k-nearest-neighbours classifier with scikit-learn's default settings.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# scikit-learn metrics are documented as metric(y_true, y_pred).
print('KNN', accuracy_score(y_test, y_pred), matthews_corrcoef(y_test, y_pred))

KNN 0.7770467382533672 0.4554876804801454


In [None]:
# Naive Bayes

# Gaussian Naive Bayes with scikit-learn's default settings.
nb = GaussianNB()
nb.fit(X_train, y_train)
# Predict with the Naive Bayes model itself; predicting with `rf` here would
# only reproduce the Random Forest scores under the 'NB' label.
y_pred = nb.predict(X_test)
# scikit-learn metrics are documented as metric(y_true, y_pred).
print('NB', accuracy_score(y_test, y_pred), matthews_corrcoef(y_test, y_pred))

NB 0.8724473961889392 0.7109260308633679


In [None]:
# SVC

# Support vector classifier with scikit-learn's default settings.
# NOTE(review): the recorded run reports an MCC of 0.0, which usually means
# the model predicted a single class. The features are fed in unscaled (the
# scaling cell is commented out) - confirm whether scaling should be enabled.
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
# scikit-learn metrics are documented as metric(y_true, y_pred).
print('SVC', accuracy_score(y_test, y_pred), matthews_corrcoef(y_test, y_pred))

SVC 0.7127428465023897 0.0
