# Crime classification in San Francisco

In [1]:
# Run these to install package with pip for the right python install

import sys
#!{sys.executable} -m pip install seaborn

In [2]:
import pandas as pd
import seaborn as sns
#%matplotlib nbagg
import matplotlib.pyplot as plt
import numpy as np

def log_loss(p_pred,y):
    y_true=np.array(y)
    loss=0
    for i in range(len(p_pred)):
        if p_pred.shape[1]<=y_true[i]:
            loss-=np.log(1e-15)
        else:
            loss-=np.log(p_pred[i,y_true[i]]+1e-15)
    return loss/len(p_pred)

sns.set_theme()
plt.figure(figsize=(15,10))

<Figure size 1080x720 with 0 Axes>

<Figure size 1080x720 with 0 Axes>

In [3]:
#Read data

df = pd.read_csv('data/train.csv')
print('data loaded')

data loaded


In [4]:
#Get categories (output to find)

cat = df.Category.unique()
print(cat)
list_cat=np.copy(cat)
df.drop(['Descript','Resolution','Address','PdDistrict'],inplace =True, axis=1)  #data useless, unsignificant, estimated useless
df.drop(df[(df.Y>60)|(df.X>-122)].index, inplace=True)  #outliers
df['DayOfWeek'].replace(to_replace=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'],value=[i for i in range(0,7)],inplace=True)


df.head()

['WARRANTS' 'OTHER OFFENSES' 'LARCENY/THEFT' 'VEHICLE THEFT' 'VANDALISM'
 'NON-CRIMINAL' 'ROBBERY' 'ASSAULT' 'WEAPON LAWS' 'BURGLARY'
 'SUSPICIOUS OCC' 'DRUNKENNESS' 'FORGERY/COUNTERFEITING' 'DRUG/NARCOTIC'
 'STOLEN PROPERTY' 'SECONDARY CODES' 'TRESPASS' 'MISSING PERSON' 'FRAUD'
 'KIDNAPPING' 'RUNAWAY' 'DRIVING UNDER THE INFLUENCE'
 'SEX OFFENSES FORCIBLE' 'PROSTITUTION' 'DISORDERLY CONDUCT' 'ARSON'
 'FAMILY OFFENSES' 'LIQUOR LAWS' 'BRIBERY' 'EMBEZZLEMENT' 'SUICIDE'
 'LOITERING' 'SEX OFFENSES NON FORCIBLE' 'EXTORTION' 'GAMBLING'
 'BAD CHECKS' 'TREA' 'RECOVERED VEHICLE' 'PORNOGRAPHY/OBSCENE MAT']


Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,X,Y
0,2015-05-13 23:53:00,WARRANTS,Wednesday,NORTHERN,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,Wednesday,NORTHERN,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,Wednesday,NORTHERN,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,NORTHERN,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,PARK,-122.438738,37.771541


Unnamed: 0,Dates,Category,DayOfWeek,X,Y
0,2015-05-13 23:53:00,WARRANTS,2,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,2,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,2,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,2,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,2,-122.438738,37.771541


In [6]:
#Format time data

df.Dates = pd.to_datetime(df.Dates)
df['Time'] = df.Dates.dt.hour*3600 + df.Dates.dt.minute*60 + df.Dates.dt.second
df['Hour'] = df.Dates.dt.hour
df['Month']=df.Dates.dt.month
df['Year']=df.Dates.dt.year
for key in ['Time','Hour','Month','Year']:
    df[key]=pd.to_numeric(df[key])


df['Category'].replace(to_replace=cat,value=[i for i in range(len(cat))],inplace=True)
df.head()

Unnamed: 0,Dates,Category,DayOfWeek,X,Y,Time,Hour,Month,Year
0,2015-05-13 23:53:00,0,2,-122.425892,37.774599,85980,23,5,2015
1,2015-05-13 23:53:00,1,2,-122.425892,37.774599,85980,23,5,2015
2,2015-05-13 23:33:00,1,2,-122.424363,37.800414,84780,23,5,2015
3,2015-05-13 23:30:00,2,2,-122.426995,37.800873,84600,23,5,2015
4,2015-05-13 23:30:00,2,2,-122.438738,37.771541,84600,23,5,2015


In [7]:
#Cell to run if you want to have cyclic data

df['Sin_Time'] = np.sin(2 * np.pi * df['Time'] / (24*60*60))  # cycle rpz to explore 
df['Cos_Time'] = np.cos(2 * np.pi * df['Time'] / (24*60*60))

df['Sin_Hour'] = np.sin(2 * np.pi * df['Hour'] / 24)  # cycle rpz to explore 
df['Cos_Hour'] = np.cos(2 * np.pi * df['Hour'] / 24)

df['Sin_Month'] = np.sin(2 * np.pi * df['Month'] / 12)  # cycle rpz to explore 
df['Cos_Month'] = np.cos(2 * np.pi * df['Month'] / 12) 

df['Sin_Day'] = np.sin(2 * np.pi * df['DayOfWeek'] / 7)  # cycle rpz to explore 
df['Cos_Day'] = np.cos(2 * np.pi * df['DayOfWeek'] / 7) 

df.drop(['Time','Hour','Month','DayOfWeek'],inplace =True, axis=1)

In [8]:
dfsample = df.sample(n=10000)
ysample=dfsample['Category']
ysample.head()
dfsample.drop('Category',inplace =True, axis=1)
dfsample.drop('Dates',inplace =True, axis=1)
dfsample.head()

Unnamed: 0,X,Y,Year,Sin_Time,Cos_Time,Sin_Hour,Cos_Hour,Sin_Month,Cos_Month,Sin_Day,Cos_Day
260645,-122.403434,37.787643,2011,-0.566406,-0.824126,-0.5,-0.866025,-0.5,0.8660254,0.433884,-0.900969
860688,-122.443479,37.798791,2003,-0.766044,-0.642788,-0.707107,-0.707107,1.0,6.123234000000001e-17,-0.781831,0.62349
226224,-122.386528,37.717395,2012,-0.224951,0.97437,-0.258819,0.965926,0.5,-0.8660254,0.433884,-0.900969
244442,-122.416775,37.763549,2012,0.480989,0.876727,0.258819,0.965926,0.5,0.8660254,-0.974928,-0.222521
213718,-122.407933,37.781506,2012,-0.117537,0.993068,-0.258819,0.965926,-0.5,-0.8660254,0.781831,0.62349


In [9]:
# Load Library
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

#X=(dfsample-dfsample.mean())/dfsample.std()
X=dfsample
y=ysample

# Split the training test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#  Fit a Random Forest model
clf = RandomForestClassifier(n_estimators=100,min_samples_split=50,random_state=42)
#clf = MLPClassifier(hidden_layer_sizes=(100),solver='sgd',learning_rate='invscaling',random_state=1)
#clf=KNeighborsClassifier(n_neighbors=1000)
clf.fit(X_train, y_train)
p_pred = clf.predict_proba(X_test)
print(p_pred.shape)


print(log_loss(p_pred,y_test))
print(accuracy_score(y_test, np.argmax(p_pred,axis=1)))


(2000, 35)
2.7144228232570238
0.243


In [10]:
test_data = pd.read_csv('data/test.csv')
print('test data loaded')
test_data.drop(['Address'],inplace =True, axis=1)  #data useless, unsignificant
test_data['DayOfWeek'].replace(to_replace=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'],value=[i for i in range(0,7)],inplace=True)
test_data.drop('PdDistrict',inplace =True, axis=1) # estimated  useless

test_data.Dates = pd.to_datetime(test_data.Dates)
test_data['Time'] = test_data.Dates.dt.hour*3600 + test_data.Dates.dt.minute*60 + test_data.Dates.dt.second
test_data['Hour'] = test_data.Dates.dt.hour
test_data['Month']=test_data.Dates.dt.month
test_data['Year']=test_data.Dates.dt.year
for key in ['Time','Hour','Month','Year']:
    test_data[key]=pd.to_numeric(test_data[key])


test_data['Sin_Time'] = np.sin(2 * np.pi * test_data['Time'] / (24*60*60))  # cycle rpz to explore 
test_data['Cos_Time'] = np.cos(2 * np.pi * test_data['Time'] / (24*60*60))

test_data['Sin_Hour'] = np.sin(2 * np.pi * test_data['Hour'] / 24)  # cycle rpz to explore 
test_data['Cos_Hour'] = np.cos(2 * np.pi * test_data['Hour'] / 24)

test_data['Sin_Month'] = np.sin(2 * np.pi * test_data['Month'] / 12)  # cycle rpz to explore 
test_data['Cos_Month'] = np.cos(2 * np.pi * test_data['Month'] / 12) 

test_data['Sin_Day'] = np.sin(2 * np.pi * test_data['DayOfWeek'] / 7)  # cycle rpz to explore 
test_data['Cos_Day'] = np.cos(2 * np.pi * test_data['DayOfWeek'] / 7) 

test_data.drop(['Id','Time','Hour','Month','DayOfWeek','Dates'],inplace =True, axis=1)

test_data.head()

test data loaded


Unnamed: 0,X,Y,Year,Sin_Time,Cos_Time,Sin_Hour,Cos_Hour,Sin_Month,Cos_Month,Sin_Day,Cos_Day
0,-122.399588,37.735051,2015,-0.004363,0.99999,-0.258819,0.965926,0.5,-0.866025,-0.781831,0.62349
1,-122.391523,37.732432,2015,-0.03926,0.999229,-0.258819,0.965926,0.5,-0.866025,-0.781831,0.62349
2,-122.426002,37.792212,2015,-0.043619,0.999048,-0.258819,0.965926,0.5,-0.866025,-0.781831,0.62349
3,-122.437394,37.721412,2015,-0.065403,0.997859,-0.258819,0.965926,0.5,-0.866025,-0.781831,0.62349
4,-122.437394,37.721412,2015,-0.065403,0.997859,-0.258819,0.965926,0.5,-0.866025,-0.781831,0.62349


In [11]:
list_cat=list(list_cat)
p_pred = clf.predict_proba(test_data)
if p_pred.shape[1]<len(list_cat):
    p_pred=np.concatenate((p_pred,np.zeros((len(p_pred),len(list_cat)-p_pred.shape[1]))),axis=1)

pred= pd.DataFrame (p_pred, columns = list_cat)
pred.insert(0,'Id',np.arange(len(p_pred)))

pred.head()
pred.to_csv("pred2.csv")

In [12]:
pred.head()

Unnamed: 0,Id,WARRANTS,OTHER OFFENSES,LARCENY/THEFT,VEHICLE THEFT,VANDALISM,NON-CRIMINAL,ROBBERY,ASSAULT,WEAPON LAWS,...,EMBEZZLEMENT,SUICIDE,LOITERING,SEX OFFENSES NON FORCIBLE,EXTORTION,GAMBLING,BAD CHECKS,TREA,RECOVERED VEHICLE,PORNOGRAPHY/OBSCENE MAT
0,0,0.02205,0.166092,0.173621,0.05956,0.050216,0.098891,0.055442,0.109069,0.020842,...,0.00348,0.0,0.0,0.0,0.001046,0.001413,0.0,0.0,0.0,0.0
1,1,0.028827,0.178132,0.127937,0.069167,0.06638,0.074242,0.069628,0.10376,0.04925,...,0.001639,0.0,0.000141,0.0,0.000323,0.002112,0.0,0.0,0.0,0.0
2,2,0.023732,0.103492,0.305443,0.052267,0.067765,0.12523,0.018068,0.0629,0.014638,...,0.000189,0.0,0.000308,0.0,0.000143,0.000339,0.0,0.0,0.0,0.0
3,3,0.034211,0.195192,0.150438,0.073916,0.049339,0.129216,0.035899,0.10875,0.014931,...,0.00143,0.0,0.000326,0.000156,0.00037,0.000709,0.0,0.0,0.0,0.0
4,4,0.034211,0.195192,0.150438,0.073916,0.049339,0.129216,0.035899,0.10875,0.014931,...,0.00143,0.0,0.000326,0.000156,0.00037,0.000709,0.0,0.0,0.0,0.0
