In [43]:
import pandas as pd
from string import ascii_letters
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import tree

In [44]:
df = pd.read_csv('criminal_san_francisco.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,X,Y
0,0,0,2010-10-18 13:37:00,ASSAULT,BATTERY,Monday,SOUTHERN,NONE,-122.407376,37.779989
1,1,1,2013-01-11 03:21:00,ASSAULT,AGGRAVATED ASSAULT WITH A DEADLY WEAPON,Friday,TENDERLOIN,NONE,-122.412437,37.783486
2,2,2,2004-03-31 15:00:00,ASSAULT,BATTERY,Wednesday,INGLESIDE,JUVENILE ADMONISHED,-122.451164,37.745564
3,3,3,2005-02-27 22:33:00,ASSAULT,THREATS AGAINST LIFE,Sunday,INGLESIDE,NONE,-122.428614,37.722765
4,4,4,2013-06-30 17:39:00,ASSAULT,BATTERY,Sunday,MISSION,NONE,-122.418449,37.753027


In [45]:
df1 = df.copy()

In [46]:
# TASK 1
df=df.drop(['Unnamed: 0','Unnamed: 0.1','Descript','Dates'], axis=1)
df.head()

Unnamed: 0,Category,DayOfWeek,PdDistrict,Resolution,X,Y
0,ASSAULT,Monday,SOUTHERN,NONE,-122.407376,37.779989
1,ASSAULT,Friday,TENDERLOIN,NONE,-122.412437,37.783486
2,ASSAULT,Wednesday,INGLESIDE,JUVENILE ADMONISHED,-122.451164,37.745564
3,ASSAULT,Sunday,INGLESIDE,NONE,-122.428614,37.722765
4,ASSAULT,Sunday,MISSION,NONE,-122.418449,37.753027


In [47]:
from sklearn.preprocessing import LabelEncoder

def get_encoded_df(df, columns=None):
    if columns == None:
        columns = df.columns

    for col in columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

    return df

In [48]:
df_encoded = get_encoded_df(df)
df_encoded['X']= df1['X']
df_encoded['Y']= df1['Y']
df_encoded

Unnamed: 0,Category,DayOfWeek,PdDistrict,Resolution,X,Y
0,0,1,7,11,-122.407376,37.779989
1,0,0,9,11,-122.412437,37.783486
2,0,6,2,6,-122.451164,37.745564
3,0,3,2,11,-122.428614,37.722765
4,0,3,3,11,-122.418449,37.753027
...,...,...,...,...,...,...
89995,8,2,8,0,-122.451157,37.716582
89996,8,5,1,0,-122.407562,37.798678
89997,8,6,2,0,-122.417108,37.712256
89998,8,6,3,0,-122.420010,37.770110


Here we will use classifier decision tree algorithm as the dependent column (i.e. Category) is a categorical in nature.

In [49]:
X = df_encoded.iloc[:,1:6]
Y = df_encoded.iloc[:,0:1]

X_train, X_test, Y_train, Y_test = \
train_test_split(X, Y, test_size=0.1, random_state=11)

In [50]:
clf = tree.DecisionTreeClassifier().fit(X_train, Y_train)
print(clf.score(X_test, Y_test))

0.33166666666666667


In [51]:
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"], "max_iter":[10,50,100,500]}
grid = {'criterion':['gini', 'entropy'],'max_depth': [2,4,6,8,10,12]}
clf = tree.DecisionTreeClassifier()
clf_cv=GridSearchCV(clf,grid, cv=5)
clf_cv.fit(X_train,Y_train)
print("tuned hpyerparameters :(best parameters) ",clf_cv.best_params_)

tuned hpyerparameters :(best parameters)  {'criterion': 'gini', 'max_depth': 12}


In [52]:
clf = tree.DecisionTreeClassifier(criterion='gini', max_depth=12).fit(X_train, Y_train)
print(clf.score(X_test, Y_test))

0.3701111111111111


In [54]:
kf = KFold(n_splits=5)
scores = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"], "max_iter":[10,50,100,500]}
    grid = {'criterion':['gini', 'entropy'],'max_depth': [2,4,6,8,10,12]}
    clf = tree.DecisionTreeClassifier()

    clf_cv=GridSearchCV(clf,grid, cv=5)
    clf_cv.fit(X_train,Y_train)
    print("tuned hpyerparameters :(best parameters) ",clf_cv.best_params_)
    clf = tree.DecisionTreeClassifier(criterion=clf_cv.best_params_['criterion'], max_depth=clf_cv.best_params_['max_depth']).fit(X_train, Y_train)
    s = clf.score(X_test, Y_test)
    print(s)
    scores.append(s)

tuned hpyerparameters :(best parameters)  {'criterion': 'gini', 'max_depth': 12}
0.028166666666666666
tuned hpyerparameters :(best parameters)  {'criterion': 'gini', 'max_depth': 10}
0.2546111111111111
tuned hpyerparameters :(best parameters)  {'criterion': 'entropy', 'max_depth': 10}
0.15055555555555555
tuned hpyerparameters :(best parameters)  {'criterion': 'gini', 'max_depth': 12}
0.051611111111111115
tuned hpyerparameters :(best parameters)  {'criterion': 'entropy', 'max_depth': 12}
0.0022222222222222222


In [55]:
skf = StratifiedKFold(n_splits=5)
scores = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"], "max_iter":[10,50,100,500]}
    
    grid = {'criterion':['gini', 'entropy'],'max_depth': [2,4,6,8,10,12]}
    clf = tree.DecisionTreeClassifier()

    clf_cv=GridSearchCV(clf,grid)
    clf_cv.fit(X_train,Y_train)
    print("tuned hpyerparameters :(best parameters) ",clf_cv.best_params_)
    clf = tree.DecisionTreeClassifier(criterion=clf_cv.best_params_['criterion'], max_depth=clf_cv.best_params_['max_depth']).fit(X_train, Y_train)
    s = clf.score(X_test, Y_test)
    print(s)
    scores.append(s)

tuned hpyerparameters :(best parameters)  {'criterion': 'gini', 'max_depth': 12}
0.028055555555555556
tuned hpyerparameters :(best parameters)  {'criterion': 'gini', 'max_depth': 10}
0.2545
tuned hpyerparameters :(best parameters)  {'criterion': 'entropy', 'max_depth': 10}
0.15055555555555555
tuned hpyerparameters :(best parameters)  {'criterion': 'gini', 'max_depth': 12}
0.051111111111111114
tuned hpyerparameters :(best parameters)  {'criterion': 'entropy', 'max_depth': 12}
0.002277777777777778
