# Part 2: Decision Tree Baseline

### Libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix

### Import the Dataset

In [2]:
data = pd.read_csv('Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv')

### Selecting Crimes and Grabbing Equal Number of Examples

In [3]:
burglary = data[data['Category'].isin(['BURGLARY'])].reset_index(drop=True)
burglary = burglary.sample(n=10000).reset_index(drop=True) 

fraud = data[data['Category'].isin(['FRAUD'])].reset_index(drop=True)
fraud = fraud.sample(n=10000).reset_index(drop=True)

data = pd.concat([burglary, fraud], axis=0)
data = data.sample(frac=1).reset_index(drop=True)

### Creating Month of the Year Column

In [4]:
data['MonthOfYear'] = data['Date'].str.strip().str[0:2]

### Creating Hour of the Day Column

In [5]:
data['HourOfDay'] = data['Time'].str.strip().str[0:2]

 ### Creating the DataFrame

In [6]:
data = data[['Category', 'PdDistrict', 'HourOfDay', 'DayOfWeek', 'MonthOfYear']].copy()

### Turn Attributes to Integer Type

In [7]:
#District
district = pd.get_dummies(data.PdDistrict)
data = pd.concat([data, district], axis=1)
data.drop('PdDistrict', axis=1, inplace=True)

#Day of Week
dayofweek = pd.DataFrame(pd.Categorical(data.DayOfWeek, categories=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"], 
                                        ordered=True).codes, columns=['Day'])
data = pd.concat([data, dayofweek], axis=1)
data.drop('DayOfWeek', axis=1, inplace=True)

#Hour of Day
data['HourOfDay'] = data['HourOfDay'].astype(int)

#Month of Year
data['MonthOfYear'] = data['MonthOfYear'].astype(int)

### Splitting Data

In [8]:
train, test = train_test_split(data, test_size=0.2)

x_train = train.iloc[:, 1:].values
y_train = train.iloc[:, 0].values
x_test = test.iloc[:, 1:].values
y_test = test.iloc[:, 0].values

### Decision Tree

In [9]:
dtc = DecisionTreeClassifier() 
dtc = dtc.fit(x_train, y_train)

### Prediction

In [10]:
predictions = dtc.predict(x_test)

### Metrics

In [11]:
print(confusion_matrix(y_test,predictions))

[[1268  781]
 [ 964  987]]


In [12]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

    BURGLARY       0.57      0.62      0.59      2049
       FRAUD       0.56      0.51      0.53      1951

    accuracy                           0.56      4000
   macro avg       0.56      0.56      0.56      4000
weighted avg       0.56      0.56      0.56      4000



### Random Forest

In [13]:
rf = RandomForestClassifier(n_estimators=1000)
rf.fit(x_train, y_train)

RandomForestClassifier(n_estimators=1000)

### Prediction

In [14]:
predictions = rf.predict(x_test)

### Metrics

In [15]:
print(confusion_matrix(y_test,predictions))

[[1200  849]
 [ 833 1118]]


In [16]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

    BURGLARY       0.59      0.59      0.59      2049
       FRAUD       0.57      0.57      0.57      1951

    accuracy                           0.58      4000
   macro avg       0.58      0.58      0.58      4000
weighted avg       0.58      0.58      0.58      4000

