# Part 3: Beyond the Baseline with Weather

### Libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix

### Import Data

In [2]:
data = pd.read_csv('Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv')

### Selecting Crimes 

In [3]:
crimes = ['BURGLARY', 'FRAUD']
data = data[data['Category'].isin(crimes)].reset_index(drop=True)

### Creating Datetime

In [4]:
data = data[['Category', 'PdDistrict', 'Time', 'DayOfWeek', 'Date']].copy()
data['date'] = data['Date'].str.strip().str[-4:] + '-' + data['Date'].str.strip().str[0:2] + '-' + data['Date'].str.strip().str[3:5] + 'T' + data['Time'].str.strip().str[0:2] + ':00:00.000Z' 

### Import Weather Data

In [5]:
url = 'https://raw.githubusercontent.com/suneman/socialdata2021/master/files/weather_data.csv'
weather = pd.read_csv(url, error_bad_lines=False)

### Creating Weather Data

In [6]:
weather_data = pd.merge(data, weather, on='date', how='left')
weather_data.dropna(axis=0, inplace=True)
weather_data.reset_index(drop=True, inplace=True)

### Grabbing Equal Number of Examples

In [7]:
burglary = weather_data[weather_data['Category'].isin(['BURGLARY'])].reset_index(drop=True)
burglary = burglary.sample(n=10000).reset_index(drop=True) 

fraud = weather_data[weather_data['Category'].isin(['FRAUD'])].reset_index(drop=True)
fraud = fraud.sample(n=10000).reset_index(drop=True)

weather_data = pd.concat([burglary, fraud], axis=0)
weather_data = weather_data.sample(frac=1).reset_index(drop=True)

### Creating Month of the Year Column

In [8]:
weather_data['MonthOfYear'] = weather_data['Date'].str.strip().str[0:2]

### Creating Hour of the Day Column

In [9]:
weather_data['HourOfDay'] = weather_data['Time'].str.strip().str[0:2]

### Dropping Attributes

In [10]:
weather_data.drop(['Time', 'Date','date'], axis=1, inplace=True)

### Turn Attributes to Integer Type

In [11]:
#District
district = pd.get_dummies(weather_data.PdDistrict)
weather_data = pd.concat([weather_data, district], axis=1)
weather_data.drop('PdDistrict', axis=1, inplace=True)

#Weather
weather = pd.get_dummies(weather_data.weather)
weather_data = pd.concat([weather_data, weather], axis=1)
weather_data.drop('weather', axis=1, inplace=True)

#Day of Week
dayofweek = pd.DataFrame(pd.Categorical(weather_data.DayOfWeek, categories=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"], 
               ordered=True).codes, columns=['Day'])
weather_data = pd.concat([weather_data, dayofweek], axis=1)
weather_data.drop('DayOfWeek', axis=1, inplace=True)

#Hour of Day
weather_data['HourOfDay'] = weather_data['HourOfDay'].astype(int)

#Month of Year
weather_data['MonthOfYear'] = weather_data['MonthOfYear'].astype(int)

### Splitting Data

In [12]:
train, test = train_test_split(weather_data, test_size=0.2)

x_train = train.iloc[:, 1:].values
y_train = train.iloc[:, 0].values
x_test = test.iloc[:, 1:].values
y_test = test.iloc[:, 0].values

### Decision Tree

In [13]:
dtc = DecisionTreeClassifier() 
dtc = dtc.fit(x_train, y_train)

### Prediction

In [14]:
predictions = dtc.predict(x_test)

### Metrics

In [15]:
print(confusion_matrix(y_test,predictions))

[[1086  910]
 [ 867 1137]]


In [16]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

    BURGLARY       0.56      0.54      0.55      1996
       FRAUD       0.56      0.57      0.56      2004

    accuracy                           0.56      4000
   macro avg       0.56      0.56      0.56      4000
weighted avg       0.56      0.56      0.56      4000



### Random Forest

In [17]:
rf = RandomForestClassifier(n_estimators=1000)
rf.fit(x_train, y_train)

RandomForestClassifier(n_estimators=1000)

### Prediction

In [18]:
predictions = rf.predict(x_test)

### Metrics

In [19]:
print(confusion_matrix(y_test,predictions))

[[1205  791]
 [ 742 1262]]


In [20]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

    BURGLARY       0.62      0.60      0.61      1996
       FRAUD       0.61      0.63      0.62      2004

    accuracy                           0.62      4000
   macro avg       0.62      0.62      0.62      4000
weighted avg       0.62      0.62      0.62      4000

