In [69]:
import numpy as np
import pandas as pd

from IPython.display import display
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set()
import folium
import calendar
import os 
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")

In [30]:
""" Data : !=2018 """
# The incident export lives in the sibling ``Datasets`` directory; the path is
# resolved against the current working directory.
fileName = 'Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv'
filePath = os.path.abspath(os.path.join('..', 'Datasets', fileName))

# Read the reports, derive the incident year from the ``Date`` column and keep
# every year except 2018.
Data = pd.read_csv(filePath).assign(
    Year=lambda reports: pd.to_datetime(reports['Date']).dt.year
)
Data = Data.loc[Data['Year'].ne(2018)]

# """ data : Data in fc """
# focuscrimes = set(['WEAPON LAWS', 'PROSTITUTION', 'DRIVING UNDER THE INFLUENCE', 'ROBBERY', 'BURGLARY', 'ASSAULT', 'DRUNKENNESS', 'DRUG/NARCOTIC', 'TRESPASS', 'LARCENY/THEFT', 'VANDALISM', 'VEHICLE THEFT', 'STOLEN PROPERTY', 'DISORDERLY CONDUCT'])
# focuscrimes_lst = list(focuscrimes)
# focuscrimes_lst.sort()
# data = Data[Data['Category'].isin(focuscrimes_lst)]

In [31]:
# Data['Category'].value_counts()

Let's start:

In [32]:
# Keep only the two crime categories compared in this exercise.  The explicit
# .copy() makes `data` an independent frame, so the column assignments below
# do not write into a slice of `Data`.
data = Data[Data['Category'].isin(['BURGLARY', 'FORGERY/COUNTERFEITING'])].copy()

# Build an hourly, timezone-aware timestamp for every incident: concatenate the
# Date and Time strings, parse them, round to the nearest hour and attach the
# "Etc/GMT-7" zone.  The work is done on whole columns; the earlier row-wise
# .apply called pd.to_datetime once per row and was very slow.
# NOTE(review): the POSIX-style name "Etc/GMT-7" denotes UTC+07:00, not
# UTC-07:00.  It is kept because the weather timestamps are converted to the
# same zone before the two tables are merged -- confirm the intended offset
# before changing either side.
data["datetime"] = (
    pd.to_datetime(data["Date"] + " " + data["Time"])
    .dt.round("H")
    .dt.tz_localize("Etc/GMT-7")
)

# Convert the Date strings to datetimes (done after the concatenation above,
# which needs the original strings), then split the frame by category.
data['Date'] = pd.to_datetime(data["Date"])
burglary = data[data['Category'] == "BURGLARY"]
forgery = data[data['Category'] == "FORGERY/COUNTERFEITING"]

print("Dimmentionality of the data before:")
print("Burglary", burglary.shape)
print("Forgery", forgery.shape)

Dimmentionality of the data before:
Burglary (88971, 37)
Forgery (22800, 37)


In [33]:
# Down-sample each category to 10,000 rows, drawn without replacement and with
# a fixed seed, so both classes are equally represented and the draw is
# reproducible.
burglary, forgery = (
    resample(category_rows, replace=False, n_samples=10000, random_state=123)
    for category_rows in (burglary, forgery)
)

In [34]:
# Report the per-class shapes after down-sampling.  The labels now name the
# categories actually held in `burglary` and `forgery` (they previously read
# "Prostitution" and "Robbery").
print("Dimmentionality of the data after:")
print("Burglary", burglary.shape)
print("Forgery", forgery.shape)

Dimmentionality of the data after:
Prostitution (10000, 37)
Robbery (10000, 37)


In [35]:
# Stack the two balanced samples into one frame and make sure ``Date`` holds
# datetimes.
dataset = pd.concat([burglary, forgery]).assign(
    Date=lambda combined: pd.to_datetime(combined['Date'])
)

In [36]:
# dataset[['Category', 'PdDistrict', 'DayOfWeek', 'Date', 'Time', 'Year']]

In [37]:
# Derive numeric calendar features: the month from ``Date`` and the hour from
# ``Time``.
dataset = dataset.assign(
    Month=lambda frame: pd.to_datetime(frame['Date']).dt.month,
    Time_numeric=lambda frame: pd.to_datetime(frame['Time']).dt.hour,
)

In [38]:
def day_to_num(x):
    """Map a 'DayOfWeek' name to its number, 1 (Monday) through 7 (Sunday).

    Any value that does not equal one of the seven capitalised English day
    names yields float('nan').
    """
    day_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday')
    for number, name in enumerate(day_names, start=1):
        if x == name:
            return number
    return float('nan')

# Encode the weekday names as integers via day_to_num and store them as int64.
dataset['DayOfWeek_numeric'] = dataset['DayOfWeek'].map(day_to_num).astype('int64')

In [39]:
# dataset['PdDistrict'].value_counts()

# def district_to_num(x):
#     for i,district in enumerate (dataset['PdDistrict'].unique()):
#         if (x==district): return i

# dataset['PdDistrict_num'] = dataset['PdDistrict'].apply(lambda x: district_to_num(x)).astype('int64')

# One-hot encode the police district: the ``PdDistrict`` column is replaced by
# one ``PdDistrict_<name>`` indicator column per district.
dataset2 = pd.get_dummies(
    data=dataset,
    columns=['PdDistrict'],
    prefix='PdDistrict',
    prefix_sep='_',
)

Now we are going to build a decision tree (or, even better, a Random Forest; here is another tutorial for Random Forests) classifier that takes as input the four features (hour of the day, day of the week, month of the year, and PD district) of a crime from one of the two categories and tries to predict which category that crime belongs to.

In [40]:
# Select the model inputs by name rather than by position: the three calendar
# features plus every one-hot district column.  The previous ``iloc[:, -13:]``
# silently picked the wrong columns whenever the calendar and district columns
# did not add up to exactly the last thirteen columns of the frame.
# ``PdDistrict_num`` (the integer-coded district, if it is present) is not a
# one-hot column and is therefore excluded.
district_columns = [
    column for column in dataset2.columns
    if column.startswith('PdDistrict_') and column != 'PdDistrict_num'
]
features = dataset2.loc[:, ['Month', 'Time_numeric', 'DayOfWeek_numeric'] + district_columns].copy()

# The target is kept as a one-column frame because evaluate() reads
# ``y_test['Category']``.
targets = dataset2.loc[:, ['Category']].copy()

In [41]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split stable
# across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    random_state=42,
    test_size=0.33,
)

In [42]:
# Fit a decision tree on the training split and predict the held-out rows.
# random_state is fixed because the tree permutes the candidate features at
# every split; without a seed the fitted tree, and so the reported score, can
# change from run to run.
model = DecisionTreeClassifier(random_state=42)
decision_tree = model.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

In [43]:
def evaluate(y_test, y_pred):
    """Return the accuracy of `y_pred` against the true labels in `y_test`.

    Accuracy is the fraction of rows whose predicted label equals the true
    label.  (The result used to be stored in a local called ``precision``, but
    the formula has always been accuracy.)  Unlike the earlier hand-rolled
    two-class counting, this works for any number of classes, including a
    test set that contains a single class.

    Parameters
    ----------
    y_test : pandas.DataFrame
        Frame with a 'Category' column holding the true labels.
    y_pred : array-like
        Predicted labels, in the same row order as `y_test`.

    Returns
    -------
    float
        Number of correct predictions divided by the number of rows.

    Raises
    ------
    ValueError
        If `y_pred` does not contain one prediction per row of `y_test`.
    ZeroDivisionError
        If `y_test` is empty.
    """
    # Compare as object arrays so string labels are matched element by element
    # regardless of the dtype the classifier returned.
    truth = y_test['Category'].to_numpy(dtype=object)
    predicted = np.asarray(y_pred, dtype=object).ravel()
    if predicted.shape != truth.shape:
        raise ValueError(
            "y_pred has %d entries but y_test has %d rows" % (predicted.size, truth.size)
        )

    correct = int((truth == predicted).sum())
    return correct / len(truth)

In [44]:
# Score the current predictions against the held-out labels.
evaluate(y_test, y_pred)

0.593939393939394

Does one hot encoding affect your results? Why/Why not?

In [45]:
# Distinct districts in order of first appearance.  Computed once here; the
# lookup below previously recomputed ``unique()`` for every single row.
district_order = list(dataset['PdDistrict'].unique())


def district_to_num(x):
    """Return the position of district `x` in `district_order`.

    Returns None when `x` equals none of the listed districts (this includes a
    missing value, since NaN never compares equal).
    """
    for i, district in enumerate(district_order):
        if x == district:
            return i
    return None


dataset['PdDistrict_num'] = dataset['PdDistrict'].apply(district_to_num).astype('int64')

In [46]:
# Same calendar features as before, but with the district as a single
# integer-coded column instead of one-hot indicators.
features2 = dataset[['Month', 'Time_numeric', 'DayOfWeek_numeric', 'PdDistrict_num']].copy()
targets2 = dataset[['Category']].copy()

In [47]:
# Hold out 33% of the integer-coded data with the same seed as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    features2,
    targets2,
    random_state=42,
    test_size=0.33,
)

In [48]:
# Refit the decision tree on the integer-coded features.  random_state is
# fixed so that the comparison with the one-hot variant does not depend on
# the tree's internal randomness.
model = DecisionTreeClassifier(random_state=42)
decision_tree = model.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

In [49]:
# Score the current predictions against the held-out labels.
evaluate(y_test, y_pred)

0.5959090909090909

## Part 3 

In [50]:
# Load the hourly weather observations and bring their timestamps into the
# zone used for the crime `datetime` column so that the two can be merged.
# The conversion is applied to the whole column after reading, because the
# `date_parser` argument of read_csv is deprecated.
weather = pd.read_csv("https://raw.githubusercontent.com/suneman/socialdata2021/master/files/weather_data.csv")
weather["date"] = (
    pd.to_datetime(weather["date"], utc=True)  # parse; values come out UTC-aware
    .dt.tz_convert(None)                       # drop the zone, keeping the UTC wall-clock time
    .dt.tz_localize("Etc/GMT+3")               # re-label those wall-clock times as Etc/GMT+3
    .dt.tz_convert("Etc/GMT-7")                # express the result in Etc/GMT-7
)
# NOTE(review): POSIX-style "Etc/GMT" names have inverted signs -- "Etc/GMT-7"
# is UTC+07:00 and "Etc/GMT+3" is UTC-03:00.  The zone steps reproduce the
# original parser exactly; confirm that this is the intended alignment with
# San Francisco local time before relying on the merged weather values.
weather.head()

Unnamed: 0,date,temperature,humidity,weather,wind_speed,wind_direction,pressure
0,2012-10-01 23:00:00+07:00,16.33,88.0,light rain,2.0,150.0,1009.0
1,2012-10-02 00:00:00+07:00,16.324993,87.0,sky is clear,2.0,147.0,1009.0
2,2012-10-02 01:00:00+07:00,16.310618,86.0,sky is clear,2.0,141.0,1009.0
3,2012-10-02 02:00:00+07:00,16.296243,85.0,sky is clear,2.0,135.0,1009.0
4,2012-10-02 03:00:00+07:00,16.281869,84.0,sky is clear,2.0,129.0,1009.0


In [51]:
# dataset["datetime"] = dataset.apply(lambda x: pd.to_datetime(x.Date + " " + x.Time).round("H").tz_localize("ETC/GMT-7"), axis = 1) 

# # Here we do a bit more complicated thing
# # .apply allows us to use function for each row of a dataframe (read documentation for more info)
# # so we take a row (which is x) and take cell of Date and Time -> and concatenate them to one big string
# # that can be then converted to datetime. We would also want to remove any seconds and minutes (round to hours)
# # then we specify that dates are in GMT-7
# # the result is going to be stored in new "datetime" column

# #it might take some time

# # now you  can merge two datasets
# dataset.head()

In [54]:
# Inner-join incidents to the weather observation carrying the same hourly
# timestamp; incidents without a matching observation are dropped.
Crimes_Weather = dataset.merge(
    weather,
    how='inner',
    left_on='datetime',
    right_on='date',
)

In [64]:
# Count how many merged incidents fall under each weather description.
Crimes_Weather['weather'].value_counts()

sky is clear                        1531
mist                                 957
broken clouds                        455
few clouds                           420
scattered clouds                     404
light rain                           348
overcast clouds                      298
haze                                 271
fog                                  129
moderate rain                        120
smoke                                 27
heavy intensity rain                  25
proximity shower rain                 19
light intensity drizzle               16
proximity thunderstorm                13
thunderstorm                          12
drizzle                                9
heavy snow                             5
thunderstorm with light rain           3
thunderstorm with rain                 2
proximity thunderstorm with rain       1
light intensity shower rain            1
squalls                                1
light snow                             1
Name: weather, d

In [66]:
# Integer-code the weather description through its categorical codes.
Crimes_Weather['Weather_numerical'] = Crimes_Weather['weather'].astype('category').cat.codes

In [67]:
# Display the merged crime/weather frame.
Crimes_Weather

Unnamed: 0,PdId,IncidntNum,Incident Code,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,...,DayOfWeek_numeric,PdDistrict_num,date,temperature,humidity,weather,wind_speed,wind_direction,pressure,Weather_numerical
0,13036344705011,130363447,5011,BURGLARY,"BURGLARY OF APARTMENT HOUSE, FORCIBLE ENTRY",Friday,2013-05-03,16:00,CENTRAL,NONE,...,5,3,2013-05-03 16:00:00+07:00,17.270000,52.0,sky is clear,4.0,130.0,1020.0,18
1,13037119905071,130371199,5071,BURGLARY,"BURGLARY, FORCIBLE ENTRY",Friday,2013-05-03,16:30,SOUTHERN,NONE,...,5,2,2013-05-03 16:00:00+07:00,17.270000,52.0,sky is clear,4.0,130.0,1020.0,18
2,16033022805053,160330228,5053,BURGLARY,"BURGLARY OF STORE, UNLAWFUL ENTRY",Friday,2016-04-22,10:30,CENTRAL,"ARREST, BOOKED",...,5,3,2016-04-22 10:00:00+07:00,19.960000,70.0,broken clouds,4.0,220.0,996.0,0
3,14033104205121,140331042,5121,BURGLARY,"BURGLARY,FLAT UNDER CONSTRUCTION, FORCIBLE ENTRY",Friday,2014-04-18,17:00,INGLESIDE,NONE,...,5,7,2014-04-18 17:00:00+07:00,12.790000,76.0,scattered clouds,9.0,280.0,1012.0,17
4,14047274305014,140472743,5014,BURGLARY,"BURGLARY, VEHICLE (ARREST MADE)",Friday,2014-06-06,20:10,BAYVIEW,"ARREST, BOOKED",...,5,4,2014-06-06 20:00:00+07:00,12.370000,94.0,mist,1.0,225.0,1009.0,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5063,16040921510149,160409215,10149,FORGERY/COUNTERFEITING,"MONEY, CHANGING FACE AMOUNT",Thursday,2016-05-19,10:04,SOUTHERN,UNFOUNDED,...,4,2,2016-05-19 10:00:00+07:00,28.290000,84.0,scattered clouds,3.0,350.0,1009.0,17
5064,13004642709015,130046427,9015,FORGERY/COUNTERFEITING,"CHECKS, FORGERY (FELONY)",Monday,2012-11-19,18:01,SOUTHERN,NONE,...,1,2,2012-11-19 18:00:00+07:00,12.040000,82.0,mist,5.0,250.0,1017.0,11
5065,13025930710149,130259307,10149,FORGERY/COUNTERFEITING,"MONEY, CHANGING FACE AMOUNT",Friday,2013-03-29,16:20,MISSION,"ARREST, BOOKED",...,5,8,2013-03-29 16:00:00+07:00,12.200000,100.0,fog,2.0,190.0,1017.0,3
5066,14110100209020,141101002,9020,FORGERY/COUNTERFEITING,"CHECKS, MAKE OR PASS FICTITIOUS",Wednesday,2014-12-31,16:55,NORTHERN,NONE,...,3,6,2014-12-31 17:00:00+07:00,8.090333,96.0,sky is clear,9.0,20.0,1039.0,18


In [73]:
# Model inputs: the calendar features, the integer-coded district and the
# integer-coded weather description.
features3 = Crimes_Weather.loc[:, ['Month', 'Time_numeric', 'DayOfWeek_numeric', 'PdDistrict_num', 'Weather_numerical']].copy()
targets3 = Crimes_Weather.loc[:, ['Category']].copy()

# Seed both the split and the forest: each is randomised, so without fixed
# seeds the reported score changes on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features3, targets3, test_size=0.3, random_state=42
)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
# Pass the labels as a 1-D Series rather than a one-column frame.  `y_train`
# and `y_test` themselves stay frames because evaluate() reads
# ``y_test['Category']``.
clf.fit(X_train, y_train['Category'])
y_pred = clf.predict(X_test)

In [74]:
# Score the current predictions against the held-out labels.
evaluate(y_test, y_pred)

0.6850756081525312