## Machine Learning Project: Crime prediction in Chicago
### Approach 2: Naive Bayes
#### The problem we propose is to predict future occurrences of crime by identifying times and locations for different crime types for Chicago. 

### Import Chicago crime location dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
from sklearn.model_selection import KFold
%matplotlib inline

In [3]:
inputfile = 'Crimes_-_2008_to_present.csv' ## Full crime dataset from 2008

In [4]:
columns_to_use = ['Date', 'District', 'X Coordinate', 'Y Coordinate', 'Latitude', 'Longitude', 'Community Area'] ## The GPS coordinates seem to be suited enough

data = pd.read_csv(inputfile, sep=',', encoding='utf8', usecols=columns_to_use)
data = data[np.isfinite(data['District'])]
data.columns

Index(['Date', 'District', 'Community Area', 'X Coordinate', 'Y Coordinate',
       'Latitude', 'Longitude'],
      dtype='object')

In [5]:
data.head()

Unnamed: 0,Date,District,Community Area,X Coordinate,Y Coordinate,Latitude,Longitude
0,08/15/2016 04:10:55 PM,11.0,26.0,,,,
1,08/19/2016 08:14:00 PM,11.0,26.0,,,,
2,07/29/2016 01:46:00 PM,7.0,67.0,,,,
3,01/04/2017 12:26:00 AM,25.0,25.0,,,,
4,01/04/2017 12:15:00 AM,4.0,46.0,,,,


### Application of Naive Bayes 

In [2]:
train = pd.read_csv('C:\Users\jyots\Music\NovoEd\CRIME_DATA.csv') #Chicago crime data for past 1yr
HI = pd.read_csv('C:\Users\jyots\Music\NovoEd\Book1.csv') #CSV with community area number with its corresponding HI

In [5]:
train = train.merge(HI,on='Community Area',how="inner")
train.head()
kf = KFold(n_splits=2)
train,test =kf.get_n_splits(train)

Unnamed: 0,Date,Primary Type,Community Area,HI
0,1/4/2017 5:30,MOTOR VEHICLE THEFT,1,39
1,1/4/2017 13:14,CRIMINAL TRESPASS,1,39
2,1/4/2017 15:23,BATTERY,1,39
3,1/4/2017 18:30,BATTERY,1,39
4,1/4/2017 20:03,MOTOR VEHICLE THEFT,1,39


In [6]:
category = train['Primary Type']
prior_prob=category.value_counts(normalize=True)
#prior_prob= pd.DataFrame(prior_prob,columns=['Proir_Prob'])

In [7]:
prior_prob= pd.DataFrame(prior_prob)
prior_prob.columns=['Prior_probs']

In [19]:
train['Hour']= pd.to_datetime(train['Date']).apply(lambda x: x.hour)

In [22]:
train['Day']= pd.to_datetime(train['Date']).apply(lambda x: x.weekday)
train['Day']= pd.to_datetime(train['Date']).apply(lambda x: x.dayofweek)
train['month']= pd.to_datetime(train['Date']).apply(lambda x: x.month

In [12]:
# Likelihood calculations
likelihood_hour= pd.crosstab(train['Primary Type'],train.Hour).apply(lambda x: x/x.sum(),axis=1)
likelihood_HI = pd.crosstab(train['Primary Type'],train.HI).apply(lambda x: x/x.sum(),axis=1)
likelihood_CAN= pd.crosstab(train['Primary Type'],train['Community Area']).apply(lambda x: x/x.sum(),axis=1)

In [13]:
def classify(row):
    hour= row['Hour']
    HI= row['HI']
    CAN = row['Community Area']
    month= row['month']
    Day = row['Day']
    
    likelihood_hour_sub = likelihood_hour[hour]
    likelihood_HI_sub = likelihood_HI[HI]
    likelihood_CAN_sub = likelihood_CAN[CAN]
    likelihood_month_sub = likelihood_month[month]
    likelihood_day_sub = likelihood_Day[Day]
    
    #probabilities
    probabilities = prior_prob.join(likelihood_hour_sub,how="inner").join(likelihood_HI_sub,how="inner").join(likelihood_CAN_sub,how="inner").join(likelihood_day_sub,how="inner").join(likelihood_month_sub,how="inner")
    probabilities['Probability']= probabilities.prod(axis=1)
    probabilities = probabilities.sort('Probability',ascending=False)
    
    class_label= probabilities.index.values[0]
    return(class_label)

In [28]:
from numpy import random
n= random.randint(0,(len(train)-1000))
test = train[n:n+1000]

In [29]:
test['ClassLabel'] = test.apply(lambda x: classify(x),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [30]:
test.head()

Unnamed: 0,Date,Primary Type,Community Area,HI,Hour,ClassLabel
177144,11/11/2016 15:30,THEFT,51,65,15,THEFT
177145,11/11/2016 15:30,THEFT,51,65,15,THEFT
177146,11/11/2016 15:30,THEFT,51,65,15,THEFT
177147,10/29/2016 9:30,OFFENSE INVOLVING CHILDREN,51,65,9,OTHER OFFENSE
177148,10/29/2016 9:30,OFFENSE INVOLVING CHILDREN,51,65,9,OTHER OFFENSE


In [31]:
results = pd.crosstab(test.index,(test['ClassLabel']))
results = pd.DataFrame(results)
results.head()

ClassLabel,BATTERY,CRIMINAL DAMAGE,OTHER OFFENSE,THEFT
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
177144,0,0,0,1
177145,0,0,0,1
177146,0,0,0,1
177147,0,0,1,0
177148,0,0,1,0


In [32]:
category= prior_prob.index.values
predictions= pd.DataFrame(index=test.index,columns=category)
predictions= predictions.fillna(0)
predictions= (predictions+results).fillna(0)

In [None]:
#predictions.to_csv('C:\Users\jyots\Music/predictions.csv')

In [33]:
predictions.head()

Unnamed: 0,ARSON,ASSAULT,BATTERY,BURGLARY,CONCEALED CARRY LICENSE VIOLATION,CRIM SEXUAL ASSAULT,CRIMINAL DAMAGE,CRIMINAL TRESPASS,DECEPTIVE PRACTICE,GAMBLING,...,OFFENSE INVOLVING CHILDREN,OTHER NARCOTIC VIOLATION,OTHER OFFENSE,PROSTITUTION,PUBLIC PEACE VIOLATION,ROBBERY,SEX OFFENSE,STALKING,THEFT,WEAPONS VIOLATION
177144,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1,0.0
177145,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1,0.0
177146,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1,0.0
177147,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0,0.0
177148,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0,0.0


In [46]:
def getAccuracy(test, predictions):
    correct = 0
    for i in range(len(test)):
        if test.iloc[i][-1] == test.iloc[i][1]:
            correct += 1
    return (correct/float(len(test))) * 100.0

accuracy = getAccuracy(test, predictions)
print(accuracy)

25.3
