# Crime classification in San Francisco

In [12]:
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


In [13]:
#Read training data

train_data = pd.read_csv('data/train.csv')
print(train_data.shape)
print('data loaded')

(878049, 9)
data loaded


In [14]:
#Get categories (output to find)

cat = train_data.Category.unique()
print(cat)
list_cat=np.copy(cat)
np.save('categories.npy', list_cat) 
train_data.drop(['Descript','Resolution','Address'],inplace =True, axis=1)  #data useless, unsignificant, estimated useless
train_data.drop(train_data[(train_data.Y>60)|(train_data.X>-122)].index, inplace=True)  #outliers
train_data['DayOfWeek'].replace(to_replace=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'],value=[i for i in range(0,7)],inplace=True)


train_data.head()

['WARRANTS' 'OTHER OFFENSES' 'LARCENY/THEFT' 'VEHICLE THEFT' 'VANDALISM'
 'NON-CRIMINAL' 'ROBBERY' 'ASSAULT' 'WEAPON LAWS' 'BURGLARY'
 'SUSPICIOUS OCC' 'DRUNKENNESS' 'FORGERY/COUNTERFEITING' 'DRUG/NARCOTIC'
 'STOLEN PROPERTY' 'SECONDARY CODES' 'TRESPASS' 'MISSING PERSON' 'FRAUD'
 'KIDNAPPING' 'RUNAWAY' 'DRIVING UNDER THE INFLUENCE'
 'SEX OFFENSES FORCIBLE' 'PROSTITUTION' 'DISORDERLY CONDUCT' 'ARSON'
 'FAMILY OFFENSES' 'LIQUOR LAWS' 'BRIBERY' 'EMBEZZLEMENT' 'SUICIDE'
 'LOITERING' 'SEX OFFENSES NON FORCIBLE' 'EXTORTION' 'GAMBLING'
 'BAD CHECKS' 'TREA' 'RECOVERED VEHICLE' 'PORNOGRAPHY/OBSCENE MAT']


Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,X,Y
0,2015-05-13 23:53:00,WARRANTS,2,NORTHERN,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,2,NORTHERN,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,2,NORTHERN,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,2,NORTHERN,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,2,PARK,-122.438738,37.771541


In [4]:
#Format time data

train_data.Dates = pd.to_datetime(train_data.Dates)
train_data['Time'] = train_data.Dates.dt.hour*3600 + train_data.Dates.dt.minute*60 + train_data.Dates.dt.second
train_data['Hour'] = train_data.Dates.dt.hour
train_data['Day'] = train_data.Dates.dt.day
train_data['Month']=train_data.Dates.dt.month
train_data['Year']=train_data.Dates.dt.year
for key in ['Time','Hour','Day','Month','Year']:
    train_data[key]=pd.to_numeric(train_data[key])


train_data['Category'].replace(to_replace=cat,value=[i for i in range(len(cat))],inplace=True)
train_data.head()

Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,X,Y,Time,Hour,Day,Month,Year
0,2015-05-13 23:53:00,0,2,NORTHERN,-122.425892,37.774599,85980,23,13,5,2015
1,2015-05-13 23:53:00,1,2,NORTHERN,-122.425892,37.774599,85980,23,13,5,2015
2,2015-05-13 23:33:00,1,2,NORTHERN,-122.424363,37.800414,84780,23,13,5,2015
3,2015-05-13 23:30:00,2,2,NORTHERN,-122.426995,37.800873,84600,23,13,5,2015
4,2015-05-13 23:30:00,2,2,PARK,-122.438738,37.771541,84600,23,13,5,2015


In [5]:
#Cell to run if you want to have year normalized between -1 and 1
def normalize_year(data):
    years = data.Dates.dt.year.unique()
    years = sorted(years)
    a = 2/(years[-1] - years[0])
    b = -a*(years[-1]+years[0])/2
    data['Year'].replace(to_replace=years,value=[a*year +b for year in years],inplace=True)

normalize_year(train_data)
train_data.head()

Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,X,Y,Time,Hour,Day,Month,Year
0,2015-05-13 23:53:00,0,2,NORTHERN,-122.425892,37.774599,85980,23,13,5,1.0
1,2015-05-13 23:53:00,1,2,NORTHERN,-122.425892,37.774599,85980,23,13,5,1.0
2,2015-05-13 23:33:00,1,2,NORTHERN,-122.424363,37.800414,84780,23,13,5,1.0
3,2015-05-13 23:30:00,2,2,NORTHERN,-122.426995,37.800873,84600,23,13,5,1.0
4,2015-05-13 23:30:00,2,2,PARK,-122.438738,37.771541,84600,23,13,5,1.0


In [6]:
def District_barycenters(data):
    Districts = data['PdDistrict'].unique()
    Bary_coord = {}
    for i in range(len(Districts)):
        test = data[['PdDistrict','X','Y']]
        district = test.loc[test['PdDistrict']==Districts[i],:]
        abs = district['X']
        ord = district['Y']
        mean_abs = abs.mean()
        mean_ord = ord.mean()
        Bary_coord[Districts[i]] = (mean_abs,mean_ord)
    data['X_Bar_District'] = data['PdDistrict']
    data['Y_Bar_District'] = data['PdDistrict']
    data.drop('PdDistrict', inplace = True, axis=1)
    data['X_Bar_District'].replace(to_replace=Districts,value=[Bary_coord[Districts[i]][0] for i in range(len(Districts))],                   inplace=True)
    data['Y_Bar_District'].replace(to_replace=Districts,value=[Bary_coord[Districts[i]][1] for i in range(len(Districts))],                   inplace=True)
District_barycenters(train_data)
train_data.head()

Unnamed: 0,Dates,Category,DayOfWeek,X,Y,Time,Hour,Day,Month,Year,X_Bar_District,Y_Bar_District
0,2015-05-13 23:53:00,0,2,-122.425892,37.774599,85980,23,13,5,1.0,-122.426647,37.786379
1,2015-05-13 23:53:00,1,2,-122.425892,37.774599,85980,23,13,5,1.0,-122.426647,37.786379
2,2015-05-13 23:33:00,1,2,-122.424363,37.800414,84780,23,13,5,1.0,-122.426647,37.786379
3,2015-05-13 23:30:00,2,2,-122.426995,37.800873,84600,23,13,5,1.0,-122.426647,37.786379
4,2015-05-13 23:30:00,2,2,-122.438738,37.771541,84600,23,13,5,1.0,-122.445448,37.770299


In [7]:
def percentage_per_category(data):
    percentage_per_category = np.zeros(39)
    for c in range(39):
        #compute the percentage of category c in data
        percentage_per_category[c] = data.loc[data['Category']==c,:].count().unique()/len(data) 
    return percentage_per_category

In [8]:
# this function enables to add data in the sample in order to minimize (less than epsilon = 1e-3 here) the gap between the proportion of
# each category in the real data (df) and in the sample (dfsample)
def rebuild_data(data,sample, epsilon = 1e-3):
    real_percentage = percentage_per_category(data)
    proportion = percentage_per_category(sample)
    category = np.sort(data['Category'].unique())
    N = len(category)
    for i in range(N):
        m = category[i]
        incomplete = True
        error = np.abs(real_percentage[m] - proportion[m])
        while incomplete:
            data_cat = data.loc[data['Category'] == m,:]
            n = len(data_cat)
            index = np.random.randint(1,n)
            new_line = data_cat[index:index+1]
            sample = sample.append(new_line,ignore_index = True)
            proportion = percentage_per_category(sample)
            incomplete = real_percentage[m] - proportion[m] > epsilon
            error = np.abs(real_percentage[m] - proportion[m])
    return sample

In [9]:
#Cell to run if you want to have cyclic year

train_data['Sin_Year'] = np.sin(2 * np.pi * train_data['Time'] / (24*60*60))  # cycle rpz to explore 
train_data['Cos_Year'] = np.cos(2 * np.pi * train_data['Time'] / (24*60*60))

train_data['Sin_Hour'] = np.sin(2 * np.pi * train_data['Hour'] / 24)  # cycle rpz to explore 
train_data['Cos_Hour'] = np.cos(2 * np.pi * train_data['Hour'] / 24)

train_data['Sin_Day_m'] = np.sin(2 * np.pi * train_data['Day'] / 31)  # cycle rpz to explore 
train_data['Cos_Day_m'] = np.cos(2 * np.pi * train_data['Day'] / 31) 

train_data['Sin_Month'] = np.sin(2 * np.pi * train_data['Month'] / 12)  # cycle rpz to explore 
train_data['Cos_Month'] = np.cos(2 * np.pi * train_data['Month'] / 12) 

train_data['Sin_Day_w'] = np.sin(2 * np.pi * train_data['DayOfWeek'] / 7)  # cycle rpz to explore 
train_data['Cos_Day_w'] = np.cos(2 * np.pi * train_data['DayOfWeek'] / 7) 

train_data.drop(['Dates','Time','Hour','Day','Month','DayOfWeek'],inplace =True, axis=1)

train_data.head()

Unnamed: 0,Category,X,Y,Year,X_Bar_District,Y_Bar_District,Sin_Year,Cos_Year,Sin_Hour,Cos_Hour,Sin_Day_m,Cos_Day_m,Sin_Month,Cos_Month,Sin_Day_w,Cos_Day_w
0,0,-122.425892,37.774599,1.0,-122.426647,37.786379,-0.030539,0.999534,-0.258819,0.965926,0.485302,-0.874347,0.5,-0.866025,0.974928,-0.222521
1,1,-122.425892,37.774599,1.0,-122.426647,37.786379,-0.030539,0.999534,-0.258819,0.965926,0.485302,-0.874347,0.5,-0.866025,0.974928,-0.222521
2,1,-122.424363,37.800414,1.0,-122.426647,37.786379,-0.117537,0.993068,-0.258819,0.965926,0.485302,-0.874347,0.5,-0.866025,0.974928,-0.222521
3,2,-122.426995,37.800873,1.0,-122.426647,37.786379,-0.130526,0.991445,-0.258819,0.965926,0.485302,-0.874347,0.5,-0.866025,0.974928,-0.222521
4,2,-122.438738,37.771541,1.0,-122.445448,37.770299,-0.130526,0.991445,-0.258819,0.965926,0.485302,-0.874347,0.5,-0.866025,0.974928,-0.222521


In [10]:
test_data = pd.read_csv('data/test.csv')
print('test data loaded')
test_data.drop(['Address'],inplace =True, axis=1)  #data useless, unsignificant
test_data['DayOfWeek'].replace(to_replace=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'],value=[i for i in range(0,7)],inplace=True)

test_data.Dates = pd.to_datetime(test_data.Dates)
test_data['Time'] = test_data.Dates.dt.hour*3600 + test_data.Dates.dt.minute*60 + test_data.Dates.dt.second
test_data['Hour'] = test_data.Dates.dt.hour
test_data['Day'] = test_data.Dates.dt.day
test_data['Month']=test_data.Dates.dt.month
test_data['Year']=test_data.Dates.dt.year
for key in ['Time','Hour','Day','Month','Year']:
    test_data[key]=pd.to_numeric(test_data[key])
normalize_year(test_data)
District_barycenters(test_data)

test_data['Sin_Time'] = np.sin(2 * np.pi * test_data['Time'] / (24*60*60))  # cycle rpz to explore 
test_data['Cos_Time'] = np.cos(2 * np.pi * test_data['Time'] / (24*60*60))

test_data['Sin_Hour'] = np.sin(2 * np.pi * test_data['Hour'] / 24)  # cycle rpz to explore 
test_data['Cos_Hour'] = np.cos(2 * np.pi * test_data['Hour'] / 24)

test_data['Sin_Day_m'] = np.sin(2 * np.pi * test_data['Day'] / 31)  # cycle rpz to explore 
test_data['Cos_Day_m'] = np.cos(2 * np.pi * test_data['Day'] / 31) 

test_data['Sin_Month'] = np.sin(2 * np.pi * test_data['Month'] / 12)  # cycle rpz to explore 
test_data['Cos_Month'] = np.cos(2 * np.pi * test_data['Month'] / 12) 

test_data['Sin_Day_w'] = np.sin(2 * np.pi * test_data['DayOfWeek'] / 7)  # cycle rpz to explore 
test_data['Cos_Day_w'] = np.cos(2 * np.pi * test_data['DayOfWeek'] / 7) 

test_data.drop(['Id','Time','Hour','Day','Month','DayOfWeek','Dates'],inplace =True, axis=1)

test_data.head()

test data loaded


Unnamed: 0,X,Y,Year,X_Bar_District,Y_Bar_District,Sin_Time,Cos_Time,Sin_Hour,Cos_Hour,Sin_Day_m,Cos_Day_m,Sin_Month,Cos_Month,Sin_Day_w,Cos_Day_w
0,-122.399588,37.735051,1.0,-122.393457,37.740094,-0.004363,0.99999,-0.258819,0.965926,0.897805,-0.440394,0.5,-0.866025,-0.781831,0.62349
1,-122.391523,37.732432,1.0,-122.393457,37.740094,-0.03926,0.999229,-0.258819,0.965926,0.897805,-0.440394,0.5,-0.866025,-0.781831,0.62349
2,-122.426002,37.792212,1.0,-122.426336,37.795198,-0.043619,0.999048,-0.258819,0.965926,0.897805,-0.440394,0.5,-0.866025,-0.781831,0.62349
3,-122.437394,37.721412,1.0,-122.428722,37.728411,-0.065403,0.997859,-0.258819,0.965926,0.897805,-0.440394,0.5,-0.866025,-0.781831,0.62349
4,-122.437394,37.721412,1.0,-122.428722,37.728411,-0.065403,0.997859,-0.258819,0.965926,0.897805,-0.440394,0.5,-0.866025,-0.781831,0.62349


In [11]:
train_data.to_csv("pre_processing_train_data.csv")
test_data.to_csv("pre_processing_test_data.csv")