In [1]:
# System modules
import sys
import random
import time
import datetime 

# Data Analysis and Modeling modules
import sklearn
import pandas as pd
import numpy as np
import scipy as sp

In [2]:
train_data_raw = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")

In [139]:
# Create a copy of the data to work on
train_data = train_data_raw.copy(deep=True)
data_cleaner = [train_data, test_data]

In [141]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
Dates         878049 non-null object
Category      878049 non-null object
Descript      878049 non-null object
DayOfWeek     878049 non-null object
PdDistrict    878049 non-null object
Resolution    878049 non-null object
Address       878049 non-null object
X             878049 non-null float64
Y             878049 non-null float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB


In [142]:
# convert 'Dates' to datetime object
for dataset in data_cleaner:
    dataset['Datetime_Dates'] = dataset['Dates'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    print('finished one')

finished one
finished one


In [6]:
print(train_data['Datetime_Dates'].head())
print(train_data['Datetime_Dates'][0].year)

0   2015-05-13 23:53:00
1   2015-05-13 23:53:00
2   2015-05-13 23:33:00
3   2015-05-13 23:30:00
4   2015-05-13 23:30:00
Name: Datetime_Dates, dtype: datetime64[ns]
2015


In [143]:
# check if workday, holiday, daytime
for dataset in data_cleaner:
    dataset['Workhour'] = dataset['Datetime_Dates'].map(lambda x: 1 if x.hour in range(9, 17) else 0)

In [144]:
print(train_data[['Datetime_Dates','Workhour']].head())
print(train_data['Workhour'].unique())
print(train_data['Workhour'].describe())

       Datetime_Dates  Workhour
0 2015-05-13 23:53:00         0
1 2015-05-13 23:53:00         0
2 2015-05-13 23:33:00         0
3 2015-05-13 23:30:00         0
4 2015-05-13 23:30:00         0
[0 1]
count    878049.000000
mean          0.397964
std           0.489478
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           1.000000
Name: Workhour, dtype: float64


In [145]:
# get a list of us federal holidays as a datetime object
from pandas.tseries.holiday import USFederalHolidayCalendar
days_off = USFederalHolidayCalendar().holidays(start='2003-01-01', end='2015-05-31').to_pydatetime()

In [146]:
for dataset in data_cleaner:
    # check if a datetime object is in the days_off list
    dataset['isHoliday'] = dataset['Datetime_Dates'].map(lambda x : 1 if x in days_off else 0)
    print('finished one')

finished one
finished one


In [147]:
print(train_data[['Datetime_Dates','Workhour']].head())
print(train_data['Workhour'].unique())
print(train_data['Workhour'].describe())

       Datetime_Dates  Workhour
0 2015-05-13 23:53:00         0
1 2015-05-13 23:53:00         0
2 2015-05-13 23:33:00         0
3 2015-05-13 23:30:00         0
4 2015-05-13 23:30:00         0
[0 1]
count    878049.000000
mean          0.397964
std           0.489478
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           1.000000
Name: Workhour, dtype: float64


In [148]:
for dataset in data_cleaner:
    # create a column for month
    dataset['Month'] = dataset['Datetime_Dates'].map(lambda x : x.month)
    print('finished one')

finished one
finished one


In [45]:
print(train_data['Month'].unique())

[ 5  4  3  2  1 12 11 10  9  8  7  6]


In [149]:
for dataset in data_cleaner:
    # summer: check if month is between june and august (6-8)
    dataset['isSummer'] = dataset['Month'].map(lambda x : 1 if x == 6 or x == 7 or x == 8 else 0)
    # winter: check if month is between dec and feb (12-2)
    dataset['isWinter'] = dataset['Month'].map(lambda x : 1 if x == 12 or x == 1 or x == 2 else 0)
    # autumn: check if month is between sept and nov (9-11)
    dataset['isAutumn'] = dataset['Month'].map(lambda x : 1 if x == 9 or x == 10 or x == 11 else 0)
    # spring: check if month is between march and may (3-5)
    dataset['isSpring'] = dataset['Month'].map(lambda x : 1 if x == 3 or x == 4 or x == 5 else 0)
    print('finished one')

finished one
finished one


In [151]:
seasons = ['isSummer', 'isAutumn', 'isWinter', 'isSpring']
print(train_data[seasons].describe())

            isSummer       isAutumn       isWinter       isSpring
count  878049.000000  878049.000000  878049.000000  878049.000000
mean        0.238487       0.256513       0.238432       0.266568
std         0.426158       0.436708       0.426125       0.442165
min         0.000000       0.000000       0.000000       0.000000
25%         0.000000       0.000000       0.000000       0.000000
50%         0.000000       0.000000       0.000000       0.000000
75%         0.000000       1.000000       0.000000       1.000000
max         1.000000       1.000000       1.000000       1.000000


In [152]:
print(train_data.info())
print(test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 17 columns):
Dates             878049 non-null object
Category          878049 non-null object
Descript          878049 non-null object
DayOfWeek         878049 non-null object
PdDistrict        878049 non-null object
Resolution        878049 non-null object
Address           878049 non-null object
X                 878049 non-null float64
Y                 878049 non-null float64
Datetime_Dates    878049 non-null datetime64[ns]
Workhour          878049 non-null int64
isHoliday         878049 non-null int64
Month             878049 non-null int64
isSummer          878049 non-null int64
isWinter          878049 non-null int64
isAutumn          878049 non-null int64
isSpring          878049 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(7), object(7)
memory usage: 113.9+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884262 entries, 0 to 884261
Data columns (total 32 

In [153]:
print(train_data['PdDistrict'].head())
print(train_data['PdDistrict'].unique())

0    NORTHERN
1    NORTHERN
2    NORTHERN
3    NORTHERN
4        PARK
Name: PdDistrict, dtype: object
['NORTHERN' 'PARK' 'INGLESIDE' 'BAYVIEW' 'RICHMOND' 'CENTRAL' 'TARAVAL'
 'TENDERLOIN' 'MISSION' 'SOUTHERN']


In [173]:
# get dummy data for pd_district and concat it to test/train dataframes
train_data = pd.concat([train_data, pd.get_dummies(train_data['PdDistrict'])], axis=1)
test_data = pd.concat([test_data, pd.get_dummies(test_data['PdDistrict'])], axis=1)
# for dataset in data_cleaner:
#     dataset = pd.concat([dataset, pd.get_dummies(dataset['PdDistrict'])], axis=1)

In [175]:
pd_districts = train_data['PdDistrict'].unique()
print(train_data.columns.tolist())
print(train_data[pd_districts].head())

['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y', 'Datetime_Dates', 'Workhour', 'isHoliday', 'Month', 'isSummer', 'isWinter', 'isAutumn', 'isSpring', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']
   NORTHERN  PARK  INGLESIDE  BAYVIEW  RICHMOND  CENTRAL  TARAVAL  TENDERLOIN  \
0         1     0          0        0         0        0        0           0   
1         1     0          0        0         0        0        0           0   
2         1     0          0        0         0        0        0           0   
3         1     0          0        0         0        0        0           0   
4         0     1          0        0         0        0        0           0   

   MISSION  SOUTHERN  
0        0         0  
1        0         0  
2        0         0  
3        0         0  
4        0         0  


In [176]:
# get train_test data for DayOfWeek for train/test data
# for dataset in data_cleaner:
#     dataset = pd.concat([dataset, pd.get_dummies(dataset['DayOfWeek'])], axis=1)
train_data = pd.concat([train_data, pd.get_dummies(train_data['DayOfWeek'])], axis=1)
test_data = pd.concat([test_data, pd.get_dummies(test_data['DayOfWeek'])], axis=1)

In [177]:
week_day = test_data['DayOfWeek'].unique()
print(test_data.columns.tolist())
print(test_data[week_day].head())

['Id', 'Dates', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y', 'Datetime_Dates', 'Workhour', 'isHoliday', 'Month', 'isSummer', 'isWinter', 'isAutumn', 'isSpring', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN', 'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN', 'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']
   Sunday  Sunday  Saturday  Saturday  Friday  Friday  Thursday  Thursday  \
0       1       1         0         0       0       0         0         0   
1       1       1         0         0       0       0         0         0   
2       1       1         0         0       0       0         0         0   
3       1       1         0         0       0       0         0         0   
4       1       1         0         0       0       0         0  

In [178]:
data_cleaner = [train_data, test_data]

In [181]:
print(train_data[pd_districts].describe())
print(train_data[week_day].describe())

            NORTHERN           PARK      INGLESIDE        BAYVIEW  \
count  878049.000000  878049.000000  878049.000000  878049.000000   
mean        0.119920       0.056162       0.089796       0.101852   
std         0.324869       0.230234       0.285889       0.302454   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.000000       0.000000       0.000000   
50%         0.000000       0.000000       0.000000       0.000000   
75%         0.000000       0.000000       0.000000       0.000000   
max         1.000000       1.000000       1.000000       1.000000   

            RICHMOND        CENTRAL        TARAVAL     TENDERLOIN  \
count  878049.000000  878049.000000  878049.000000  878049.000000   
mean        0.051488       0.097329       0.074707       0.093171   
std         0.220991       0.296406       0.262917       0.290673   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.0000

In [185]:
# make the attributes form Datetime field into int column fields
for dataset in data_cleaner:
    # create a column for Year
    dataset['Year'] = dataset['Datetime_Dates'].map(lambda x : x.year)
    # create a column for Day
    dataset['Day'] = dataset['Datetime_Dates'].map(lambda x : x.day)
    # create a column for Hour
    dataset['Hour'] = dataset['Datetime_Dates'].map(lambda x : x.hour)
    print('finished one')

finished one
finished one


In [184]:
print(train_data['Year'])

0         2015
1         2015
2         2015
3         2015
4         2015
5         2015
6         2015
7         2015
8         2015
9         2015
10        2015
11        2015
12        2015
13        2015
14        2015
15        2015
16        2015
17        2015
18        2015
19        2015
20        2015
21        2015
22        2015
23        2015
24        2015
25        2015
26        2015
27        2015
28        2015
29        2015
          ... 
878019    2003
878020    2003
878021    2003
878022    2003
878023    2003
878024    2003
878025    2003
878026    2003
878027    2003
878028    2003
878029    2003
878030    2003
878031    2003
878032    2003
878033    2003
878034    2003
878035    2003
878036    2003
878037    2003
878038    2003
878039    2003
878040    2003
878041    2003
878042    2003
878043    2003
878044    2003
878045    2003
878046    2003
878047    2003
878048    2003
Name: Year, Length: 878049, dtype: int64


# Send Data for Data Visualization and Modeling

In [187]:
# processed data to csv file done here
train_data.to_csv('./data/processed_train_data.csv')
test_data.to_csv('./data/processed_test_data.csv')