In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [2]:
# Read the training CSV, parsing the 'Dates' column as datetimes at load time.
train_df = pd.read_csv('../input/train.csv', parse_dates=['Dates'])

In [3]:
# Read the test CSV, parsing the 'Dates' column as datetimes at load time.
test_df = pd.read_csv('../input/test.csv', parse_dates=['Dates'])

In [4]:
# Preview three randomly drawn training rows (no random_state is set, so the rows differ between runs).
train_df.sample(3)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
263249,2011-10-20 21:50:00,OTHER OFFENSES,"DRIVERS LICENSE, SUSPENDED OR REVOKED",Thursday,TENDERLOIN,"ARREST, BOOKED",TURK ST / TAYLOR ST,-122.410769,37.783215
508101,2008-03-18 08:15:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Tuesday,TARAVAL,NONE,RIVERA ST / 37TH AV,-122.495239,37.745926
307277,2011-02-21 23:30:00,ASSAULT,"BATTERY, FORMER SPOUSE OR DATING RELATIONSHIP",Monday,TARAVAL,"ARREST, BOOKED",1600 Block of IRVING ST,-122.475526,37.763577


In [5]:
# Preview three randomly drawn test rows (no random_state is set, so the rows differ between runs).
test_df.sample(3)

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
153614,153614,2013-05-09 10:50:00,Thursday,BAYVIEW,3900 Block of 3RD ST,-122.388476,37.741522
485970,485970,2008-07-19 12:07:00,Saturday,TENDERLOIN,LARKIN ST / GROVE ST,-122.416583,37.778659
221240,221240,2012-06-06 11:00:00,Wednesday,MISSION,500 Block of CASTRO ST,-122.434995,37.76007


In [6]:
def engineer_dates_col(df):
    """Add calendar and cyclical time features derived from the 'Dates' column.

    The frame is modified in place (the new columns are assigned onto ``df``)
    and the same object is returned so that calls can be chained.

    Columns added:
        Year                -- calendar year minus 2000
        Month               -- month number as given by ``.dt.month``
        Hour                -- hour of day as given by ``.dt.hour``
        IsDay               -- 1 when 6 < Hour < 20, otherwise 0
        HourSin / HourCos   -- sine / cosine of Hour on a 24-step circle
        MonthSin / MonthCos -- sine / cosine of Month on a 12-step circle

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Dates' column holding datetimes, or values that
        ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the feature columns added.
    """
    # No-op when the column is already datetime64; parses it otherwise.
    dates_col = pd.to_datetime(df['Dates'])
    hour = dates_col.dt.hour

    df['Year'] = dates_col.dt.year - 2000
    df['Month'] = dates_col.dt.month
    df['Hour'] = hour
    # Vectorised comparison instead of a per-row Python lambda; same 0/1 result.
    df['IsDay'] = ((hour > 6) & (hour < 20)).astype('int64')
    # Sine/cosine pairs keep the wrap-around neighbours (e.g. hour 23 and
    # hour 0, December and January) close together in feature space.
    df['HourSin'] = np.sin((df['Hour']*2*np.pi)/24)
    df['HourCos'] = np.cos((df['Hour']*2*np.pi)/24)
    df['MonthSin'] = np.sin((df['Month']*2*np.pi)/12)
    df['MonthCos'] = np.cos((df['Month']*2*np.pi)/12)
    return df

In [7]:
def engineer_dayofweek_col(df):
    """Add sine/cosine encodings of the 'DayOfWeek' name column.

    Day names are numbered Monday=1 through Sunday=7 and placed on a 7-step
    circle. Any value that is not one of those seven names maps to NaN, and
    so do its sine and cosine. ``df`` is modified in place and returned.
    """
    weekday_names = [
        'Monday',
        'Tuesday',
        'Wednesday',
        'Thursday',
        'Friday',
        'Saturday',
        'Sunday',
    ]
    weekday_number = {
        name: position for position, name in enumerate(weekday_names, start=1)
    }
    angle = (df['DayOfWeek'].map(weekday_number)*2*np.pi)/7
    df['DayOfWeekSin'] = np.sin(angle)
    df['DayOfWeekCos'] = np.cos(angle)
    return df

In [8]:
def engineer_category_col(df):
    """Replace the 'Category' labels with integer codes from a LabelEncoder.

    A fresh ``LabelEncoder`` is fitted on the column each time this is called.
    The encoder is not returned, so the code-to-label mapping is not kept by
    this function. ``df`` is modified in place and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Category' column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with 'Category' overwritten by its codes.
    """
    le = LabelEncoder()
    # Assign the encoded array directly. Wrapping it in pd.Series would give it
    # a fresh 0..n-1 index, and column assignment aligns on index labels, so a
    # frame whose index is not exactly 0..n-1 (e.g. after filtering or
    # sampling) would receive NaN or misplaced codes.
    df['Category'] = le.fit_transform(df['Category'])
    return df

In [9]:
def engineer_pddistrict_col(df):
    """One-hot encode the 'PdDistrict' column.

    Returns a new frame in which 'PdDistrict' is replaced by one
    ``PdDistrict_<value>`` indicator column per distinct value present in
    ``df``. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'PdDistrict' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the indicator columns appended after the remaining
        original columns.
    """
    # Pin the dtype: the get_dummies default became bool in pandas 2.0, which
    # would turn the 0/1 indicators into True/False in the exported CSV.
    return pd.get_dummies(df, columns=['PdDistrict'], dtype=np.uint8)

In [10]:
def engineer_all_cols(df, encode_category=False):
    """Run every feature-engineering step on ``df`` and return the result.

    The date, day-of-week and police-district steps always run, in that
    order. The 'Category' label encoding runs last, and only when
    ``encode_category`` is true.
    """
    steps = [
        engineer_dates_col,
        engineer_dayofweek_col,
        engineer_pddistrict_col,
    ]
    if encode_category:
        steps.append(engineer_category_col)

    for step in steps:
        df = step(df)
    return df

In [11]:
# Build the feature columns for both sets; only the training set has a 'Category' column to encode.
# NOTE(review): both names are rebound to the transformed frames, and the
# district step drops the raw 'PdDistrict' column, so re-running this cell
# without reloading the CSVs will presumably fail - confirm before relying on re-runs.
train_df = engineer_all_cols(train_df, encode_category=True)
test_df  = engineer_all_cols(test_df)

In [12]:
# Spot-check three random rows of the engineered training frame (unseeded, so the rows differ between runs).
train_df.sample(3)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,Resolution,Address,X,Y,Year,Month,Hour,IsDay,HourSin,HourCos,MonthSin,MonthCos,DayOfWeekSin,DayOfWeekCos,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
866435,2003-02-23 16:00:00,4,"BURGLARY OF HOTEL ROOM, FORCIBLE ENTRY",Sunday,NONE,0 Block of 6TH ST,-122.40942,37.781615,3,2,16,1,-0.866025,-0.5,0.866025,0.5,-2.449294e-16,1.0,0,0,0,0,0,0,0,1,0,0
228486,2012-04-21 01:00:00,20,LOST PROPERTY,Saturday,NONE,300 Block of 11TH ST,-122.413189,37.771274,12,4,1,0,0.258819,0.965926,0.866025,-0.5,-0.7818315,0.62349,0,0,0,0,0,0,0,1,0,0
707528,2005-05-05 10:54:00,20,AIDED CASE,Thursday,NONE,200 Block of SANMARCOS AV,-122.466395,37.746551,5,5,10,1,0.5,-0.866025,0.5,-0.866025,-0.4338837,-0.900969,0,0,0,0,0,0,0,0,1,0


In [13]:
# Spot-check three random rows of the engineered test frame (unseeded, so the rows differ between runs).
test_df.sample(3)

Unnamed: 0,Id,Dates,DayOfWeek,Address,X,Y,Year,Month,Hour,IsDay,HourSin,HourCos,MonthSin,MonthCos,DayOfWeekSin,DayOfWeekCos,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
103792,103792,2014-01-01 00:05:00,Wednesday,300 Block of POWELL ST,-122.408384,37.787827,14,1,0,0,0.0,1.0,0.5,0.8660254,0.433884,-0.900969,0,1,0,0,0,0,0,0,0,0
86717,86717,2014-03-25 08:00:00,Tuesday,WILDWOOD WY / EASTWOOD DR,-122.45738,37.727588,14,3,8,1,0.866025,-0.5,1.0,6.123234000000001e-17,0.974928,-0.222521,0,0,1,0,0,0,0,0,0,0
553096,553096,2007-08-04 16:30:00,Saturday,800 Block of BRYANT ST,-122.403405,37.775421,7,8,16,1,-0.866025,-0.5,-0.866025,-0.5,-0.781831,0.62349,0,0,0,0,0,0,0,1,0,0


In [14]:
# Export toggle: when True, both engineered frames are written as CSV files
# into the current directory, without the index column.
OUTPUT = True
if OUTPUT:
    train_df.to_csv('./train_clean.csv', index=False)
    test_df.to_csv('./test_clean.csv', index=False)
    print('Done Outputing !')

Done Outputing !
