In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import os
import re
import lightgbm as lgb
import eli5
from eli5.sklearn import PermutationImportance
from lightgbm import LGBMClassifier

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


['sf_map_copyright_openstreetmap_contributors.txt', 'train.csv.zip', 'sf_map_copyright_openstreetmap_contributors.rds', 'test.csv.zip', 'test.csv', 'sampleSubmission.csv', 'train.csv']


In [2]:
train = pd.read_csv('../input/train.csv', parse_dates=['Dates'])
test = pd.read_csv('../input/test.csv', parse_dates=['Dates'], index_col='Id')

In [3]:
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [4]:
test.head()

Unnamed: 0_level_0,Dates,DayOfWeek,PdDistrict,Address,X,Y
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
Dates         878049 non-null datetime64[ns]
Category      878049 non-null object
Descript      878049 non-null object
DayOfWeek     878049 non-null object
PdDistrict    878049 non-null object
Resolution    878049 non-null object
Address       878049 non-null object
X             878049 non-null float64
Y             878049 non-null float64
dtypes: datetime64[ns](1), float64(2), object(6)
memory usage: 60.3+ MB


In [6]:
train.isnull().sum()

Dates         0
Category      0
Descript      0
DayOfWeek     0
PdDistrict    0
Resolution    0
Address       0
X             0
Y             0
dtype: int64

In [7]:
def feature_engineering(data):
    data['Date'] = pd.to_datetime(data['Dates'].dt.date)
    data['n_days'] = (data['Date'] - data['Date'].min()).apply(lambda x: x.days)
    data['Day'] = data['Dates'].dt.day
    data['DayOfWeek'] = data['Dates'].dt.weekday
    data['Month'] = data['Dates'].dt.month
    data['Year'] = data['Dates'].dt.year
    data['Hour'] = data['Dates'].dt.hour
    data['Minute'] = data['Dates'].dt.minute
    data['Block'] = data['Address'].str.contains('block', case=False)
    data.drop(columns=['Dates','Date','Address'], inplace=True)
    return data

In [8]:
train = feature_engineering(train)
test = feature_engineering(test)

In [9]:
train.head()

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,X,Y,n_days,Day,Month,Year,Hour,Minute,Block
0,WARRANTS,WARRANT ARREST,2,NORTHERN,"ARREST, BOOKED",-122.425892,37.774599,4510,13,5,2015,23,53,False
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,2,NORTHERN,"ARREST, BOOKED",-122.425892,37.774599,4510,13,5,2015,23,53,False
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,2,NORTHERN,"ARREST, BOOKED",-122.424363,37.800414,4510,13,5,2015,23,33,False
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,2,NORTHERN,NONE,-122.426995,37.800873,4510,13,5,2015,23,30,True
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,2,PARK,NONE,-122.438738,37.771541,4510,13,5,2015,23,30,True


In [10]:
train.shape

(878049, 14)

In [11]:
test.head()

Unnamed: 0_level_0,DayOfWeek,PdDistrict,X,Y,n_days,Day,Month,Year,Hour,Minute,Block
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,6,BAYVIEW,-122.399588,37.735051,4512,10,5,2015,23,59,True
1,6,BAYVIEW,-122.391523,37.732432,4512,10,5,2015,23,51,False
2,6,NORTHERN,-122.426002,37.792212,4512,10,5,2015,23,50,True
3,6,INGLESIDE,-122.437394,37.721412,4512,10,5,2015,23,45,True
4,6,INGLESIDE,-122.437394,37.721412,4512,10,5,2015,23,45,True


In [12]:
le1 = LabelEncoder()
train['PdDistrict'] = le1.fit_transform(train['PdDistrict'])
test['PdDistrict'] = le1.transform(test['PdDistrict'])

In [13]:
train.head()

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,X,Y,n_days,Day,Month,Year,Hour,Minute,Block
0,WARRANTS,WARRANT ARREST,2,4,"ARREST, BOOKED",-122.425892,37.774599,4510,13,5,2015,23,53,False
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,2,4,"ARREST, BOOKED",-122.425892,37.774599,4510,13,5,2015,23,53,False
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,2,4,"ARREST, BOOKED",-122.424363,37.800414,4510,13,5,2015,23,33,False
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,2,4,NONE,-122.426995,37.800873,4510,13,5,2015,23,30,True
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,2,5,NONE,-122.438738,37.771541,4510,13,5,2015,23,30,True


In [14]:
train['PdDistrict'].value_counts()

7    157182
3    119908
4    105296
0     89431
1     85460
9     81809
2     78845
8     65596
5     49313
6     45209
Name: PdDistrict, dtype: int64

In [15]:
test.head()

Unnamed: 0_level_0,DayOfWeek,PdDistrict,X,Y,n_days,Day,Month,Year,Hour,Minute,Block
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,6,0,-122.399588,37.735051,4512,10,5,2015,23,59,True
1,6,0,-122.391523,37.732432,4512,10,5,2015,23,51,False
2,6,4,-122.426002,37.792212,4512,10,5,2015,23,50,True
3,6,2,-122.437394,37.721412,4512,10,5,2015,23,45,True
4,6,2,-122.437394,37.721412,4512,10,5,2015,23,45,True


In [16]:
test['PdDistrict'].value_counts()

7    157456
3    120449
4    107017
0     89591
1     86130
9     81747
2     80084
8     66617
5     50199
6     44972
Name: PdDistrict, dtype: int64

In [17]:
train.drop(columns=['Descript','Resolution'], inplace=True)

In [18]:
train.head()

Unnamed: 0,Category,DayOfWeek,PdDistrict,X,Y,n_days,Day,Month,Year,Hour,Minute,Block
0,WARRANTS,2,4,-122.425892,37.774599,4510,13,5,2015,23,53,False
1,OTHER OFFENSES,2,4,-122.425892,37.774599,4510,13,5,2015,23,53,False
2,OTHER OFFENSES,2,4,-122.424363,37.800414,4510,13,5,2015,23,33,False
3,LARCENY/THEFT,2,4,-122.426995,37.800873,4510,13,5,2015,23,30,True
4,LARCENY/THEFT,2,5,-122.438738,37.771541,4510,13,5,2015,23,30,True


In [19]:
le2 = LabelEncoder()
X = train.drop(columns=['Category'])
y= le2.fit_transform(train['Category'])

In [20]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X, y)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [21]:
preds_xgb = xgb.predict(test)

In [22]:
submission = pd.DataFrame(preds_xgb, columns=le2.inverse_transform(np.linspace(0, 38, 39, dtype='int16')), index=test.index)
submission.to_csv('Xgboost.csv', index_label='Id')

ValueError: Shape of passed values is (884262, 1), indices imply (884262, 39)

In [23]:
#train_data = lgb.Dataset(X, label=y, categorical_feature=['PdDistrict', ])

In [24]:
"""params = {'boosting':'gbdt',
          'objective':'multiclass',
          'num_class':39,
          'max_delta_step':0.9,
          'min_data_in_leaf': 21,
          'learning_rate': 0.4,
          'max_bin': 465,
          'num_leaves': 41,
          'verbose' : 1
         }"""

"params = {'boosting':'gbdt',\n          'objective':'multiclass',\n          'num_class':39,\n          'max_delta_step':0.9,\n          'min_data_in_leaf': 21,\n          'learning_rate': 0.4,\n          'max_bin': 465,\n          'num_leaves': 41,\n          'verbose' : 1\n         }"

In [25]:
#bst = lgb.train(params, train_data, 120)

In [26]:
#predictions = bst.predict(test)

In [27]:
#submission = pd.DataFrame(predictions, columns=le2.inverse_transform(np.linspace(0, 38, 39, dtype='int16')), index=test.index)
#submission.to_csv('LGBM_final.csv', index_label='Id')