In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import lightgbm as lgb
import numpy as np

In [3]:
train = pd.read_csv('./data/train.csv', parse_dates=['Dates'])
test = pd.read_csv('./data/test.csv', parse_dates=['Dates'], index_col='Id')

## Wranling the dataset

In [4]:
train.drop_duplicates(inplace=True)
train.replace({'X': -120.5, 'Y': 90.0}, np.NaN, inplace=True)
test.replace({'X': -120.5, 'Y': 90.0}, np.NaN, inplace=True)

imp = SimpleImputer(strategy='mean')

for district in train['PdDistrict'].unique():
    train.loc[train['PdDistrict'] == district, ['X', 'Y']] = imp.fit_transform(
        train.loc[train['PdDistrict'] == district, ['X', 'Y']])
    test.loc[test['PdDistrict'] == district, ['X', 'Y']] = imp.transform(
        test.loc[test['PdDistrict'] == district, ['X', 'Y']])

## Feature Engineering

In [5]:
def feature_engineering(data):
    data['Date'] = pd.to_datetime(data['Dates'].dt.date)
    data['n_days'] = (
        data['Date'] - data['Date'].min()).apply(lambda x: x.days)
    data['Day'] = data['Dates'].dt.day
    data['DayOfWeek'] = data['Dates'].dt.weekday
    data['Month'] = data['Dates'].dt.month
    data['Year'] = data['Dates'].dt.year
    data['Hour'] = data['Dates'].dt.hour
    data['Minute'] = data['Dates'].dt.minute
    data['Block'] = data['Address'].str.contains('block', case=False)
    
    data.drop(columns=['Dates','Date','Address'], inplace=True)
        
    return data

In [6]:
train = feature_engineering(train)
train.drop(columns=['Descript','Resolution'], inplace=True)
test = feature_engineering(test)

## Encoding Categorical Variables

In [7]:
PdDistrict_le = LabelEncoder()
train['PdDistrict'] = PdDistrict_le.fit_transform(train['PdDistrict'])
test['PdDistrict'] = PdDistrict_le.transform(test['PdDistrict'])

Category_le = LabelEncoder()
y = Category_le.fit_transform(train.pop('Category'))

## Creating Base Model

### Forming the dataset

In [8]:
train_set = lgb.Dataset(
    train, label=y, categorical_feature=['PdDistrict'], free_raw_data=False)

### Setting the Parameters

In [9]:
params = {
    'objective': 'multiclass',
    'num_class': 39
}

### Cross Validation

In [12]:
cv_results = lgb.cv(params, train_set, metrics='multi_logloss', early_stopping_rounds=10)



In [13]:
print('Best Score: ', min(cv_results['multi_logloss-mean']))

Best Score:  2.471754918792097


In [14]:
# multi_logloss-mean의 최소값의 index
num_boost_round = np.argmin(cv_results['multi_logloss-mean'])
print('Best epoch: ', num_boost_round)

Best epoch:  15


### Training

In [17]:
bst = lgb.train(params, train_set, num_boost_round=num_boost_round)

### Predicting

In [18]:
predictions = bst.predict(test)

### Submitting

In [20]:
submission = pd.DataFrame(predictions,
                         columns=Category_le.classes_,
                         index=test.index)
submission.to_csv('LGM - base model.csv', index_label='Id')