### Imports

In [None]:
import numpy as np

import pandas as pd

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.feature_selection import mutual_info_regression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.ensemble import RandomForestClassifier

from category_encoders import TargetEncoder

from matplotlib import pyplot as plt 
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and the
# old names were removed in later releases, so use whichever name this
# installation actually provides.
_whitegrid_style = (
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
plt.style.use(_whitegrid_style)
# Global figure / axes defaults applied to every plot created afterwards.
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

import seaborn as sns

import missingno as mno

import xgboost as xgb

import lightgbm as lgbm

import catboost as catb

### Data Load

In [None]:
# Read the train and test CSVs from the working directory.
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

# Target is the 'Category' column; the features are everything else.
# The test features are the test file without its 'Id' column.
X_train = train.drop(columns=['Category'])
y_train = train['Category']
X_test = test.drop(columns=['Id'])

### Summarize

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Column dtypes and non-null counts of the training features.
X_train.info()

In [None]:
# Summary statistics of the training features.
X_train.describe()

In [None]:
# Number of distinct values in each feature column.
X_train.nunique()

In [None]:
# Number of distinct target classes.
y_train.nunique()

In [None]:
# Missing-value count per feature column.
X_train.isna().sum()

### Date Parsing

In [None]:
# Convert the 'Dates' column to datetime in both feature frames so the `.dt`
# accessor can be used on it.
for frame in (X_train, X_test):
    frame['Dates'] = pd.to_datetime(frame['Dates'])

In [None]:
def parseDate(df):
    """Replace the 'Dates' column with 'Year', 'Month' and 'Hour' columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed 'Dates' column (the `.dt` accessor is
        used on it).

    Returns
    -------
    pandas.DataFrame
        A new frame with the three extracted columns added and 'Dates'
        removed. The frame passed in is left unmodified.
    """
    dates = df['Dates']
    # assign() builds a new frame instead of writing the columns into the
    # caller's frame, so the function has no side effect on its argument.
    return df.assign(
        Year=dates.dt.year,
        Month=dates.dt.month,
        Hour=dates.dt.hour,
    ).drop(columns='Dates')

# Swap the timestamp for its Year / Month / Hour parts in both feature frames.
X_train, X_test = map(parseDate, (X_train, X_test))

X_train.head()

### Separate 'Descript' and drop 'Resolution'

In [None]:
# Keep 'Descript' aside as its own series, then remove it together with
# 'Resolution' from the training features.
train_only_columns = ['Descript', 'Resolution']
D_train = X_train['Descript']
X_train = X_train.drop(columns=train_only_columns)

### Encoding

In [None]:
la_enc = LabelEncoder()
# Encode from the untouched `train` frame rather than from `y_train` itself.
# Feeding `y_train` back into the encoder made the cell non-idempotent: a second
# run would re-fit `la_enc` on already-encoded integers, and the class names
# needed later to label the submission columns would be lost.
y_train = pd.DataFrame(la_enc.fit_transform(train['Category']))
y_train

### Scaling

In [None]:
scaler = StandardScaler()
scaling_features = ['X', 'Y']

# Fit the scaler on the training data only and reuse it for the test data.
# The scaled arrays are wrapped with the index of the frame they came from:
# the assignments below align on index labels, so a default 0..n-1 index would
# silently misplace values (or introduce NaN) whenever the frame's index is not
# a plain 0..n-1 range, e.g. after rows were filtered out.
scaled_X_train = pd.DataFrame(
    scaler.fit_transform(X_train[scaling_features]),
    columns=scaling_features,
    index=X_train.index,
)
scaled_X_test = pd.DataFrame(
    scaler.transform(X_test[scaling_features]),
    columns=scaling_features,
    index=X_test.index,
)

X_train[scaling_features] = scaled_X_train
X_test[scaling_features] = scaled_X_test

X_train.describe()

### Type decision

In [None]:
# Every integer-typed or string (object) column is treated as categorical.
# np.integer is used instead of 'int64' because the integer width returned by
# the `.dt` accessor depends on the pandas version (int32 on pandas >= 2.0),
# which would otherwise leave the date parts out of the selection.
cat_features = X_train.select_dtypes([np.integer, 'object']).columns

# Train and test must share one category list per column. Converting each frame
# separately with astype('category') yields different value -> code mappings,
# and models that consume the pandas category codes would then read the test
# values under the wrong codes.
for col in cat_features:
    observed = pd.concat([X_train[col], X_test[col]], ignore_index=True).dropna()
    shared_dtype = pd.CategoricalDtype(categories=sorted(observed.unique()))
    X_train[col] = X_train[col].astype(shared_dtype)
    X_test[col] = X_test[col].astype(shared_dtype)

X_train.info()

In [None]:
# Dtype / non-null summary of the test features after the category conversion.
X_test.info()

### Modeling and Scoring

CatBoost

In [None]:
# Search grid: three learning rates x two iteration counts at a fixed depth.
# NOTE(review): 1.0 is far above the other learning rates - possibly 0.1 was
# intended; confirm before re-running.
params = dict(
    learning_rate=[0.04, 0.07, 1.0],
    iterations=[100, 200],
    depth=[7],
)

model = catb.CatBoostClassifier(
    cat_features=cat_features.to_list(),
    task_type="GPU",
    devices='00000000:01:00.0',
)

# 3-fold grid search scored by negative log loss; the best setting is refit on
# the full training data.
grid_catb = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='neg_log_loss',
    cv=3,
    refit=True,
)
grid_catb.fit(X_train, y_train)

print('best params : ', grid_catb.best_params_)
print('best score : ', grid_catb.best_score_)


XGBoost Classifier

In [None]:
# Search grid: two learning rates x two tree counts at a fixed depth.
params = dict(
    learning_rate=[0.04, 0.07],
    n_estimators=[100, 300],
    max_depth=[5],
)
model = xgb.XGBClassifier(
    enable_categorical=True,
    tree_method='gpu_hist',
    gpu_id=0,
)

# 3-fold grid search scored by negative log loss; the best setting is refit on
# the full training data.
grid_xgb = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='neg_log_loss',
    cv=3,
    refit=True,
)
grid_xgb.fit(X_train, y_train)

print('best params : ', grid_xgb.best_params_)
print('best score : ', grid_xgb.best_score_)

Use 'Descript' ('D'): predict Category from it, then predict it for the test set

In [None]:
la_enc2 = LabelEncoder()
# Encode from the untouched `train` frame rather than from `D_train` itself, so
# re-running this cell does not re-fit the encoder on already-encoded integers.
D_train = pd.DataFrame(la_enc2.fit_transform(train['Descript']), columns = ['Descript'])
D_train.head()

In [None]:
# Single-point "grid": one configuration, cross-validated like the others.
params = dict(
    learning_rate=[0.07],
    iterations=[100],
    depth=[7],
)

model = catb.CatBoostClassifier(
    cat_features=['Descript'],
    task_type="GPU",
    devices='00000000:01:00.0',
)

# The encoded description is the only input feature here.
grid_catb = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='neg_log_loss',
    cv=3,
    refit=True,
)
grid_catb.fit(D_train, y_train)

print('best params : ', grid_catb.best_params_)
print('best score : ', grid_catb.best_score_)

In [None]:
# Learn the encoded description from the regular features, then predict it for
# the test rows.
descript_model_settings = dict(
    cat_features=cat_features.to_list(),
    task_type="GPU",
    devices='00000000:01:00.0',
    learning_rate=0.07,
    iterations=100,
    depth=4,
)
model = catb.CatBoostClassifier(**descript_model_settings)
model.fit(X_train, D_train)
D_test = model.predict(X_test, prediction_type='Class')
D_test

### Submission

CatBoost

In [None]:
# Fit CatBoost with fixed settings on the full training data and predict
# per-class probabilities for the test rows.
catboost_settings = dict(
    cat_features=cat_features.to_list(),
    task_type="GPU",
    devices='00000000:01:00.0',
    learning_rate=0.07,
    iterations=100,
    depth=7,
)
model = catb.CatBoostClassifier(**catboost_settings)
model.fit(X_train, y_train)
y_test = model.predict_proba(X_test)
y_test.shape

XGBoost

In [None]:
# Fit XGBoost with fixed settings on the full training data and predict
# per-class probabilities for the test rows. This reassigns `y_test`.
xgboost_settings = dict(
    enable_categorical=True,
    tree_method='gpu_hist',
    gpu_id=0,
    learning_rate=0.07,
    n_estimators=100,
    max_depth=5,
)
model = xgb.XGBClassifier(**xgboost_settings)
model.fit(X_train, y_train)
y_test = model.predict_proba(X_test)
y_test.shape

Inverse transform

In [None]:
# Column headers for the probability table: the original category names in the
# encoder's class order. Reading `classes_` gives the same result as
# inverse-transforming 0..n-1, without hard-coding the number of classes.
labels = la_enc.classes_
y_test = pd.DataFrame(y_test, columns = labels)
# Put the test ids in front of the probability columns.
y_test = pd.concat([test['Id'], y_test], axis = 1)
y_test.head()

Make submission

In [None]:

# Write the Id + per-class probability table to disk without the DataFrame index.
y_test.to_csv("./submission.csv", index = False)
