### Imports

In [1]:
# pandas and numpy
import pandas as pd
import pandas_profiling
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# other
import string
import math
import missingno as msno

# data viz
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
# from autoviz.AutoViz_Class import AutoViz_Class
# %matplotlib inline
import plotly.express as px
%matplotlib inline

# sklearn - other
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer

# sklearn - ML models
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# lightgbm
import lightgbm as lgbm

# xgboost
import xgboost as xgb

# catboost
import catboost as cb

### Data load

In [2]:
# Read the four raw CSV tables from the working directory in one pass.
members, songs, test, train = map(
    pd.read_csv,
    ("./members.csv", "./songs.csv", "./test.csv", "./train.csv"),
)

### Summarize

In [41]:
# Render the pandas-profiling report for the members table as this cell's output.
members.profile_report()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [42]:
# Render the pandas-profiling report for the train table as this cell's output.
train.profile_report()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



### Handle missing values

In [5]:
# Discard the "bd" and "gender" columns from members instead of imputing them.
members = members.drop(columns=["bd", "gender"])

In [6]:
# Discard the "composer" and "lyricist" columns from songs instead of imputing them.
songs = songs.drop(columns=["composer", "lyricist"])

In [7]:
# Despite its name, this imputer is fitted on *every* songs column: each NaN is
# replaced by the most frequent value of its column.
genre_impute = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
# fit_transform returns a plain array, so rebuild the frame with the original column labels.
songs = pd.DataFrame(
    data=genre_impute.fit_transform(songs),
    columns=songs.columns,
)

In [8]:
# Remove "source_screen_name" from both splits so they keep the same feature set.
train = train.drop(columns="source_screen_name")
test = test.drop(columns="source_screen_name")

In [9]:
# Separate the label from the training features, and strip the "id" column
# (re-attached when the submission is built) from the test features.
y_train = train["target"]
X_train = train.drop(columns="target")
X_test = test.drop(columns="id")

In [10]:
# Learn the per-column mode on the training features only, then apply the same
# fill values to the test features.
train_impute = SimpleImputer(strategy="most_frequent", missing_values=np.nan)
X_train = pd.DataFrame(
    data=train_impute.fit_transform(X_train),
    columns=X_train.columns,
)
X_test = pd.DataFrame(
    data=train_impute.transform(X_test),
    columns=X_test.columns,
)

### Merge data

In [11]:
# Left-join member attributes (key: msno) and song attributes (key: song_id)
# onto the training rows, then preview the result.
X_train = (
    X_train
    .merge(members, on='msno', how='left')
    .merge(songs, on='song_id', how='left')
)
X_train.head()

Unnamed: 0,msno,song_id,source_system_tab,source_type,city,registered_via,registration_init_time,expiration_date,song_length,genre_ids,artist_name,language
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,online-playlist,1,7,20120102,20171005,206471,359,Bastille,52.0
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,local-playlist,13,9,20110525,20170911,284584,1259,Various Artists,52.0
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,local-playlist,13,9,20110525,20170911,225396,1259,Nas,52.0
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,local-playlist,13,9,20110525,20170911,255512,1019,Soundway,-1.0
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,online-playlist,1,7,20120102,20171005,187802,1011,Brett Young,52.0


In [12]:
# Apply the same two left joins (members on msno, songs on song_id) to the test rows.
X_test = (
    X_test
    .merge(members, on='msno', how='left')
    .merge(songs, on='song_id', how='left')
)

### Feature extraction

In [13]:
def _add_date_parts(frame, source_col, prefix):
    """Parse a YYYYMMDD column and append year/month/day feature columns.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame to modify in place.
    source_col : str
        Name of the column holding dates in ``%Y%m%d`` form.
    prefix : str
        Prefix of the new columns: ``<prefix>_year``, ``<prefix>_month``
        and ``<prefix>_day``.

    Returns
    -------
    pd.DataFrame
        The same ``frame`` object, with ``source_col`` converted to datetime
        and the three new columns appended in year, month, day order.
    """
    # errors='coerce' replaces the deprecated errors='ignore'. With 'ignore', a
    # single unparseable value made to_datetime hand back the raw column and
    # the .dt accessors below failed with an unrelated AttributeError; with
    # 'coerce' such values become NaT and yield NaN date parts instead.
    parsed = pd.to_datetime(frame[source_col], format='%Y%m%d', errors='coerce')
    frame[source_col] = parsed
    frame[f"{prefix}_year"] = parsed.dt.year
    frame[f"{prefix}_month"] = parsed.dt.month
    frame[f"{prefix}_day"] = parsed.dt.day
    return frame


# Registration parts are added before expiration parts so the resulting column
# order is the same for both frames.
for _date_frame in (X_train, X_test):
    _add_date_parts(_date_frame, "registration_init_time", "registration")
    _add_date_parts(_date_frame, "expiration_date", "expiration")

In [14]:
# The raw date columns are removed now that their year/month/day parts exist.
X_train = X_train.drop(columns=["registration_init_time", "expiration_date"])
X_test = X_test.drop(columns=["registration_init_time", "expiration_date"])

### Encoding

In [15]:
# Show the number of distinct values per training column.
X_train.nunique()

msno                   30755
song_id               359966
source_system_tab          8
source_type               12
city                      21
registered_via             5
song_length            60266
genre_ids                572
artist_name            40582
language                  10
registration_year         14
registration_month        12
registration_day          31
expiration_year           18
expiration_month          12
expiration_day            31
dtype: int64

In [16]:
# A one-hot subset (columns with fewer than 13 distinct values) was tried and is
# currently disabled; every column with more than one distinct value goes into
# la_features instead.
la_features = X_train.columns[X_train.nunique().gt(1)]


In [17]:
# One-hot encoder instance; it is not applied anywhere in the active pipeline
# and is kept only so that any cell still referencing `oh_enc` keeps working.
oh_enc = OneHotEncoder(handle_unknown='ignore')

# Ordinal-encode every column listed in la_features. The encoder is fitted on
# the training frame only; categories that appear only in the test frame are
# mapped to NaN (unknown_value) rather than raising.
or_enc = OrdinalEncoder(unknown_value = np.nan, handle_unknown='use_encoded_value')
X_train[la_features] = or_enc.fit_transform(X_train[la_features])
X_test[la_features] = or_enc.transform(X_test[la_features])

### Modeling

LGBM

In [19]:
# Hyper-parameter grid: 3 learning rates x 2 ensemble sizes x 1 depth = 6 candidates.
params = dict(
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200],
    max_depth=[5],
)

# 3-fold grid search scored by ROC AUC; refit=True retrains the best candidate
# on the full training data once the search finishes.
model = lgbm.LGBMClassifier()
grid_lgbm = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='roc_auc',
    cv=3,
    refit=True,
    verbose=True,
)
grid_lgbm.fit(X_train, y_train)

print('best params : ', grid_lgbm.best_params_)
print('best score : ', grid_lgbm.best_score_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
best params :  {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
best score :  0.6734291394636641


XGBoost

In [36]:
# Hyper-parameter grid: 3 learning rates x 2 ensemble sizes x 1 depth = 6 candidates.
params = dict(
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200],
    max_depth=[5],
)

# 3-fold grid search scored by ROC AUC; refit=True retrains the best candidate
# on the full training data once the search finishes.
model = xgb.XGBClassifier()
grid_xgb = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='roc_auc',
    cv=3,
    refit=True,
    verbose=True,
)
grid_xgb.fit(X_train, y_train)

print('best params : ', grid_xgb.best_params_)
print('best score : ', grid_xgb.best_score_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
best params :  {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
best score :  0.6734809421171656


In [37]:
# Class probabilities for the test rows. The search was built with refit=True,
# so calling predict_proba on it delegates to the refitted best estimator.
y_test = grid_xgb.predict_proba(X_test)

In [38]:
# Keep only the second probability column (index 1) and name it "target".
# Note: this rebinds y_test from a 2-D array to a Series, so the cell cannot be
# re-run without first re-running the prediction cell.
y_test = pd.Series(data=y_test[:, 1]).rename("target")

### Submission

In [39]:
# Put the test ids and the predicted probabilities side by side (aligned on index).
submission = pd.concat([test["id"], y_test], axis="columns")

In [40]:
# Write the submission file without the index column.
submission.to_csv(path_or_buf="./submission.csv", index=False)