In [2]:
import numpy as np 
import pandas as pd
import datetime as dt
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
import category_encoders as ce
import lightgbm as lgb
import shap
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold


%config InlineBackend.figure_format = 'retina'

### Task 1: GBDT

In [3]:
train_types = {'msno' : 'category',
               'source_system_tab' : 'category',
               'source_screen_name' : 'category',
               'source_type' : 'category',
               'target' : np.uint8,
               'song_id' : 'category'}

test_types = {'msno' : 'category',
              'source_system_tab' : 'category',
              'source_screen_name' : 'category',
              'source_type' : 'category',
              'song_id' : 'category'}

songs_types = {'genre_ids': 'category',
               'language' : 'category',
               'artist_name' : 'category',
               'composer' : 'category',
               'lyricist' : 'category',
               'song_id' : 'category'}

members_types = {'city' : 'category',
                 'bd' : np.uint8,
                 'gender' : 'category',
                 'registered_via' : 'category'}

train = pd.read_csv('train.csv', dtype=train_types)
test = pd.read_csv('test.csv', dtype=test_types)
songs = pd.read_csv('songs.csv', dtype=songs_types)
members = pd.read_csv('members.csv',dtype=members_types, parse_dates=['registration_init_time','expiration_date'])
songs_extra = pd.read_csv('song_extra_info.csv')

In [4]:
def print_nan(df):
    for col in df.columns:
        print(f"{100*df[col].isna().sum()/df[col].shape[0]} % Nan of {col}")
        
def year_month_day(col):
    return col.dt.year, col.dt.month, col.dt.day

def isrc_to_year(isrc):
    if type(isrc) == str:
        year = int(isrc[5:7])
        if year > 17:
            return 1900 + year
        else:
            return 2000 + year
    else:
        return np.nan
    
def merge(train, test, df, name):
    train = train.merge(df, on=name, how='left')
    test = test.merge(df, on=name, how='left')
    return train, test

In [6]:
members['gender'] = members['gender'].astype('str').replace('nan', 'unknown').astype('category')
members['registration_year'], members['registration_month'], members['registration_day']  = year_month_day(members.registration_init_time)
members['expiration_year'], members['expiration_month'], members['expiration_day'] = year_month_day(members.expiration_date)

members = members.drop(['registration_init_time', 'expiration_date'], axis=1)
train, test = merge(train, test, members, "msno")
del members

songs = songs.drop(['composer', 'lyricist'], axis=1)
train, test = merge(train, test, songs, "song_id")
del songs 

songs_extra['country'] = songs_extra['isrc'].apply(lambda x: x[:2] if type(x) == str else np.nan)
songs_extra['song_year'] = songs_extra['isrc'].apply(isrc_to_year)
songs_extra.drop(['isrc', 'name'], axis = 1, inplace = True)
train, test = merge(train, test, songs_extra, "song_id")
del songs_extra

for col in train.columns:
    if train[col].dtype == object:
        train[col] = train[col].astype('category')
        test[col] = test[col].astype('category')

In [29]:
def training_lgb(data):
    X_train, y_train, X_test, y_test = data 
    lgbtrain = lgb.Dataset(X_train, y_train)
    lgbval = lgb.Dataset(X_test, y_test)


    params = {'learning_rate': 0.25,
              'metric': 'auc',
              'max_depth': 10, 
              'num_leaves': 256,
              'max_depth': 10}

    gbm = lgb.train(params, 
                      train_set=lgbtrain, 
                      num_boost_round=100, 
                      
                      verbose_eval=25,
                      valid_sets=[lgbtrain, lgbval],)
    y_pred = gbm.predict(X_test)
    auc = roc_auc_score(y_test, y_pred)
    return gbm, auc

In [8]:
def k_fold_training(X, y, k=5):
    stats = []
    kf = StratifiedKFold(n_splits=k)
    for i, (train_ind, test_ind) in enumerate(kf.split(X, y)):
        print(f"Folder {i+1}:")
    
        X_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
        X_test, y_test = X.iloc[test_ind], y.iloc[test_ind]

        model, score = training_lgb((X_train, y_train, X_test, y_test))
        stats.append((score, model, X_test))
    stats.sort()
    best_model = stats[-1][1]
    print(f'Mean AUC_ROC: {np.mean([sc[0] for sc in stats]):.3f} with {k} folds')
    return (best_model, X_test)

In [19]:
def split_training(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    model, score = training_lgb((X_train, y_train, X_test, y_test))
    return model

def feture_imps(model):
    df_feat = pd.DataFrame(data=model.feature_importance(), index=model.feature_name(), columns=['importance'])
    df_feat = df_feat.sort_values(by='importance', ascending=False)
    df_feat['feat'] = df_feat.index
    return df_feat

In [15]:
X = train.drop(['target', 'song_id'], axis=1)
y = train['target']

In [16]:
model = split_training(X, y)

[25]	training's auc: 0.737243	valid_1's auc: 0.730979
[50]	training's auc: 0.74853	valid_1's auc: 0.741253
[75]	training's auc: 0.756217	valid_1's auc: 0.747855
[100]	training's auc: 0.763081	valid_1's auc: 0.753637


In [20]:
feture_imps(model)

Unnamed: 0,importance,feat
song_length,4524,song_length
song_year,3807,song_year
artist_name,3057,artist_name
msno,2848,msno
registration_day,1836,registration_day
expiration_day,1680,expiration_day
registration_month,1358,registration_month
bd,1277,bd
registration_year,1205,registration_year
source_screen_name,851,source_screen_name


In [27]:
X = X.drop(['city', 'registered_via'], axis=1)

In [30]:
best_model, X_test = k_fold_training(X, y)

Folder 1:
[25]	training's auc: 0.738652	valid_1's auc: 0.753389
[50]	training's auc: 0.75892	valid_1's auc: 0.758812
[75]	training's auc: 0.770648	valid_1's auc: 0.76044
[100]	training's auc: 0.780685	valid_1's auc: 0.760715
Folder 2:
[25]	training's auc: 0.74974	valid_1's auc: 0.713164
[50]	training's auc: 0.767251	valid_1's auc: 0.716292
[75]	training's auc: 0.779089	valid_1's auc: 0.71661
[100]	training's auc: 0.78884	valid_1's auc: 0.715826
Folder 3:
[25]	training's auc: 0.754279	valid_1's auc: 0.675861
[50]	training's auc: 0.773657	valid_1's auc: 0.677885
[75]	training's auc: 0.785102	valid_1's auc: 0.676911
[100]	training's auc: 0.794754	valid_1's auc: 0.674824
Folder 4:
[25]	training's auc: 0.760424	valid_1's auc: 0.652093
[50]	training's auc: 0.777514	valid_1's auc: 0.655995
[75]	training's auc: 0.789014	valid_1's auc: 0.65692
[100]	training's auc: 0.798019	valid_1's auc: 0.656957
Folder 5:
[25]	training's auc: 0.766556	valid_1's auc: 0.635349
[50]	training's auc: 0.782952	vali