In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import math
import os
import gc
import sys
import pickle
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, plot_confusion_matrix, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, CatBoostRegressor, Pool, cv, sum_models
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from six.moves import xrange
from sklearn import preprocessing
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
def reduce_mem_usage(df):
    """
    Downcast each column of ``df`` to the narrowest dtype that can hold its
    observed value range, printing the memory footprint before and after.

    The frame is modified in place and the same object is returned.

    * ``object`` columns are converted to ``category``.
    * NumPy integer columns (signed or unsigned) become the narrowest of
      int8 / int16 / int32 / int64 whose range contains [min, max], with the
      bounds treated as inclusive. A column that fits none is left as is.
    * NumPy float columns become float16 or float32 when [min, max] fits,
      otherwise float64. Only the range is checked, so narrowing a float
      column can round its values.
    * Every other dtype (bool, datetime, existing category, extension
      dtypes, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after the in-place conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(('Memory usage of dataframe is {:.2f}'
           'MB').format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are downcast. Dispatching
        # on dtype.kind keeps bool ('b'), datetime ('M'), etc. out of the
        # float branch, and skips pandas extension dtypes altogether.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            if pd.isna(c_min) or pd.isna(c_max):
                # Empty column: nothing to measure, keep the dtype.
                continue
            lo, hi = int(c_min), int(c_max)
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose max is exactly 127 still
                # fits in int8.
                if limits.min <= lo and hi <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                # NaN / inf extremes fail both comparisons and fall through
                # to float64.
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print(('Memory usage after optimization is: {:.2f}'
           'MB').format(end_mem))
    # Guard the percentage against a zero-sized starting footprint.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [5]:
df_train = pd.read_csv("alfa1_df_train6.csv")
df_valid = pd.read_csv("alfa1_df_valid6.csv")

In [6]:
df_train.fillna('nothing', inplace=True)
df_valid.fillna('nothing', inplace=True)

In [7]:
df_train = reduce_mem_usage(df_train)
df_valid = reduce_mem_usage(df_valid)

Memory usage of dataframe is 1775.78MB
Memory usage after optimization is: 337.39MB
Decreased by 81.0%
Memory usage of dataframe is 49.16MB
Memory usage after optimization is: 12.29MB
Decreased by 75.0%


In [8]:
df_train_exp = pd.read_csv("alfa1_df_train10.csv")
df_valid_exp = pd.read_csv("alfa1_df_valid10.csv")

In [9]:
df_train_exp = reduce_mem_usage(df_train_exp)
df_valid_exp = reduce_mem_usage(df_valid_exp)

Memory usage of dataframe is 1369.27MB
Memory usage after optimization is: 345.40MB
Decreased by 74.8%
Memory usage of dataframe is 37.91MB
Memory usage after optimization is: 12.50MB
Decreased by 67.0%


In [10]:
df_train_exp1 = pd.read_csv("alfa1_df_train11.csv")
df_valid_exp1 = pd.read_csv("alfa1_df_valid11.csv")

In [11]:
df_train_exp1 = reduce_mem_usage(df_train_exp1)
df_valid_exp1 = reduce_mem_usage(df_valid_exp1)

Memory usage of dataframe is 1369.27MB
Memory usage after optimization is: 345.40MB
Decreased by 74.8%
Memory usage of dataframe is 37.91MB
Memory usage after optimization is: 12.50MB
Decreased by 67.0%


In [12]:
df_train_exp2 = pd.read_csv("alfa1_df_train12.csv")
df_valid_exp2 = pd.read_csv("alfa1_df_valid12.csv")

In [13]:
df_train_exp2 = reduce_mem_usage(df_train_exp2)
df_valid_exp2 = reduce_mem_usage(df_valid_exp2)

Memory usage of dataframe is 1369.27MB
Memory usage after optimization is: 345.40MB
Decreased by 74.8%
Memory usage of dataframe is 37.91MB
Memory usage after optimization is: 12.50MB
Decreased by 67.0%


In [14]:
aug = df_train_exp.drop(['client_pin', 'lag_1', 'lag_2', 'weight'], axis=1).columns
aug1 = df_train_exp1.drop(['client_pin', 'lag_1', 'lag_2', 'weight'], axis=1).columns
aug2 = df_train_exp2.drop(['client_pin', 'lag_1', 'lag_2', 'weight'], axis=1).columns

In [15]:
df_train[aug] = df_train_exp[aug]
df_valid[aug] = df_valid_exp[aug]
df_train[aug1] = df_train_exp1[aug1]
df_valid[aug1] = df_valid_exp1[aug1]
df_train[aug2] = df_train_exp2[aug2]
df_valid[aug2] = df_valid_exp2[aug2]

In [16]:
# Column names grouped under "from_parq"; they are added to the `categorical`
# list a few cells below.
from_parq = [
    'application_id',
    'event_type',
    'event_category',
    'event_name',
    'device_screen_name',
    'timezone',
    'net_connection_type',
    'net_connection_tech',
]

In [17]:
# Names of the lag columns 'lag_2' ... 'lag_35' ('lag_1' is used as the target).
trn_input_lag_cols = [f'lag_{i}' for i in range(2, 36)]

In [18]:
# Columns removed from the feature matrix: the target, the client id and the
# two weighting columns.
to_drop = ['lag_1', 'client_pin', 'weight', 'class_weight']

In [19]:
# Lag columns, then the from_parq columns, then 'most_popular'.
# NOTE(review): this list is not passed to LightGBM anywhere in the cells
# shown here - confirm whether it is still needed.
categorical = [*trn_input_lag_cols, *from_parq, 'most_popular']

In [20]:
# One row per distinct training target value, with the square root of its
# 'balanced' class weight (the square root softens the re-weighting).
target_classes = df_train['lag_1'].unique()
balanced_weights = compute_class_weight(
    classes=target_classes,
    y=df_train['lag_1'],
    class_weight='balanced',
)
df_weight = pd.DataFrame({
    'lag_1': target_classes,
    'class_weight': balanced_weights ** 0.5,
})

In [21]:
# Attach the per-class weight to every row via a left join on the target.
# A validation target value that never occurs in df_train finds no match in
# df_weight and ends up with a NaN class_weight.
df_train = pd.merge(df_train, df_weight, on='lag_1', how='left')
df_valid = pd.merge(df_valid, df_weight, on='lag_1', how='left')

In [22]:
# Per-row sample weight = weight squared times class_weight.
weights = df_train['weight'].to_numpy() ** 2 * df_train['class_weight'].to_numpy()
weights_val = df_valid['weight'].to_numpy() ** 2 * df_valid['class_weight'].to_numpy()

In [23]:
# Fit the target encoder on the labels of BOTH frames. Fitting on df_valid
# alone makes le1.transform(df_train['lag_1']) raise as soon as a class
# occurs only in the training data. LabelEncoder sorts its classes, so when
# both frames hold the same label set the resulting codes are unchanged.
le1 = preprocessing.LabelEncoder()
le1.fit(pd.concat([df_train['lag_1'], df_valid['lag_1']]).unique())

LabelEncoder()

In [24]:
# Replace the raw target values with the integer codes from le1.
# This overwrites 'lag_1', so the cell is meant to be executed once.
for labelled_frame in (df_train, df_valid):
    labelled_frame['lag_1'] = le1.transform(labelled_frame['lag_1'])

In [25]:
# Stack validation rows first, then training rows, into one modelling table.
# The sample weights are concatenated in the same order so they stay aligned.
X = pd.concat(
    (df_valid.drop(columns=to_drop), df_train.drop(columns=to_drop))
).reset_index(drop=True)
y = pd.concat((df_valid['lag_1'], df_train['lag_1'])).reset_index(drop=True)
weights = np.concatenate([weights_val, weights])

In [26]:
# Cast these two columns to category on the stacked frame.
for cat_col in ('event_type', 'net_connection_tech'):
    X[cat_col] = X[cat_col].astype('category')

In [27]:
n_splits = 8
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=777)

In [28]:
trn_idx, val_idx = list(folds.split(X, y))[6]

In [29]:
X, y = X.iloc[trn_idx],  y.iloc[trn_idx]
weights = weights[trn_idx, ]

In [30]:
lgb_train = lgb.Dataset(X, y, weight=weights)
lgb_eval = lgb.Dataset(df_valid.drop(to_drop, axis=1), df_valid['lag_1'], reference=lgb_train, weight=weights_val)

In [31]:
def evalerror(preds, dtrain):
    """Custom LightGBM evaluation metric: macro-averaged F1 for multiclass.

    Parameters
    ----------
    preds : array-like
        Class scores supplied by LightGBM. Accepted layouts:
        a 2-D array of shape (n_samples, n_classes), or a flat 1-D array in
        class-major order (all samples' scores for class 0, then class 1, ...).
        Which layout is delivered depends on the installed LightGBM release.
    dtrain : lightgbm.Dataset
        Dataset being evaluated; only ``get_label()`` is used.

    Returns
    -------
    tuple
        ``('f1_score', value, True)`` - metric name, macro F1, and the flag
        telling LightGBM that higher is better.
    """
    labels = dtrain.get_label()
    scores = np.asarray(preds)
    if scores.ndim == 1:
        # Derive the class count from the sizes rather than hard-coding it,
        # then turn the class-major vector into (n_samples, n_classes).
        n_classes = scores.size // len(labels)
        scores = scores.reshape(n_classes, -1).T
    predicted = scores.argmax(axis=1)
    f_score = f1_score(labels, predicted, average='macro')
    return 'f1_score', f_score, True

In [33]:
# LightGBM training parameters. 'metric' is 'custom' because the score is
# supplied through a feval callable when training.
# NOTE(review): boosting is 'dart' - confirm how the installed LightGBM
# treats early stopping in that mode.
tree_params = dict(
    objective="multiclass",
    metric='custom',
    num_class=10,
    learning_rate=0.12,
    max_depth=5,
    n_jobs=5,
    num_leaves=24,
    boosting='dart',
    bagging_fraction=0.9,  # subsample
    feature_fraction=0.9,  # colsample_bytree
    bagging_freq=5,        # subsample_freq
    bagging_seed=2020,
    n_estimators=1000,
)

In [34]:
# Train the booster, scoring lgb_eval with the custom macro-F1 metric after
# each round and requesting early stopping with a patience of 250 rounds.
model = lgb.train(
    params=tree_params,
    train_set=lgb_train,
    valid_sets=[lgb_eval],
    feval=evalerror,
    early_stopping_rounds=250,
)



[1]	valid_0's f1_score: 0.0603859




[2]	valid_0's f1_score: 0.141858
[3]	valid_0's f1_score: 0.22864
[4]	valid_0's f1_score: 0.313181
[5]	valid_0's f1_score: 0.356835
[6]	valid_0's f1_score: 0.379498
[7]	valid_0's f1_score: 0.391929
[8]	valid_0's f1_score: 0.382797
[9]	valid_0's f1_score: 0.395373
[10]	valid_0's f1_score: 0.401253
[11]	valid_0's f1_score: 0.404843
[12]	valid_0's f1_score: 0.40323
[13]	valid_0's f1_score: 0.406315
[14]	valid_0's f1_score: 0.408438
[15]	valid_0's f1_score: 0.409548
[16]	valid_0's f1_score: 0.411538
[17]	valid_0's f1_score: 0.41223
[18]	valid_0's f1_score: 0.412927
[19]	valid_0's f1_score: 0.41386
[20]	valid_0's f1_score: 0.414282
[21]	valid_0's f1_score: 0.41429
[22]	valid_0's f1_score: 0.414651
[23]	valid_0's f1_score: 0.415589
[24]	valid_0's f1_score: 0.416699
[25]	valid_0's f1_score: 0.417044
[26]	valid_0's f1_score: 0.417425
[27]	valid_0's f1_score: 0.417709
[28]	valid_0's f1_score: 0.418015
[29]	valid_0's f1_score: 0.418317
[30]	valid_0's f1_score: 0.418843
[31]	valid_0's f1_score: 0.

In [35]:
# Persist the trained booster as a pickle in the working directory.
# Only unpickle this file from a trusted location: loading a pickle executes
# arbitrary code.
with open('lgb_model8_fold6.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(model))