In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.metrics import (roc_auc_score, make_scorer,
                             mean_squared_log_error, mean_squared_error,
                             accuracy_score, f1_score, precision_score, 
                             recall_score, precision_recall_curve)
from lightgbm import LGBMClassifier
from sklearn.base import clone
from xgboost import XGBClassifier
from scipy import stats
from sklearn.model_selection import (StratifiedKFold, RandomizedSearchCV,
                                    GridSearchCV, KFold, train_test_split,
                                    GroupKFold, StratifiedShuffleSplit, cross_val_score)
from itertools import permutations, combinations

from catboost import CatBoostClassifier, Pool
from sklearn.feature_selection import SelectFromModel

In [3]:
# Allow wide/long frames to render fully when displayed.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [4]:
# Load the train/test splits and the sample submission from the competition input folder.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/datahack-ml-starter-pack/train_HK6lq50.csv',
        '/kaggle/input/datahack-ml-starter-pack/test_wF0Ps6O.csv',
        '/kaggle/input/datahack-ml-starter-pack/sample_submission_vaSxamm.csv',
    )
)

In [5]:
def interaction_generator_list(df, col_list):
    """
    Add an interaction column built by joining the string form of several columns.

    The new column is named ``Interaction_<col1>_<col2>_...`` and holds, for each
    row, the values of the columns in ``col_list`` cast to ``str`` and joined with
    ``'_'``. ``df`` is modified in place and also returned.
    """
    print("For: {}".format(col_list))
    interaction_name = 'Interaction_{}'.format('_'.join(col_list))

    # Start from the first column and append the remaining ones left to right.
    joined = df[col_list[0]].astype(str)
    for name in col_list[1:]:
        joined = joined + '_' + df[name].astype(str)

    df[interaction_name] = joined
    return df

In [None]:
def make_predictions(df, target, model, num_folds=5):
    """
    Make bagged predictions on test data using K-Fold cross-validation.

    The first ``target.shape[0]`` rows of ``df`` are treated as training rows and
    the remaining rows as test rows. For every fold a fresh clone of ``model`` is
    fitted on the training part of the fold; the held-out part is used both as the
    early-stopping ``eval_set`` (for LightGBM / XGBoost / CatBoost) and to report
    AUC and the F-score-maximising threshold, so the reported numbers are
    optimistic. Test predictions are the mean positive-class probability over all
    fold models.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame with the training rows first, followed by the test rows.
    target : pandas.Series
        Binary labels for the training rows; accessed positionally via ``.iloc``.
    model : estimator
        Unfitted estimator exposing ``fit`` and ``predict_proba``. It is cloned
        for every fold and never fitted itself.
    num_folds : int, default 5
        Number of (unshuffled) ``KFold`` splits.

    Returns
    -------
    pred_proba : numpy.ndarray
        Averaged positive-class probability for each test row.
    fitted_models : list
        The per-fold fitted estimators, in fold order.
    """
    print("Generating train & test splits")
    n_train = target.shape[0]
    x_train = df[:n_train].reset_index(drop=True)
    x_test = df[n_train:].reset_index(drop=True)

    folds = KFold(n_splits=num_folds)
    print("Generating Folds")
    indices = list(folds.split(x_train))

    fitted_models = []
    errors = []
    thres_buf = []
    fscore_buf = []
    for i, (train_index, test_index) in enumerate(indices):
        est = clone(model)
        print("Fitting model {}".format(i))
        x_fit, y_fit = x_train.iloc[train_index], target.iloc[train_index]
        x_val, y_val = x_train.iloc[test_index], target.iloc[test_index]

        if isinstance(est, (LGBMClassifier, XGBClassifier)):
            est.fit(X=x_fit, y=y_fit, eval_set=[(x_val, y_val)],
                    eval_metric='auc', verbose=100, early_stopping_rounds=100)
        elif isinstance(est, CatBoostClassifier):
            est.fit(X=x_fit, y=y_fit, eval_set=[(x_val, y_val)],
                    verbose=100, early_stopping_rounds=100)
        else:
            # Any other estimator: plain fit, so it is not left unfitted
            # (which would only surface later as a predict_proba failure).
            est.fit(x_fit, y_fit)

        fitted_models.append(est)

        fold_pred = np.array(est.predict_proba(x_val))[:, 1]
        error = roc_auc_score(y_val, fold_pred)
        errors.append(error)

        precision, recall, thresholds = precision_recall_curve(y_val, fold_pred)
        # precision/recall carry one trailing point with no matching threshold;
        # drop it so every F-score index is a valid index into `thresholds`.
        precision, recall = precision[:-1], recall[:-1]
        denom = precision + recall
        # Define F-score as 0 where precision + recall == 0 instead of producing
        # NaN, which np.argmax would otherwise select as the "best" entry.
        fscore = np.divide(2 * precision * recall, denom,
                           out=np.zeros_like(denom), where=denom > 0)
        ix = np.argmax(fscore)
        print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))
        thres_buf.append(thresholds[ix])
        fscore_buf.append(fscore[ix])

        print("{} iteration AUC: {}".format(i, error))
        print("Running Avg. AUC: {}\n".format(np.mean(errors)))
        print('-'*100)

    print("Mean AUC: {}".format(np.mean(errors)))
    print("Mean Threshold: {}\t F Score:{}".format(np.mean(thres_buf), np.mean(fscore_buf)))

    # Bagging: average the positive-class probability across all fold models.
    per_model_proba = [np.asarray(fitted.predict_proba(x_test))[:, 1] for fitted in fitted_models]
    pred_proba = np.mean(per_model_proba, axis=0)
    return pred_proba, fitted_models

In [7]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,id,program_id,program_type,program_duration,test_id,test_type,difficulty_level,trainee_id,gender,education,city_tier,age,total_programs_enrolled,is_handicapped,trainee_engagement_rating,is_pass
0,9389_150,Y_1,Y,136,150,offline,intermediate,9389,M,Matriculation,3,24.0,5,N,1.0,0
1,16523_44,T_1,T,131,44,offline,easy,16523,F,High School Diploma,4,26.0,2,N,3.0,1
2,13987_178,Z_2,Z,120,178,online,easy,13987,M,Matriculation,1,40.0,1,N,2.0,1
3,13158_32,T_2,T,117,32,offline,easy,13158,F,Matriculation,3,,4,N,1.0,1
4,10591_84,V_3,V,131,84,offline,intermediate,10591,F,High School Diploma,1,42.0,2,N,4.0,1


In [6]:
# Class balance of the label, expressed as percentages.
train['is_pass'].value_counts(normalize=True).mul(100)

1    69.540788
0    30.459212
Name: is_pass, dtype: float64

In [None]:
# Label column, its values for the training rows, and the number of training rows.
target_var = 'is_pass'
target = train[target_var]
train_len = len(target)

### Training CatBoost

In [None]:
# Stack train (without the label) on top of test so features are engineered on both at once.
df = pd.concat([train.drop(columns=target_var), test], ignore_index=True)

In [None]:
# Numeric interaction: age multiplied by the engagement rating (computed before the rating is cast to str).
df['age_prod_trainee_engagement_rating'] = df['age'].mul(df['trainee_engagement_rating'])

In [None]:
# Cast these columns to strings so they become object-dtype columns.
df = df.astype({
    'test_id': str,
    'trainee_id': str,
    'trainee_engagement_rating': str,
})

In [None]:
# Remove the row identifier column if it is present.
cols_to_drop = ['id']
df = df.drop(columns=cols_to_drop, errors='ignore')

Uncomment and run the code below to obtain the important features (selected with `SelectFromModel`).

In [None]:
# cat_feats = df.select_dtypes('object').columns.tolist()
# print(cat_feats)


# X = df[:train_len].drop(target_var, axis=1, errors='ignore').copy()
# sf_rf = SelectFromModel(CatBoostClassifier(
#                                         cat_features=cat_feats,
#                                         depth=16,
#                                         grow_policy='Lossguide',
#                                         eval_metric='AUC',
#                                         learning_rate=0.01,
#                                         loss_function='Logloss',
#                                         verbose=True, task_type='GPU', devices='0'))
# sf_rf.fit(X, target)
# selected_feats = X.columns[sf_rf.get_support()]

In [None]:
# Columns used below to build pairwise interaction features.
selected_feats = ['test_id', 'trainee_id', 'trainee_engagement_rating'] # Top features given by SelectFromModel
print(selected_feats)

In [None]:
# Add one concatenated interaction column for every unordered pair of selected features.
for col_list in combinations(selected_feats, 2):
    df = interaction_generator_list(df, list(col_list))

Drop the interaction feature built from both the 'trainee_id' and 'test_id' columns: combined, these two columns produce a unique value for every row.

In [None]:
# Interaction columns whose name mentions both trainee_id and test_id.
cols_to_drop2 = [
    name for name in df.columns
    if name.startswith('Interaction') and 'trainee_id' in name and 'test_id' in name
]
print("Droping: {}".format(cols_to_drop2))

df = df.drop(columns=cols_to_drop2, errors='ignore')

In [None]:
# Every object-dtype column is handed to CatBoost as a categorical feature.
cat_feats = list(df.select_dtypes(include='object').columns)
print(cat_feats)

In [None]:
# CatBoost template model; make_predictions clones it per fold and also passes
# verbose=100 and early_stopping_rounds=100 to fit().
cat = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    loss_function='Logloss',
    eval_metric='AUC',
    grow_policy='Lossguide',
    depth=16,
    max_leaves=64,
    cat_features=cat_feats,
    allow_writing_files=False,
    verbose=True,
)

In [None]:
# Fit one CatBoost clone per fold (15 folds) and average their test-set probabilities.
cat_pred_proba, cat_models = make_predictions(df, target, cat, num_folds=15)

### Training XGBoost


In [None]:
# Rebuild the combined train+test frame from the raw data for the XGBoost feature set.
df = pd.concat([train.drop(columns=target_var), test], ignore_index=True)

In [None]:
# Encode the Y/N flag as 1/0; any value not in the mapping becomes NaN.
df['is_handicapped'] = df['is_handicapped'].map({'Y': 1, 'N': 0})

In [None]:
# Pair trainee_id with each listed column to create one interaction column per pairing.
col_list = ['program_id', 'program_type','test_type', 'difficulty_level', 'gender', 'education',
       'city_tier', 'is_handicapped']
for col in col_list:
    df = interaction_generator_list(df, ['trainee_id', col])

In [None]:
# Remove the row identifier column if it is present.
cols_to_drop = ['id']
df = df.drop(columns=cols_to_drop, errors='ignore')

In [None]:
# Frequency encoding: replace each object-dtype value with its relative
# frequency in the combined train+test frame.
for col in df.select_dtypes(include=object).columns:
    print("Encoding: {}".format(col))
    frequencies = df[col].value_counts(normalize=True)
    df[col] = df[col].map(frequencies)

In [None]:
# XGBoost template model; make_predictions clones it per fold.
# The leaf cap in XGBoost is `max_leaves` (`num_leaves` is the LightGBM name and is
# not used by XGBoost). `grow_policy` / `max_leaves` need a non-exact tree method,
# so `tree_method='hist'` is set explicitly.
xgb = XGBClassifier(objective='binary:logistic', booster='gbtree', learning_rate=0.1,
                    n_estimators=1000, max_depth=10, max_leaves=64,
                    grow_policy='lossguide', tree_method='hist', n_jobs=-1)

In [None]:
# Fit one XGBoost clone per fold (10 folds) and average their test-set probabilities.
xgb_pred_proba, xgb_models = make_predictions(df, target, xgb, num_folds=10)

In [None]:
# Weighted blend of the two models. The weights (0.6 + 0.4) already sum to 1, so
# the result is a proper probability; dividing by 2 again would halve every
# prediction and cap the output at 0.5.
final_weighted_pred = 0.6 * cat_pred_proba + 0.4 * xgb_pred_proba

In [None]:
# Assemble the submission: test ids paired with the blended predictions.
sub_df = pd.DataFrame({'id': test['id'], 'is_pass': final_weighted_pred})

In [None]:
# Write the submission file without the DataFrame index column.
sub_df.to_csv('final_sub.csv', index=False)