In [1]:
pip install xgboost lightgbm

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Downloading xgboost-1.7.3-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting lightgbm
  Downloading lightgbm-3.3.4-py3-none-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Installing collected packages: xgboost, lightgbm
Successfully installed lightgbm-3.3.4 xgboost-1.7.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/Tabular-Playground-Jan-2023-2/train.csv'
file_key_2 = 'Tabular-Playground-Series/Tabular-Playground-Jan-2023-2/test.csv'
file_key_3 = 'Tabular-Playground-Series/Tabular-Playground-Jan-2023-2/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
submission = pd.read_csv(file_content_stream_3)

In [3]:
from bisect import bisect_left

def rank_test(train_preds, test_preds):
    # rank train preds
    train_rank = rankdata(train_preds)
    preds_rank_zip = list(zip(train_preds, train_rank))

    # sort zipped list by first key
    preds_rank_zip.sort(key = lambda x: x[0])

    # unzip the sorted zipped list
    train_preds, train_rank = zip(*preds_rank_zip)

    # use the closest prediction to get the closets rank
    return np.array([
        train_rank[
            bisect_left(train_preds, x)
        ] for x in test_preds
    ])

# Logistic Regression

In [6]:
train_logit = train.copy()
test_logit = test.copy()

## Defining scaler
scaler = MinMaxScaler()

## Defining inputs and target
train_dummies = pd.get_dummies(train_logit[['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']])
train_dummies = train_dummies.drop(columns = ['gender_Other', 'ever_married_No', 'work_type_children', 'Residence_type_Urban', 'smoking_status_Unknown'])

X = train_logit.drop(columns = ['id', 'gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'stroke'], axis = 1)
X = pd.concat([X, train_dummies], axis = 1)
Y = train_logit['stroke']

test_dummies = pd.get_dummies(test_logit[['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']])
test_dummies = test_dummies.drop(columns = ['gender_Other', 'ever_married_No', 'work_type_children', 'Residence_type_Urban', 'smoking_status_Unknown'])

test_logit = test_logit.drop(columns = ['id', 'gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'], axis = 1)
test_logit = pd.concat([test_logit, test_dummies], axis = 1)
test_logit = pd.DataFrame(scaler.fit_transform(test_logit), columns = test_logit.columns)

cv_scores, roc_auc_scores = list(), list()
preds = list()

## Running 5 times CV
for i in range(5):
    
    skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)
    
    for train_ix, test_ix in skf.split(X, Y):
        
        ## Splitting the data 
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
        
        ## Scaling the data
        X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X.columns)
        X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X.columns)
        
        ## Building logistic model
        logit_md = LogisticRegression(C = 0.2, penalty = 'l1', solver = 'saga', max_iter = 1000).fit(X_train, Y_train)
        
        ## Predicting on X_test and test
        logit_pred_1 = logit_md.predict_proba(X_test)[:, 1]
        logit_pred_2 = logit_md.predict_proba(test_logit)[:, 1]
        
        ## Computing roc-auc score
        roc_auc_scores.append(roc_auc_score(Y_test, logit_pred_1))
        preds.append(logit_pred_2)
        
    cv_scores.append(np.mean(roc_auc_scores))

logit_cv_score = np.mean(cv_scores)    
print('The roc-auc score over 5-folds (run 5 times) is:', logit_cv_score)

## Building model in the entire train dataset
logit_md = LogisticRegression(C = 0.2, penalty = 'l1', solver = 'saga', max_iter = 10000).fit(X, Y)

logit_preds_train = logit_md.predict_proba(X)[:, 1]
logit_preds = pd.DataFrame(preds).apply(np.mean, axis = 0)

The roc-auc score over 5-folds (run 5 times) is: 0.8834240968293544


# SVM

In [9]:
cv_scores, roc_auc_scores = list(), list()
preds = list()

## Running 5 times CV
for i in range(5):
    
    skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)
    
    for train_ix, test_ix in skf.split(X, Y):
        
        ## Splitting the data 
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
        
        ## Scaling the data
        X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X.columns)
        X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X.columns)
        
        ## Building logistic model
        svm_md = SVC(C = 100, gamma = 1, kernel = 'rbf', probability = True).fit(X_train, Y_train)
        
        ## Predicting on X_test and test
        svm_pred_1 = svm_md.predict_proba(X_test)[:, 1]
        svm_pred_2 = svm_md.predict_proba(test_logit)[:, 1]
        
        ## Computing roc-auc score
        roc_auc_scores.append(roc_auc_score(Y_test, svm_pred_1))
        preds.append(svm_pred_2)
        
    cv_scores.append(np.mean(roc_auc_scores))

svm_cv_score = np.mean(cv_scores)    
print('The roc-auc score over 5-folds (run 5 times) is:', svm_cv_score)

## Building model in the entire train dataset
X_trans = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)
svm_md = SVC(C = 100, gamma = 1, kernel = 'rbf', probability = True).fit(X_trans, Y)

svm_preds_train = svm_md.predict_proba(X_trans)[:, 1]
svm_preds = pd.DataFrame(preds).apply(np.mean, axis = 0)

The roc-auc score over 5-folds (run 5 times) is: 0.69579513665679


# Random Forest

In [10]:
train_RF = train.copy()
test_RF = test.copy()

## Defining inputs and target
train_dummies = pd.get_dummies(train_RF[['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']])
train_dummies = train_dummies.drop(columns = ['gender_Other', 'ever_married_No', 'work_type_children', 'Residence_type_Urban', 'smoking_status_Unknown'])

X = train_RF.drop(columns = ['id', 'gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'stroke'], axis = 1)
X = pd.concat([X, train_dummies], axis = 1)
Y = train_RF['stroke']

test_dummies = pd.get_dummies(test_RF[['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']])
test_dummies = test_dummies.drop(columns = ['gender_Other', 'ever_married_No', 'work_type_children', 'Residence_type_Urban', 'smoking_status_Unknown'])

test_RF = test_RF.drop(columns = ['id', 'gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'], axis = 1)
test_RF = pd.concat([test_RF, test_dummies], axis = 1)

cv_scores, roc_auc_scores = list(), list()
preds = list()

## Running 5 times CV
for i in range(5):
    
    skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)
    
    for train_ix, test_ix in skf.split(X, Y):
        
        ## Splitting the data 
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
    
        ## Building RF model
        RF_md = RandomForestClassifier(max_depth = 7, min_samples_leaf = 5, min_samples_split = 2, n_estimators = 300).fit(X_train, Y_train)
        
        ## Predicting on X_test and test
        RF_pred_1 = RF_md.predict_proba(X_test)[:, 1]
        RF_pred_2 = RF_md.predict_proba(test_RF)[:, 1]
        
        ## Computing roc-auc score
        roc_auc_scores.append(roc_auc_score(Y_test, RF_pred_1))
        preds.append(RF_pred_2)
        
    cv_scores.append(np.mean(roc_auc_scores))

RF_cv_score = np.mean(cv_scores)    
print('The roc-auc score over 5-folds (run 5 times) is:', RF_cv_score)

## Building model in the entire train dataset
RF_md = RandomForestClassifier(max_depth = 7, min_samples_leaf = 5, min_samples_split = 2, n_estimators = 300).fit(X, Y)

RF_preds_train = RF_md.predict_proba(X)[:, 1]
RF_preds = pd.DataFrame(preds).apply(np.mean, axis = 0)

The roc-auc score over 5-folds (run 5 times) is: 0.8852571565023715


# XGBoost

In [11]:
train_XGB = train.copy()
test_XGB = test.copy()

## Defining inputs and target
train_dummies = pd.get_dummies(train_XGB[['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']])
train_dummies = train_dummies.drop(columns = ['gender_Other', 'ever_married_No', 'work_type_children', 'Residence_type_Urban', 'smoking_status_Unknown'])

X = train_XGB.drop(columns = ['id', 'gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'stroke'], axis = 1)
X = pd.concat([X, train_dummies], axis = 1)
Y = train_XGB['stroke']

test_dummies = pd.get_dummies(test_XGB[['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']])
test_dummies = test_dummies.drop(columns = ['gender_Other', 'ever_married_No', 'work_type_children', 'Residence_type_Urban', 'smoking_status_Unknown'])

test_XGB = test_XGB.drop(columns = ['id', 'gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'], axis = 1)
test_XGB = pd.concat([test_XGB, test_dummies], axis = 1)

cv_scores, roc_auc_scores = list(), list()
preds = list()

## Running 5 times CV
for i in range(5):
    
    skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)
    
    for train_ix, test_ix in skf.split(X, Y):
        
        ## Splitting the data 
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
    
        ## Building RF model
        XGB_md = XGBClassifier(colsample_bytree = 0.8, 
                               gamma = 0.3, 
                               learning_rate = 0.01, 
                               max_depth = 5, 
                               min_child_weight = 10, 
                               n_estimators = 500, 
                               subsample = 0.8).fit(X_train, Y_train)
        
        ## Predicting on X_test and test
        XGB_pred_1 = XGB_md.predict_proba(X_test)[:, 1]
        XGB_pred_2 = XGB_md.predict_proba(test_XGB)[:, 1]
        
        ## Computing roc-auc score
        roc_auc_scores.append(roc_auc_score(Y_test, XGB_pred_1))
        preds.append(XGB_pred_2)
        
    cv_scores.append(np.mean(roc_auc_scores))

XGB_cv_score = np.mean(cv_scores)    
print('The roc-auc score over 5-folds (run 5 times) is:', XGB_cv_score)

## Building model in the entire train dataset
XGB_md = XGBClassifier(colsample_bytree = 0.8, 
                       gamma = 0.3, 
                       learning_rate = 0.01, 
                       max_depth = 5, 
                       min_child_weight = 10, 
                       n_estimators = 500, 
                       subsample = 0.8).fit(X, Y)

XGB_preds_train = XGB_md.predict_proba(X)[:, 1]
XGB_preds = pd.DataFrame(preds).apply(np.mean, axis = 0)

The roc-auc score over 5-folds (run 5 times) is: 0.8853855128396434


# LightGBM

In [12]:
train_lgb = train.copy()
test_lgb = test.copy()

## Defining inputs and target
train_dummies = pd.get_dummies(train_lgb[['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']])
train_dummies = train_dummies.drop(columns = ['gender_Other', 'ever_married_No', 'work_type_children', 'Residence_type_Urban', 'smoking_status_Unknown'])

X = train_lgb.drop(columns = ['id', 'gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'stroke'], axis = 1)
X = pd.concat([X, train_dummies], axis = 1)
Y = train_lgb['stroke']

test_dummies = pd.get_dummies(test_lgb[['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']])
test_dummies = test_dummies.drop(columns = ['gender_Other', 'ever_married_No', 'work_type_children', 'Residence_type_Urban', 'smoking_status_Unknown'])

test_lgb = test_lgb.drop(columns = ['id', 'gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'], axis = 1)
test_lgb = pd.concat([test_lgb, test_dummies], axis = 1)

cv_scores, roc_auc_scores = list(), list()
preds = list()

## Running 5 times CV
for i in range(5):
    
    skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)
    
    for train_ix, test_ix in skf.split(X, Y):
        
        ## Splitting the data 
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
    
        ## Building RF model
        lgb_md = LGBMClassifier(n_estimators = 1000,
                                max_depth = 7,
                                learning_rate = 0.01,
                                num_leaves = 20,
                                lambda_l1 = 3,
                                lambda_l2 = 3,
                                bagging_fraction = 0.7,
                                feature_fraction = 0.7).fit(X_train, Y_train)
        
        ## Predicting on X_test and test
        lgb_pred_1 = lgb_md.predict_proba(X_test)[:, 1]
        lgb_pred_2 = lgb_md.predict_proba(test_lgb)[:, 1]
        
        ## Computing roc-auc score
        roc_auc_scores.append(roc_auc_score(Y_test, lgb_pred_1))
        preds.append(lgb_pred_2)
        
    cv_scores.append(np.mean(roc_auc_scores))

lgb_cv_score = np.mean(cv_scores)    
print('The roc-auc score over 5-folds (run 5 times) is:', lgb_cv_score)

## Building model in the entire train dataset
lgb_md = LGBMClassifier(n_estimators = 1000,
                        max_depth = 7,
                        learning_rate = 0.01,
                        num_leaves = 20,
                        lambda_l1 = 3,
                        lambda_l2 = 3,
                        bagging_fraction = 0.7,
                        feature_fraction = 0.7).fit(X, Y)

lgb_preds_train = lgb_md.predict_proba(X)[:, 1]
lgb_preds = pd.DataFrame(preds).apply(np.mean, axis = 0)

The roc-auc score over 5-folds (run 5 times) is: 0.8826822779125866


# Ensemble

In [18]:
X_ensemble = pd.DataFrame({'logistic': logit_preds_train, 'SVM': svm_preds_train, 'RF': RF_preds_train, 'XGB': XGB_preds_train, 'LightGBM': lgb_preds_train})
X_test_ensemble = pd.DataFrame({'logistic': logit_preds, 'SVM': svm_preds, 'RF': RF_preds, 'XGB': XGB_preds, 'LightGBM': lgb_preds})

## Defining the hyper-parameter grid
RF_param_grid = {'n_estimators': [100, 300, 500], 
                 'max_depth': [3, 5, 7], 
                 'min_samples_split': [2, 6, 10], 
                 'min_samples_leaf': [1, 5, 9]
                }

## Performing grid search with 5 folds
RF_grid_search = GridSearchCV(RandomForestClassifier(), RF_param_grid, cv = 5, scoring = 'roc_auc', n_jobs = -1).fit(X_ensemble, Y)

## Extracting the best model
best_params = RF_grid_search.best_params_
print(best_params)

{'max_depth': 7, 'min_samples_leaf': 9, 'min_samples_split': 6, 'n_estimators': 300}


In [19]:
RF_ensemble_md = RandomForestClassifier(n_estimators = 300, max_depth = 7, min_samples_split = 6, min_samples_leaf = 9).fit(X_ensemble, Y)

RF_ensemble_pred = RF_ensemble_md.predict_proba(X_test_ensemble)[:, 1]

submission['stroke'] = RF_ensemble_pred
submission.head()

Unnamed: 0,id,stroke
0,15304,0.35239
1,15305,0.539846
2,15306,0.000146
3,15307,0.043453
4,15308,0.487708


In [20]:
submission.to_csv('stacking_submission.csv', index = False)