In [481]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

### Load Data

In [482]:
train_df = pd.read_csv(r'..\Data\train.csv')

In [483]:
train_df

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230
0,,,,,,1526.0,7.0,,,,...,oslk,fXVEsaq,jySVZNlOJy,,,xb3V,RAYp,F2FyR07IdsN7I,,
1,,,,,,525.0,0.0,,,,...,oslk,2Kb5FSF,LM8l689qOp,,,fKCe,RAYp,F2FyR07IdsN7I,,
2,,,,,,5236.0,7.0,,,,...,Al6ZaUT,NKv4yOc,jySVZNlOJy,,kG3k,Qu4f,02N6s8f,ib5G6X1eUxUn6,am7c,
3,,,,,,,0.0,,,,...,oslk,CE7uk3u,LM8l689qOp,,,FSa2,RAYp,F2FyR07IdsN7I,,
4,,,,,,1029.0,7.0,,,,...,oslk,1J2cvxe,LM8l689qOp,,kG3k,FSa2,RAYp,F2FyR07IdsN7I,mj86,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,,,,,,357.0,0.0,,,,...,oslk,EROH7Cg,LM8l689qOp,,,7FJQ,RAYp,F2FyR07IdsN7I,,
49996,,,,,,1078.0,0.0,,,,...,oslk,GfSQowC,LM8l689qOp,,kG3k,FSa2,RAYp,55YFVY9,am7c,
49997,,,,,,2807.0,7.0,,,,...,oslk,dh6qI2t,LM8l689qOp,,ELof,fKCe,RAYp,TCU50_Yjmm6GIBZ0lL_,,
49998,,,,0.0,,,,,,,...,oslk,2fF2Oqu,LM8l689qOp,,,FSa2,RAYp,F2FyR07IdsN7I,,


In [484]:
train_labels = pd.read_csv(r'..\Data\train_churn_labels.csv')

In [485]:
cat_cols = train_df.select_dtypes(include = 'object').columns.tolist()
 # numerical and other features 
num_cols = train_df.select_dtypes(exclude = 'object').columns.tolist()

In [486]:
print('Categorical features : ',cat_cols)

Categorical features :  ['Var191', 'Var192', 'Var193', 'Var194', 'Var195', 'Var196', 'Var197', 'Var198', 'Var199', 'Var200', 'Var201', 'Var202', 'Var203', 'Var204', 'Var205', 'Var206', 'Var207', 'Var208', 'Var210', 'Var211', 'Var212', 'Var213', 'Var214', 'Var215', 'Var216', 'Var217', 'Var218', 'Var219', 'Var220', 'Var221', 'Var222', 'Var223', 'Var224', 'Var225', 'Var226', 'Var227', 'Var228', 'Var229']


In [487]:
### Concat the labels with training data
# train_df = pd.concat([train_df, train_labels], axis = 1)

### Train Test split

In [488]:
X_train, X_test, y_train, y_test = train_test_split(train_df, train_labels, test_size = 0.30, random_state = 42)


In [489]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((35000, 230), (15000, 230), (35000, 1), (15000, 1))

In [490]:
# Remove features with missing value more than 60%
remove_null_cols_df = (X_train[cat_cols].isna().sum()/X_train.shape[0]).reset_index().rename(columns = {0: 'Percentage'})

In [491]:
remove_null_cols = remove_null_cols_df[remove_null_cols_df['Percentage'] > 0.6]['index'].tolist()

In [492]:
# removing null cols with null values percentage greater than 60%
cat_cols = list(set(cat_cols).difference(set(remove_null_cols)))

In [493]:
X_train = pd.concat([X_train, y_train], axis =1)

In [494]:
X_test = pd.concat([X_test, y_test], axis =1)

### Variance In Categorical columns

In [495]:
# Finding variance in Catgorical data
var_cat_df = (X_train[cat_cols].nunique()).reset_index().rename(columns = {0: 'Cardinality'})

In [496]:
remove_cat_cols = var_cat_df[var_cat_df['Cardinality'] > 5]['index'].tolist()

In [497]:
remove_cat_cols = remove_cat_cols + ['Var198', 'Var200']

In [498]:
len(cat_cols)

32

In [499]:
cat_cols = list(set(cat_cols).difference(set(remove_cat_cols)))

In [500]:
len(cat_cols)

9

In [501]:
X_train[cat_cols].nunique()

Var223    4
Var218    2
Var208    2
Var225    3
Var196    4
Var229    4
Var203    5
Var211    2
Var205    3
dtype: int64

### Numerical Columns

In [502]:
# Remove features with missing value more than 60%
remove_null_cols_num_df = (X_train[num_cols].isna().sum()/X_train.shape[0]).reset_index().rename(columns = {0: 'Percentage'})

In [503]:
remove_null_cols_num = remove_null_cols_num_df[remove_null_cols_num_df['Percentage'] > 0.4]['index'].tolist()

In [504]:
num_cols = list(set(num_cols).difference(set(remove_null_cols_num)))

In [505]:
len(num_cols)

39

In [506]:
def find_null_cols(df, percentage):
    tolerance = 0.00001
    columns = df[(df['Percentage'] >= (percentage - tolerance)) &(df['Percentage'] <= (percentage + tolerance))]['index'].tolist()
    print(f'Columns with same percentage of null values: {columns} and their percentage :',df[df['index'].isin(columns)]['Percentage'].unique())
    return columns

In [507]:
# identifying columns with 0 null values
zero_null_values_cols = find_null_cols(remove_null_cols_num_df.copy(), percentage = 0)

Columns with same percentage of null values: ['Var57', 'Var73', 'Var113'] and their percentage : [0.]


In [508]:
num_cols_null_val = list(set(num_cols).difference(zero_null_values_cols))

In [509]:
X_train = X_train[~(X_train[num_cols_null_val].isna().all(axis = 1))].copy()

In [510]:
# identifying less than 8 categories
#cardinality less than 8
cat_8 = []
for i in cat_cols:
    if X_train[i].nunique() < 8:
        cat_8.append(i)

In [511]:
# variables to remove based on distribution check
cols_to_remv = ['Var160', 'Var119', 'Var25', 'Var123', 'Var140', 'Var85', 'Var28', 'Var24']

In [512]:
len(num_cols)

39

In [513]:
num_cols = list(set(num_cols).difference(set(cols_to_remv)))

### NNumeical columns checking for discrete numerical values

In [514]:
X_train[num_cols].describe()

Unnamed: 0,Var143,Var6,Var83,Var132,Var134,Var81,Var38,Var7,Var144,Var181,...,Var173,Var22,Var74,Var21,Var57,Var125,Var126,Var35,Var76,Var65
count,31466.0,31097.0,31466.0,31466.0,31466.0,31097.0,31466.0,31076.0,31097.0,31466.0,...,31466.0,31466.0,31076.0,31097.0,34501.0,31076.0,25290.0,31466.0,31466.0,31076.0
mean,0.057205,1322.698813,20.123308,3.533465,439504.8,103704.0,2579410.0,6.819571,11.766826,0.605765,...,0.006038,289.102364,106.2844,233.769303,3.515532,27975.05,-0.623171,0.716805,1502281.0,14.918522
std,0.630216,2555.393884,92.305502,10.02087,605012.9,107305.8,3002251.0,6.299199,11.751181,2.462963,...,0.117559,701.521448,901.541096,564.027372,2.022906,93759.42,22.554281,3.037231,1872842.0,10.152472
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000214,0.0,-32.0,0.0,0.0,9.0
25%,0.0,518.0,0.0,0.0,30403.5,16644.15,8440.5,0.0,0.0,0.0,...,0.0,135.0,0.0,112.0,1.752403,234.0,-20.0,0.0,95172.0,9.0
50%,0.0,854.0,10.0,0.0,212263.0,73976.09,1301715.0,7.0,9.0,0.0,...,0.0,180.0,7.0,144.0,3.519333,6480.0,4.0,0.0,895476.0,9.0
75%,0.0,1428.0,25.0,0.0,621459.0,182672.4,4570744.0,7.0,18.0,0.0,...,0.0,285.0,91.0,228.0,5.25956,31851.0,10.0,0.0,2329778.0,18.0
max,18.0,115045.0,6335.0,184.0,5159340.0,1814403.0,18846900.0,42.0,81.0,28.0,...,4.0,45340.0,142156.0,36272.0,7.0,5436045.0,68.0,110.0,19353600.0,126.0


In [515]:
# finding columns in numerical columns with cardinality <= 10
num_card_10 = []
for i in num_cols:
    # print(f'Columns:{i} with cardinality:',X_train[i].nunique())
    if X_train[i].nunique() <=10:
        num_card_10.append(i)
        # print(f'Columns:{i} with cardinality:',X_train[i].nunique())

In [516]:
num_card_10

['Var143', 'Var7', 'Var144', 'Var181', 'Var44', 'Var173']

In [517]:
cont_num_cols = list(set(num_cols).difference(set(num_card_10)))

In [518]:
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline  import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [519]:
binary_cols = []
for i in cat_cols:
    if X_train[i].nunique() == 2:
        binary_cols.append(i)

In [520]:
ohe_cols = list(set(cat_cols).difference(set(binary_cols)))

In [521]:
for i in binary_cols:
    X_train[i] = np.where((X_train[i] == sorted(X_train[i].dropna().unique())[0]), 1, 0)
    X_test[i] = np.where((X_test[i] == sorted(X_test[i].dropna().unique())[0]), 1, 0)

In [522]:
for i in binary_cols:
    X_train[i] = X_train[i].astype(int)
    X_test[i] = X_test[i].astype(int)

In [523]:
cont_num_pipeline = Pipeline(steps = [
                 ('cont_num_imputer',SimpleImputer(missing_values=np.nan, strategy='median')),
                ('scale', MinMaxScaler())
                                ])

In [524]:
discrt_num_pipeline = Pipeline(steps = [
                 ('discrt_num_imputer',SimpleImputer(missing_values=np.nan, strategy='most_frequent'))
                                ])

In [525]:
# cat_pipeline = Pipeline(steps = [
#                 ('cat_imputer',SimpleImputer(missing_values=np.nan, strategy='most_frequent'))
#                 # ('encoding_one_hoe', OneHotEncoder(handle_unknown='ignore', sparse_output = False), ohe_cols),
                
#                         ])

In [526]:
col_transformer = ColumnTransformer(transformers = [
                                ('cont_num_pipeline', cont_num_pipeline, cont_num_cols),
                                ('discrt_num_pipeline', discrt_num_pipeline, num_card_10 + binary_cols)
                                #  ('cat_pipeline', cat_pipeline, cat_cols),
                                #('encoding_one_hoe', OneHotEncoder(handle_unknown='ignore', sparse_output = False), ohe_cols)
                                #  ('ordinal_encoding', OrdinalEncoder(), ordinal_cols )
                                ],
                                    remainder ='drop',
                                    n_jobs= -1)

In [527]:
from imblearn.over_sampling import BorderlineSMOTE, SMOTE

In [528]:
pipe_lgbm = imbPipeline([('col_transform', col_transformer),
                                    #  ('over_sampler_smote', SMOTE(random_state=2021,
                                    #                            k_neighbors = 10,
                                    #                           #  m_neighbors = 10,
                                    #                                n_jobs=-1,
                                    #                               sampling_strategy='minority')),
                                        ('lgb', lgb.LGBMClassifier(random_state = 2024, 
                                                                n_estimators= 800, 
                                                                learning_rate = 0.1,
                                                                 boosting_type = 'gbdt',
                                                                 max_depth = 6,
                                                                  subsample = 0.7,
                                                                  class_weight ='balanced',
                                                                   n_jobs = -1 ))                                    
                                     ])

In [529]:
X_train = X_train.reset_index(drop=True)

In [531]:
select_cols = cont_num_cols + num_card_10 + binary_cols #ohe_cols 

In [532]:
X_train[select_cols]

Unnamed: 0,Var149,Var153,Var78,Var112,Var6,Var133,Var113,Var109,Var163,Var73,...,Var65,Var143,Var7,Var144,Var181,Var44,Var173,Var218,Var208,Var211
0,0.0,3988324.0,0.0,16.0,798.0,1386505.0,-919872.00,32.0,603510.0,48,...,9.0,0.0,7.0,9.0,0.0,0.0,0.0,1,1,1
1,,,,,,,-1426736.00,,,10,...,,,,,,,,1,0,1
2,523166.0,3391948.0,0.0,0.0,819.0,442245.0,132696.40,32.0,1817898.0,146,...,36.0,0.0,28.0,9.0,0.0,0.0,0.0,1,1,1
3,0.0,10368000.0,0.0,64.0,700.0,9504000.0,106276.80,32.0,0.0,38,...,9.0,0.0,7.0,0.0,0.0,0.0,0.0,1,1,1
4,0.0,10805360.0,0.0,0.0,756.0,0.0,170233.60,32.0,0.0,154,...,27.0,0.0,14.0,27.0,0.0,0.0,0.0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34496,117166.0,459432.0,0.0,328.0,3626.0,178860.0,-28499.84,184.0,143694.0,178,...,27.0,0.0,21.0,27.0,0.0,0.0,0.0,0,1,1
34497,676753.0,4833160.0,6.0,56.0,2324.0,2161450.0,1300172.00,88.0,514500.0,144,...,9.0,0.0,7.0,18.0,7.0,0.0,0.0,0,1,1
34498,604800.0,10351840.0,0.0,48.0,1645.0,4749685.0,276099.20,40.0,0.0,62,...,27.0,0.0,14.0,9.0,0.0,0.0,0.0,0,1,1
34499,,0.0,0.0,0.0,0.0,0.0,0.00,,0.0,86,...,9.0,6.0,0.0,0.0,0.0,0.0,0.0,1,1,1


In [533]:
y_train = X_train['Label'].copy()

In [534]:
X_train= X_train[select_cols].copy()

In [536]:
# Model Evaluation with 10-fold cross-validation
cv_scores = cross_val_score(pipe_lgbm, X_train, y_train, cv=10)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {np.mean(cv_scores)}")
pipe_lgbm.fit(X_train[select_cols], y_train)

# Predictions on the test set
y_pred = pipe_lgbm.predict(X_test[select_cols])

[LightGBM] [Info] Number of positive: 2272, number of negative: 28778
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003788 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4689
[LightGBM] [Info] Number of data points in the train set: 31050, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 2273, number of negative: 28778
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4694
[LightGBM] [Info] Number of data points in the train set: 31051, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Ligh

In [537]:
# Evaluate the model performance
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Classification Report:
              precision    recall  f1-score   support

          -1       0.93      0.92      0.93     13894
           1       0.15      0.17      0.16      1106

    accuracy                           0.86     15000
   macro avg       0.54      0.55      0.54     15000
weighted avg       0.88      0.86      0.87     15000

Accuracy: 0.865
