<a href="https://colab.research.google.com/github/Altusya/Case-Study/blob/main/5)_selected_model_and_prediction_(GB).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, StratifiedKFold, LeaveOneOut
from sklearn.preprocessing import OneHotEncoder
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC

url = "https://raw.githubusercontent.com/Altusya/Case-Study/main/data_new.csv"
data = pd.read_csv(url)
print("full data: ", data.shape)
pd.set_option("display.max_columns", None)
print(data.head(1))
known = data[data['y'] != "unknown"]
print("known data: ", known.shape)

full data:  (45211, 33)
   client_id  age         job  marital education  account_id in_default  \
0    1000583   43  management  married  tertiary       19298         no   

   balance housing loan   contact  campaign_id  day month  duration  campaign  \
0   -127.0      no   no  cellular        19298    6   aug       400         2   

   pdays  previous poutcome        y age_cat  age_log balance_cat  balance_pt  \
0     -1         0  unknown  unknown   36-45   3.7612    negative    -0.71255   

  duration_cat  duration_log campaign_cat  campaign_log pdays_cat  pdays_pt  \
0      300-700      5.991465        twice      0.693147     never -0.472533   

  previous_cat  previous_pt  season  
0        never      -0.4725       2  
known data:  (4521, 33)


In [3]:
# Scenario 1: basic features
num_features = ["age", "balance", "duration", "campaign", "pdays", "previous"]
cat_features = ["job", "marital", "education", "in_default", "housing", "loan", "month", "poutcome"]

# Scenario 2: log/power transformed num features
num_features = ["age_log", "balance_pt", "duration_log", "campaign_log", "pdays_pt", "previous_pt"]
cat_features = ["job", "marital", "education", "in_default", "housing", "loan", "month", "poutcome"]

# Scenario 3: num features to cat, month to season
num_features = []
cat_features = ["age_cat", "balance_cat", "duration_cat", "campaign_cat", "pdays_cat", "previous_cat",
                  "job", "marital", "education", "in_default", "housing", "loan", "season", "poutcome"]

# Scenario 4: num features to cat, month to season, NO JOB AND CAMPAIGN
num_features = []
cat_features = ["age_cat", "balance_cat", "duration_cat", "pdays_cat", "previous_cat",
                 "marital", "education", "in_default", "housing", "loan", "season", "poutcome"]

# Scenario 5: no in_default
num_features = []
cat_features = ["age_cat", "balance_cat", "duration_cat", "pdays_cat", "previous_cat",
                 "marital", "education", "housing", "loan", "season", "poutcome"]

In [46]:
# CREATING X AND Y MATRICES
num_features = ["age", "balance", "duration", "campaign", "pdays", "previous"]
cat_features = ["job", "marital", "education", "in_default", "housing", "loan", "month", "poutcome"]

X_num = known[num_features].values
ohe = OneHotEncoder(drop="first")
X_result = ohe.fit_transform(known[cat_features])
X_cat = X_result.toarray()
X_array = np.hstack((X_num, X_cat))
X_df = pd.DataFrame(columns=ohe.get_feature_names_out(cat_features))

y_array = known["y"].apply(lambda x: 1 if x == 'yes' else 0).values # To create a NumPy array from a pandas Series, you can use the .values attribute.

#print(X_array.shape)
#print(X_array)
#print(y_array.shape)
#print(y_array)

In [83]:
# GRADIENT BOOST + STRATIFIED K-FOLD WITH UNDERSAMPLING OF NOs
sampler = RandomUnderSampler(sampling_strategy='auto', random_state=100)
X_resampled, y_resampled = sampler.fit_resample(X_array, y_array)

gb = GradientBoostingClassifier(n_estimators=80, random_state=100)
custom_threshold = 0.35
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
y_pred_custom = []

print(f"Average K-Fold CV score: {cross_val_score(gb, X_resampled, y_resampled, cv=skf).mean():.2f}\n")
overall_conf_matrix = np.zeros((2, 2))

for fold, (train_index, test_index) in enumerate(skf.split(X_resampled, y_resampled)):
    fold_X_train, fold_X_test = X_resampled[train_index], X_resampled[test_index]
    fold_y_train, fold_y_test = y_resampled[train_index], y_resampled[test_index]

    gb.fit(fold_X_train, fold_y_train)
    pred_proba = gb.predict_proba(fold_X_test)[:, 1]
    fold_predictions = (pred_proba >= custom_threshold).astype(int)
    y_pred_custom.extend(fold_predictions)
    fold_conf_matrix = confusion_matrix(fold_y_test, fold_predictions)
    fold_class_report = classification_report(fold_y_test, fold_predictions)
    #print(f"Fold {fold + 1} - Confusion Matrix:\n{fold_conf_matrix}\n")
    #print(f"Fold {fold + 1} - Classification Report:\n{fold_class_report}\n")
    overall_conf_matrix += fold_conf_matrix

#average_conf_matrix /= 5
print("Overall Confusion Matrix:")
print(overall_conf_matrix)

Average K-Fold CV score: 0.81

Average Confusion Matrix:
[[364. 157.]
 [ 47. 474.]]


In [48]:
# CREATING X MATRIX FOR PREDICTIONS
X_pred_num = data[num_features].values
ohe = OneHotEncoder(drop="first")
X_result_pred = ohe.fit_transform(data[cat_features])
X_pred_cat = X_result_pred.toarray()
X_pred_array = np.hstack((X_pred_num, X_pred_cat))
X_pred_array[:1]

array([[  43., -127.,  400.,    2.,   -1.,    0.,    0.,    0.,    0.,
           1.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    1.,
           0.,    0.,    1.,    0.,    0.,    0.,    0.,    1.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    1.]])

In [102]:
outcome_prob = gb.predict_proba(X_pred_array)[:, 1]
data["pred_prob"] = outcome_prob
for index, row in data.iterrows():
    y = row["y"]
    pred_prob = row["pred_prob"]
    if y == "yes":
       data.at[index, "label"] = 1
    elif y == "unknown" and pred_prob >= 0.80:
        data.at[index, "label"] = 1
    elif y == "no" and pred_prob >= 0.90:
        data.at[index, "label"] = 1
    else:
      data.at[index, "label"] = 0

print(data.shape)
data.head(1)

print(data.shape)
data.head(1)

(45211, 35)
(45211, 35)


Unnamed: 0,client_id,age,job,marital,education,account_id,in_default,balance,housing,loan,contact,campaign_id,day,month,duration,campaign,pdays,previous,poutcome,y,age_cat,age_log,balance_cat,balance_pt,duration_cat,duration_log,campaign_cat,campaign_log,pdays_cat,pdays_pt,previous_cat,previous_pt,season,pred_prob,label
0,1000583,43,management,married,tertiary,19298,no,-127.0,no,no,cellular,19298,6,aug,400,2,-1,0,unknown,unknown,36-45,3.7612,negative,-0.71255,300-700,5.991465,twice,0.693147,never,-0.472533,never,-0.4725,2,0.560108,0


In [103]:
outcome = data.copy()
outcome = outcome[["client_id", "y", "pred_prob", "label", "season", "age_cat", "balance_cat", "duration_cat", "campaign_cat", "pdays_cat", "previous_cat"]]
outcome.sample(5)

Unnamed: 0,client_id,y,pred_prob,label,season,age_cat,balance_cat,duration_cat,campaign_cat,pdays_cat,previous_cat
9294,2843267,unknown,0.249448,0,2,36-45,0-250,120-300,once,never,never
5990,2191603,unknown,0.77031,0,1,26-35,500-1500,120-300,twice,never,never
43127,9578285,unknown,0.151722,0,2,26-35,500-1500,40-120,3-5,never,never
31461,7247475,unknown,0.214201,0,2,26-35,250-500,120-300,once,never,never
30719,7106870,unknown,0.13215,0,2,46-60,500-1500,120-300,3-5,never,never


In [104]:
outcome[outcome["label"] == 1].shape

(5230, 11)

In [84]:
outcome_known = outcome[(outcome["y"] == "no") & (outcome["pred_prob"] >= custom_threshold)]
print(outcome_known.shape)
outcome_known

(1092, 11)


Unnamed: 0,client_id,y,pred_prob,label,season,age_cat,balance_cat,duration_cat,campaign_cat,pdays_cat,previous_cat
30,1004963,no,0.672132,1,1,36-45,3000-6000,300-700,once,never,never
57,1011837,no,0.449758,1,1,36-45,0-250,120-300,twice,never,never
67,1014610,no,0.351494,1,2,26-35,500-1500,120-300,once,0-3mon,4-10
91,1018924,no,0.798002,1,2,26-35,250-500,300-700,3-5,never,never
148,1031140,no,0.635740,1,2,26-35,0-250,300-700,3-5,never,never
...,...,...,...,...,...,...,...,...,...,...,...
44851,9926548,no,0.527334,1,1,46-60,6000-15000,120-300,once,3-6mon,1-3
44915,9940251,no,0.429021,1,2,46-60,250-500,120-300,once,0-3mon,1-3
44939,9945695,no,0.692179,1,1,26-35,negative,300-700,once,6mon+,1-3
45120,9981724,no,0.689494,1,1,36-45,1500-3000,120-300,once,3-6mon,never
