In [25]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 
import warnings as w 
w.filterwarnings('ignore')
sns.set(style='whitegrid')  


from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, SMOTEN, SMOTENC

df = pd.read_csv('../artifacts/raw.csv', sep=';')
df['y']= df['y'].map({'yes': 1, 'no': 0})

In [26]:
X= df.drop(columns='y', axis=1)
y= df['y']

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, test_size=0.2 , stratify=y)
num_cols= X.select_dtypes(include=['number']).columns.tolist()
cat_cols= X.select_dtypes(include=['object']).columns.tolist()
num_pipe= Pipeline([('impute', SimpleImputer(strategy='mean')),
                    ('scaler', StandardScaler())])

cat_pipe= Pipeline([('impute', SimpleImputer(strategy='most_frequent')),
                    ('encoder', OneHotEncoder())])
preprocessing = ColumnTransformer([('num', num_pipe, num_cols),
                                   ('cat', cat_pipe, cat_cols)])


In [27]:
preprocessing.fit(X_train)
train_processed = preprocessing.transform(X_train)
test_processed = preprocessing.transform(X_test)

sm = SMOTE(random_state=42)
X_ref, y_ref = sm.fit_resample(train_processed, y_train)

models = {
    'LogisticRegression': LogisticRegression(random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42),
    'XGBClassifier': XGBClassifier()
}

for name, model in models.items():
    model.fit(X_ref, y_ref)

    train_pred = model.predict(train_processed)
    test_pred = model.predict(test_processed)

    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)

    print(f"{name} - Train Accuracy: {train_acc:.4f}")
    print(f"{name} - Test Accuracy:  {test_acc:.4f}\n")

    train_pre = precision_score(y_train, train_pred, pos_label=1)
    test_pre = precision_score(y_test, test_pred, pos_label=1)

    print(f"{name} - Train precision_score: {train_pre:.4f}")
    print(f"{name} - Test precision_score:  {test_pre:.4f}\n")

    train_rc = recall_score(y_train, train_pred, pos_label=1)
    test_rc = recall_score(y_test, test_pred, pos_label=1)

    print(f"{name} - Train recall_score: {train_rc:.4f}")
    print(f"{name} - Test recall_score:  {test_rc:.4f}\n")


LogisticRegression - Train Accuracy: 0.8466
LogisticRegression - Test Accuracy:  0.8467

LogisticRegression - Train precision_score: 0.4197
LogisticRegression - Test precision_score:  0.4192

LogisticRegression - Train recall_score: 0.8128
LogisticRegression - Test recall_score:  0.8043

KNeighborsClassifier - Train Accuracy: 0.9021
KNeighborsClassifier - Test Accuracy:  0.8434

KNeighborsClassifier - Train precision_score: 0.5445
KNeighborsClassifier - Test precision_score:  0.4063

KNeighborsClassifier - Train recall_score: 0.9981
KNeighborsClassifier - Test recall_score:  0.7335

DecisionTreeClassifier - Train Accuracy: 1.0000
DecisionTreeClassifier - Test Accuracy:  0.8670

DecisionTreeClassifier - Train precision_score: 1.0000
DecisionTreeClassifier - Test precision_score:  0.4445

DecisionTreeClassifier - Train recall_score: 1.0000
DecisionTreeClassifier - Test recall_score:  0.5491

XGBClassifier - Train Accuracy: 0.9471
XGBClassifier - Test Accuracy:  0.9072

XGBClassifier - Tr

In [28]:
clf= XGBClassifier()
model = make_pipeline(preprocessing, clf)
X_train_copy = X_train.copy()
y_train_copy = y_train.copy()
sm= SMOTEN()
X_ref_copy,y_ref_copy = sm.fit_resample(X_train_copy, y_train_copy)
model.fit(X_ref_copy, y_ref_copy) 
y_pred = model.predict(X_test)

ref_df=pd.DataFrame({"Actual": y_test,
              "Predicted": y_pred})

In [29]:
ref_df.head(7)

Unnamed: 0,Actual,Predicted
1392,0,0
7518,0,0
12007,0,0
5536,0,0
29816,0,0
18275,0,0
8543,0,0


In [30]:
ref_df.value_counts()

Actual  Predicted
0       0            7311
        1             674
1       1             637
        0             421
Name: count, dtype: int64