In [54]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 
import warnings as w 
w.filterwarnings('ignore')
sns.set(style='whitegrid')  


from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTEN

df = pd.read_csv('../artifacts/raw.csv', sep=';')

In [55]:
X= df.drop(columns='y', axis=1)
y= df['y']

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, test_size=0.2 , stratify=y)
num_cols= X.select_dtypes(include=['number']).columns.tolist()
cat_cols= X.select_dtypes(include=['object']).columns.tolist()
num_pipe= Pipeline([('impute', SimpleImputer(strategy='mean')),
                    ('scaler', StandardScaler())])

cat_pipe= Pipeline([('impute', SimpleImputer(strategy='most_frequent')),
                    ('encoder', OneHotEncoder())])
preprocessing = ColumnTransformer([('num', num_pipe, num_cols),
                                   ('cat', cat_pipe, cat_cols)])

sm= SMOTEN()
X_ref, y_ref = sm.fit_resample(X_train,y_train) 

In [56]:

preprocessing.fit(X_train)
train_processed = preprocessing.transform(X_train)
test_processed = preprocessing.transform(X_test)

sm = SMOTEN(random_state=42)
X_ref, y_ref = sm.fit_resample(train_processed, y_train)

models = {
    'LogisticRegression': LogisticRegression(random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42)
}

for name, model in models.items():
    model.fit(X_ref, y_ref)

    train_pred = model.predict(train_processed)
    test_pred = model.predict(test_processed)

    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)

    print(f"{name} - Train Accuracy: {train_acc:.4f}")
    print(f"{name} - Test Accuracy:  {test_acc:.4f}\n")


LogisticRegression - Train Accuracy: 0.9008
LogisticRegression - Test Accuracy:  0.9010

KNeighborsClassifier - Train Accuracy: 0.9032
KNeighborsClassifier - Test Accuracy:  0.8717

DecisionTreeClassifier - Train Accuracy: 1.0000
DecisionTreeClassifier - Test Accuracy:  0.8635



In [None]:
clf= LogisticRegression()
model = make_pipeline(preprocessing, clf)
X_train_copy = X_train.copy()
y_train_copy = y_train.copy()
sm= SMOTEN()
X_ref_copy,y_ref_copy = sm.fit_resample(X_train_copy, y_train_copy)
model.fit(X_ref_copy, y_ref_copy) 
y_pred = model.predict(X_test)

ref_df=pd.DataFrame({"Actual": y_test,
              "Predicted": y_pred})

In [66]:
ref_df.head(7)

Unnamed: 0,Actual,Predicted
1392,no,no
7518,no,no
12007,no,no
5536,no,no
29816,no,no
18275,no,no
8543,no,no
