## Try lang tong lahat


## Import


In [22]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [23]:
df = pd.read_parquet('parquet/kess.parquet')

#### Drop columns


In [24]:
column_names = df.columns.tolist()
print("Column names:", column_names)

Column names: ['ID', 'Year_Birth', 'Income', 'Kidhome', 'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'Response', 'Complain', 'Total_Children', 'Days since Customer', 'Age_at_Customer_Date', 'Basic', 'Graduation', 'Master', 'PhD', 'Married', 'Single', 'Together', 'Widow']


In [25]:
# df.drop(columns=['ID', 'Year_Birth', 'Kidhome',
#         'Teenhome', 'Dt_Customer'], inplace=True)
df.drop(columns=['ID', 'Year_Birth', 'Total_Children',
        'Dt_Customer'], inplace=True)
df

Unnamed: 0,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,...,Days since Customer,Age_at_Customer_Date,Basic,Graduation,Master,PhD,Married,Single,Together,Widow
0,84835.0,0,0,0,189,104,379,111,189,218,...,3538,44,False,True,False,False,False,False,False,False
1,57091.0,0,0,0,464,5,64,7,0,37,...,3539,53,False,True,False,False,False,True,False,False
2,67267.0,0,1,0,134,11,59,15,2,30,...,3572,56,False,True,False,False,True,False,False,False
3,32474.0,1,1,0,10,0,1,0,0,0,...,3396,47,False,True,False,False,False,False,True,False
4,21474.0,1,0,0,6,16,24,11,0,34,...,3489,25,False,True,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2196,66476.0,0,1,99,372,18,126,47,48,78,...,3886,37,False,False,False,True,False,False,False,False
2197,31056.0,1,0,99,5,10,13,3,8,16,...,4048,36,False,False,False,False,True,False,False,False
2198,46310.0,1,0,99,185,2,88,15,5,14,...,4364,36,False,True,False,False,False,False,False,False
2199,65819.0,0,0,99,267,38,701,149,165,63,...,4102,34,False,True,False,False,True,False,False,False


## Baseline Modelling


In [26]:
X = df.drop('Response', axis=1).iloc[:, 1:]
y = df['Response']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

In [27]:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)

# clf = LogisticRegression(random_state=0).fit(X_train, y_train)
# preds = clf.predict(scaler.transform(X_test))

# acc = accuracy_score(y_test, preds)
# prec = precision_score(y_test, preds)
# rec = recall_score(y_test, preds)
# f1 = f1_score(y_test, preds)
# auc = roc_auc_score(y_test, preds)

In [28]:
# print("Accuracy: %.4f" % acc)
# print("Precision: %.4f" % prec)
# print("Recall: %.4f" % rec)
# print("F1: %.4f" % f1)
# print("AUC: %.4f" % auc)

In [29]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [30]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

classifiers = {
    'Logistic Regression': LogisticRegression(random_state=0),
    'SVM': SVC(random_state=0, probability=True),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=0),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

scores = {name: {} for name in classifiers.keys()}

for name, clf in classifiers.items():
    clf.fit(X_train_scaled, y_train)
    preds = clf.predict(X_test_scaled)

    scores[name]['Accuracy'] = accuracy_score(y_test, preds)
    scores[name]['Precision'] = precision_score(y_test, preds)
    scores[name]['Recall'] = recall_score(y_test, preds)
    scores[name]['F1 Score'] = f1_score(y_test, preds)

    if hasattr(clf, "predict_proba"):
        pred_probs = clf.predict_proba(X_test_scaled)[:, 1]
        scores[name]['ROC AUC'] = roc_auc_score(y_test, pred_probs)
    else:
        scores[name]['ROC AUC'] = 'N/A'

scores_df = pd.DataFrame(scores).T
scores_df

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Logistic Regression,0.859304,0.542857,0.383838,0.449704,0.84232
SVM,0.865356,0.708333,0.171717,0.276423,0.845654
Naive Bayes,0.741301,0.3,0.545455,0.387097,0.718789
Decision Tree,0.83056,0.441441,0.494949,0.466667,0.691416
K-Nearest Neighbors,0.859304,0.615385,0.161616,0.256,0.735136


## Small Findings


Models result with better scores when you have columns kidhome and teenhome rather than total children
