In [93]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

In [79]:
# Read the preprocessed dataset; the path is relative to the working directory.
dataset_path = './Dataset/processed_dataset.csv'
df = pd.read_csv(dataset_path)

In [80]:
# Show the first five rows as a quick sanity check of the loaded frame.
df.head(n=5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668.0,France,Male,33.0,3.0,0.0,2.0,1.0,0.0,181449.97,0.0
1,627.0,France,Male,33.0,1.0,0.0,2.0,1.0,1.0,49503.5,0.0
2,678.0,France,Male,40.0,10.0,0.0,2.0,1.0,0.0,184866.69,0.0
3,581.0,France,Male,34.0,2.0,148882.54,1.0,1.0,1.0,84560.88,0.0
4,716.0,Spain,Male,33.0,5.0,0.0,2.0,1.0,1.0,15068.83,0.0


In [81]:
# Separate the label column from the feature columns.
target_column = 'Exited'
y = df[target_column]
X = df.drop(columns=[target_column])

In [82]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [83]:
# Shapes of the split pieces, in (X_train, X_test, y_train, y_test) order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((211208, 10), (52802, 10), (211208,), (52802,))

In [84]:
# One-hot encode the two categorical columns. The encoder is fitted on the
# training split only and then reused on the test split, so the encoding is
# learned without looking at the test rows.
# NOTE: this cell overwrites X_train / X_test; re-running it without first
# re-running the split cell raises KeyError because the categorical columns
# have already been dropped.
categorical_cols = ['Gender', 'Geography']

# `sparse=False` was deprecated in scikit-learn 1.2 and removed in 1.4 (renamed
# to `sparse_output`). Keeping the default sparse output and densifying with
# `.toarray()` yields the same dense array on old and new releases alike.
encoder = OneHotEncoder(drop='first')
X_train_encoded = encoder.fit_transform(X_train[categorical_cols]).toarray()
X_test_encoded = encoder.transform(X_test[categorical_cols]).toarray()

# Column labels for the generated indicator columns.
encoded_cols = encoder.get_feature_names_out(categorical_cols)
train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_cols)
test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_cols)

# reset_index(drop=True) gives the remaining columns a 0..n-1 index so they
# line up row-for-row with the freshly built encoded frames in the concat.
X_train = pd.concat(
    [X_train.reset_index(drop=True).drop(columns=categorical_cols), train_encoded_df],
    axis=1,
)
X_test = pd.concat(
    [X_test.reset_index(drop=True).drop(columns=categorical_cols), test_encoded_df],
    axis=1,
)



In [85]:
# Shapes of the split pieces, in (X_train, X_test, y_train, y_test) order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((211208, 11), (52802, 11), (211208,), (52802,))

In [86]:
# Learn the per-feature min/max from the training split only, then apply the
# same scaling to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [87]:
def train_model(model, X_train, y_train, **fit_params):
    """Fit ``model`` on the training data and return it.

    Parameters
    ----------
    model
        Any object exposing ``fit(X, y, **kwargs)``.
    X_train
        Training features, passed to ``fit`` unchanged.
    y_train
        Training targets, passed to ``fit`` unchanged.
    **fit_params
        Optional keyword arguments forwarded verbatim to ``model.fit``
        (whatever the estimator's ``fit`` accepts, e.g. ``sample_weight``).
        Omitting them reproduces the plain ``model.fit(X_train, y_train)`` call.

    Returns
    -------
    The same ``model`` object that was passed in; the return value of
    ``fit`` itself is ignored.
    """
    model.fit(X_train, y_train, **fit_params)
    return model

In [88]:
def evaluate_model(model, X_test, y_test, metric=None):
    """Score ``model``'s predictions on a held-out set.

    Parameters
    ----------
    model
        Fitted object exposing ``predict(X)``.
    X_test
        Features to predict on.
    y_test
        True targets for ``X_test``.
    metric
        Optional callable ``metric(y_true, y_pred)``. When omitted,
        ``accuracy_score`` is used, which matches the original behaviour.
        Pass another scorer when accuracy alone is not informative.

    Returns
    -------
    Whatever ``metric`` returns for ``(y_test, predictions)``.
    """
    # Resolve the default at call time so defining the function does not
    # require accuracy_score to be imported yet.
    if metric is None:
        metric = accuracy_score
    y_pred = model.predict(X_test)
    return metric(y_test, y_pred)

Random Forest

In [89]:
# Fit a random forest with default hyperparameters; the seed fixes its randomness.
model = train_model(RandomForestClassifier(random_state=42), X_train, y_train)

In [90]:
# Score the currently bound `model` on the held-out test split.
evaluate_model(model=model, X_test=X_test, y_test=y_test)

0.9140941630998826

XGBoost

In [91]:
# Fit an XGBoost classifier with default hyperparameters; `model` is rebound to it.
model = train_model(xgb.XGBClassifier(random_state=42), X_train, y_train)

In [92]:
# Score the currently bound `model` on the held-out test split.
evaluate_model(model=model, X_test=X_test, y_test=y_test)

0.9166698231127609

Logistic Regression

In [94]:
# Fit a logistic regression with default hyperparameters; `model` is rebound to it.
model = train_model(LogisticRegression(random_state=42), X_train, y_train)

In [95]:
# Score the currently bound `model` on the held-out test split.
evaluate_model(model=model, X_test=X_test, y_test=y_test)

0.8835460778000833