In [None]:
import pandas as pd
import tensorflow as tf
from scipy.stats import uniform
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [None]:
# Load the raw churn export. `data` itself is never modified in this cell.
data = pd.read_csv('Churn_Modelling.csv')

# Continuous features that get standardised below.
numeric_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

# Drop the identifier columns, then one-hot encode the two categorical
# columns (drop_first=True removes one dummy level per column).
cleaned_data = pd.get_dummies(
    data.drop(columns=['RowNumber', 'CustomerId', 'Surname']),
    columns=['Geography', 'Gender'],
    drop_first=True,
)

# Standardise the numeric columns using statistics of the whole frame.
scaler = StandardScaler()
cleaned_data[numeric_columns] = scaler.fit_transform(cleaned_data[numeric_columns])

# NOTE(review): this writes the raw `data` frame, not `cleaned_data`, so
# 'cleaned.csv' still contains the identifier/categorical columns and no
# scaling. Confirm what readers of that file expect before changing it.
data.to_csv('cleaned.csv', index=False)

In [None]:
# Plain-text preview of the first five rows of the preprocessed frame.
print(cleaned_data.head(n=5))

In [None]:
# Train, tune and evaluate the classic scikit-learn models on the persisted frame.
cleaned_data = pd.read_csv('cleaned.csv')

y = cleaned_data['Exited']

# Identifier columns must not be used as features. errors='ignore' lets this
# cell accept the file whether or not those columns were already removed.
X = cleaned_data.drop(columns=['Exited', 'RowNumber', 'CustomerId', 'Surname'], errors='ignore')

# One-hot encode only the categorical columns that are still present.
categorical_present = [col for col in ['Geography', 'Gender'] if col in X.columns]
if categorical_present:
    X = pd.get_dummies(X, columns=categorical_present, drop_first=True)
# Uniform float dtype so scaled values can be written back without dtype clashes.
X = X.astype(float)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the continuous features using statistics from the training
# split only, so nothing about the test rows leaks into the fitted models.
numeric_features = [
    col
    for col in ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    if col in X.columns
]
X_train = X_train.copy()
X_test = X_test.copy()
if numeric_features:
    feature_scaler = StandardScaler()
    X_train[numeric_features] = feature_scaler.fit_transform(X_train[numeric_features])
    X_test[numeric_features] = feature_scaler.transform(X_test[numeric_features])

# Keep 20% of the training split to make the searches fast.
X_train_sampled, _, y_train_sampled, _ = train_test_split(X_train, y_train, test_size=0.8,
                                                          random_state=42)

# Sample 20% of the test rows once and select the labels by index, so the
# features and labels are aligned by construction.
X_test_sampled = X_test.sample(frac=0.2, random_state=42)
y_test_sampled = y_test.loc[X_test_sampled.index]

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # Seeded so repeated runs build the same forest.
    'Random Forest': RandomForestClassifier(n_jobs=-1, random_state=42),
    'SVM': SVC()
}

param_distributions = {
    'Logistic Regression': {'C': uniform(0.01, 10)},
    'Random Forest': {'n_estimators': [50], 'max_depth': [5]},
    'SVM': {'C': [1], 'kernel': ['linear']}
}

# Stage 1: randomized search per model.
best_models_random = {}
for model_name, model in models.items():
    print(f"Training {model_name} with RandomizedSearchCV...")
    random_search = RandomizedSearchCV(model, param_distributions[model_name], n_iter=5, cv=2, scoring='accuracy',
                                       n_jobs=-1, random_state=42)
    random_search.fit(X_train_sampled, y_train_sampled)
    best_models_random[model_name] = random_search.best_estimator_

# Stage 2: grid search per model. The grids are keyed by model name so an
# unknown model fails loudly with a KeyError instead of silently reusing the
# grid left over from the previous loop iteration.
param_grids = {
    'Logistic Regression': {'C': [0.01, 0.1, 1]},
    'Random Forest': {'n_estimators': [50], 'max_depth': [5]},
    'SVM': {'C': [1], 'kernel': ['linear']}
}

best_models_grid = {}
for model_name, model in best_models_random.items():
    print(f"Fine-tuning {model_name} with GridSearchCV...")
    grid_search = GridSearchCV(model, param_grids[model_name], cv=2, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_sampled, y_train_sampled)
    best_models_grid[model_name] = grid_search.best_estimator_

# Report per-class metrics on the sampled test rows.
model_reports = {}
for model_name, model in best_models_grid.items():
    print(f"Evaluating {model_name}...")
    y_pred = model.predict(X_test_sampled)
    model_reports[model_name] = classification_report(y_test_sampled, y_pred)
    print(f"\nClassification Report for {model_name}:\n")
    print(model_reports[model_name])

best_models_grid


In [None]:
# Train and evaluate a small feed-forward network on the raw `data` frame.

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)

data_cleaned = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

# Integer-encode the two categorical columns; the encoders are kept so the
# codes can be mapped back to the original labels.
label_encoder_geo = LabelEncoder()
label_encoder_gender = LabelEncoder()
data_cleaned['Geography'] = label_encoder_geo.fit_transform(data_cleaned['Geography'])
data_cleaned['Gender'] = label_encoder_gender.fit_transform(data_cleaned['Gender'])

X_nn = data_cleaned.drop('Exited', axis=1)
y_nn = data_cleaned['Exited']

# Split BEFORE scaling: fitting the scaler on all rows would leak the test
# rows' mean/variance into training.
X_train_raw_nn, X_test_raw_nn, y_train_nn, y_test_nn = train_test_split(X_nn, y_nn, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_nn = scaler.fit_transform(X_train_raw_nn)
X_test_nn = scaler.transform(X_test_raw_nn)
# Kept for backward compatibility: all rows, scaled with the train-fitted scaler.
X_scaled_nn = scaler.transform(X_nn)

# 16 -> 8 -> 1 dense network with a sigmoid output for the binary target.
model_nn = Sequential()
model_nn.add(Dense(units=16, activation='relu', input_dim=X_train_nn.shape[1]))
model_nn.add(Dense(units=8, activation='relu'))
model_nn.add(Dense(units=1, activation='sigmoid'))

model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history_nn = model_nn.fit(X_train_nn, y_train_nn, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

test_loss_nn, test_accuracy_nn = model_nn.evaluate(X_test_nn, y_test_nn)
# Threshold the sigmoid output at 0.5 to obtain hard class labels.
y_pred_nn = (model_nn.predict(X_test_nn) > 0.5).astype("int32")

print(f"\nNeural Network Test Accuracy: {test_accuracy_nn:.4f}")
print("Classification Report for Neural Network:\n")
print(classification_report(y_test_nn, y_pred_nn))

In [None]:
# Pick the most accurate classic model and compare it with the neural network.
best_accuracy = 0
# Initialised so the summary below cannot hit an unbound name when no model
# scores above zero (or `best_models_grid` is empty).
best_classic_model = None

for model_name, model in best_models_grid.items():
    # Score on the full hold-out split rather than the 20% sub-sample: the
    # network's accuracy is computed on its full test split, so both sides of
    # the comparison should use a hold-out of the same size.
    # NOTE(review): this assumes `X_test` / `y_test` come from the same
    # preprocessing the models were fitted with - confirm.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{model_name} Test Accuracy: {accuracy:.4f}")
    print(f"Classification Report for {model_name}:\n")
    print(classification_report(y_test, y_pred))

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_classic_model = model_name

print(f"\nBest Classic ML Model: {best_classic_model} with accuracy of {best_accuracy:.4f}")

# *** COMPARISON OF NEURAL NETWORK AND BEST CLASSIC ML MODEL ***
print("\nComparison of the Best Classic ML Model and the Neural Network:")
print(f"Best Classic ML Model: {best_classic_model} Accuracy: {best_accuracy:.4f}")
print(f"Neural Network Accuracy: {test_accuracy_nn:.4f}")