In [14]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import torch
import copy
import itertools
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_predict
from sklearn.compose import ColumnTransformer
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)


In [5]:

df1 = pd.read_csv('churn.csv')
# Preprocess the data
data_prep = df1.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
categorical_features = ['Geography', 'Gender']
numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

X = data_prep.drop('Exited', axis=1)
y = data_prep['Exited']
X_preprocessed = preprocessor.fit_transform(X)

pca = PCA(n_components= 2).fit(X_preprocessed)
X_pca = pca.transform(X_preprocessed)

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)

In [15]:

# Define the neural network architecture
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(NeuralNetwork, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.layer1(x)
        out = self.relu(out)
        out = self.layer2(out)
        out = self.sigmoid(out)
        return out

In [16]:

# Training and evaluation function
def train_and_evaluate_model(params, X_train, y_train, X_test, y_test, device):
    hidden_size, learning_rate, batch_size = params
    model = NeuralNetwork(X_train.shape[1], hidden_size).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    X_train_tensor = torch.FloatTensor(X_train).to(device)
    y_train_tensor = torch.FloatTensor(y_train.to_numpy()).to(device)
    X_test_tensor = torch.FloatTensor(X_test).to(device)
    
    dataset = TensorDataset(X_train_tensor, y_train_tensor)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    num_epochs = 5
    for epoch in range(num_epochs):
        for inputs, targets in loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            loss.backward()
            optimizer.step()
    
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor).squeeze()
        y_pred_proba = outputs.cpu().numpy()
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        roc_auc = auc(fpr, tpr)
    
    return fpr, tpr, roc_auc

In [17]:
# Initialize device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
#use GS to find best hyper parameter comb
def evaluate_model_auc(params, X_train, y_train, X_test, y_test, device):
    model = NeuralNetwork(X_train.shape[1], params['hidden_size']).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])
    
    X_train_tensor = torch.FloatTensor(X_train).to(device)
    y_train_tensor = torch.FloatTensor(y_train.to_numpy()).to(device)
    X_test_tensor = torch.FloatTensor(X_test).to(device)
    
    dataset = TensorDataset(X_train_tensor, y_train_tensor)
    loader = DataLoader(dataset, batch_size=params['batch_size'], shuffle=True)
    
    num_epochs = params['num_epochs']
    for epoch in range(num_epochs):
        for inputs, targets in loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            loss.backward()
            optimizer.step()
    
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor).squeeze()
        y_pred_proba = outputs.cpu().numpy()
    
    auc_score = roc_auc_score(y_test, y_pred_proba)
    return auc_score

# Define the grid of hyperparameters to search
param_grid = {
    'hidden_size': [50, 100, 150],
    'learning_rate': [0.1, 0.01, 0.001],
    'batch_size': [128, 256],
    'num_epochs': [5]  
}

# Generate all combinations of hyperparameters
param_combinations = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

# Search for the best hyperparameters
best_score = 0
best_params = None
for params in param_combinations:
    score = evaluate_model_auc(params, X_train, y_train, X_test, y_test, device)
    if score > best_score:
        best_score = score
        best_params = params

print(f"Best AUC: {best_score}")
print(f"Best hyperparameters: {best_params}")


Best AUC: 0.6992680078018688
Best hyperparameters: {'hidden_size': 100, 'learning_rate': 0.01, 'batch_size': 128, 'num_epochs': 5}


In [10]:
best_params = {
    'hidden_size': 100, 
    'learning_rate': 0.01,
    'batch_size': 128,
    'num_epochs': 5 
}

# Train the model with the best hyperparameters and evaluate on the test set
test_auc = evaluate_model_auc(best_params, X_train, y_train, X_test, y_test, device)

print(f"Test AUC with Best Hyperparameters: {test_auc}")


Test AUC with Best Hyperparameters: 0.7032653315794248


In [None]:
#Random forest

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
#Random forest 
df1 = pd.read_csv('churn.csv')
# Preprocess the data
data_prep = df1.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
categorical_features = ['Geography', 'Gender']
numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

X = data_prep.drop('Exited', axis=1)
y = data_prep['Exited']
X_preprocessed = preprocessor.fit_transform(X)


X_train1_b, X_test1_b, y_train1_b, y_test1_b = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)
X_train1, X_test1, y_train1, y_test1 = X_train1_b, X_test1_b, y_train1_b, y_test1_b
y_train1_b.sum(), y_test1_b.sum()

scaler = StandardScaler()
scaler.fit(X_train1_b)
scaler.mean_
X_train1 = scaler.transform(X_train1_b)
print("Train: ", X_train1.shape, "Positive examples: ", y_train1.sum())

X_test1 = scaler.transform(X_test1_b)
print("Test: ", X_test1.shape, "Positive examples: ", y_test1.sum())

df2 = df1.copy()
print(df1.columns)


model = RandomForestClassifier(n_estimators=100, max_depth=3)
model.fit(X_train1, y_train1)
importance = model.feature_importances_
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
importance = sorted(importance)
print(importance)
# Feature  (CustomerId,Geography,Gender,HasCrCard)
mask_d1 = np.array(importance)>0.05
print(mask_d1)

Train:  (8000, 13) Positive examples:  1644
Test:  (2000, 13) Positive examples:  393
Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')
Feature: 0, Score: 0.01605
Feature: 1, Score: 0.40014
Feature: 2, Score: 0.00076
Feature: 3, Score: 0.04079
Feature: 4, Score: 0.29500
Feature: 5, Score: 0.00027
Feature: 6, Score: 0.10943
Feature: 7, Score: 0.00524
Feature: 8, Score: 0.02211
Feature: 9, Score: 0.06027
Feature: 10, Score: 0.00859
Feature: 11, Score: 0.01756
Feature: 12, Score: 0.02380
[0.0002681635905900325, 0.0007614354612305828, 0.005236656890911803, 0.00859231085285973, 0.016045863815106132, 0.017557621787925096, 0.02210892876117666, 0.023798909244274503, 0.04079388593147097, 0.06026951548306544, 0.10943467599359048, 0.29499533970203184, 0.40013669248576683]
[False False False False False False False False Fals

In [16]:
df3=pd.read_csv("churn.csv")
DF_RF = df3[['CustomerId', 'Geography', 'Gender', 'HasCrCard','Exited']]
# Replace categorical variables with numeric values
# Preprocess the data
data_prep = DF_RF.drop([ 'CustomerId'], axis=1)
categorical_features = ['Geography', 'Gender']
numerical_features = ['HasCrCard']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

X = data_prep.drop('Exited', axis=1)
y = data_prep['Exited']
X_preprocessed = preprocessor.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.3, random_state=42)

In [17]:
# Define the grid of hyperparameters to search
param_grid = {
    'hidden_size': [50, 100, 150],
    'learning_rate': [0.1, 0.01, 0.001],
    'batch_size': [128, 256],
    'num_epochs': [5]  
}

# Generate all combinations of hyperparameters
param_combinations = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

# Search for the best hyperparameters
best_score = 0
best_params = None
for params in param_combinations:
    score = evaluate_model_auc(params, X_train, y_train, X_test, y_test, device)
    if score > best_score:
        best_score = score
        best_params = params

print(f"Best AUC: {best_score}")
print(f"Best hyperparameters: {best_params}")

Best AUC: 0.6366337005579243
Best hyperparameters: {'hidden_size': 50, 'learning_rate': 0.1, 'batch_size': 128, 'num_epochs': 5}


In [18]:
best_params = {
    'hidden_size': 50, 
    'learning_rate': 0.1,
    'batch_size': 128,
    'num_epochs': 5 
}

# Train the model with the best hyperparameters and evaluate on the test set
test_auc = evaluate_model_auc(best_params, X_train, y_train, X_test, y_test, device)

print(f"Test AUC with Best Hyperparameters: {test_auc}")

Test AUC with Best Hyperparameters: 0.6319736998775288


In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils import resample
from sklearn.inspection import permutation_importance

from scipy.stats import pointbiserialr, chi2_contingency
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

import time
from functools import wraps

import itertools
import matplotlib.backends.backend_pdf

In [12]:
df3=pd.read_csv("churn.csv")
# Exclude the target variable from the clustering
df_cluster = df3.drop(columns=['Exited','Geography', 'Gender','Surname'])

# Compute the correlation matrix
corr_matrix = df_cluster.corr().abs()
corr_matrix = corr_matrix.clip(lower=-1, upper=1)

# Turn the correlation matrix into a distance matrix
dist_matrix = 1 - corr_matrix

# Perform hierarchical/agglomerative clustering
clusters = linkage(squareform(dist_matrix), method="average")

# Form flat clusters from the hierarchical clustering defined by the linkage matrix
num_clusters = df_cluster.shape[1] // 3
cluster_labels = fcluster(clusters, num_clusters, criterion="maxclust")

# Select the most representative variable from each cluster
selected_features = []
for i in range(1, num_clusters + 1):
    cluster_vars = [
        var
        for var, cluster in zip(df_cluster.columns, cluster_labels) 
        if cluster == i
    ]

    # Select the variable with the highest sum of correlations with other variables in the cluster
    var_correlations = corr_matrix.loc[cluster_vars, cluster_vars].sum()
    most_representative = var_correlations.idxmax()
    selected_features.append(most_representative)
    # Add other variables in the cluster that have a correlation less than 50% with the most representative variable 
    for var in cluster_vars:
         if (
            var != most_representative
            and corr_matrix.loc[most_representative, var] < 0.5 
        ):
            selected_features.append(var)

# Update the Dataframe to include only the selected features, along with the target variable
    #df3 = df3[selected_features + 'Exited']
print(selected_features)

df_clustered=df3[['Geography', 'Gender','Balance', 'CustomerId', 'Age', 'Tenure', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'CreditScore', 'RowNumber','Exited']]


['Balance', 'CustomerId', 'Age', 'Tenure', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'CreditScore', 'RowNumber']


In [19]:
# Replace categorical variables with numeric values
# Preprocess the data
data_prep = df_clustered.drop([ 'CustomerId', 'RowNumber'], axis=1)
categorical_features = ['Geography', 'Gender']
numerical_features = ['Balance', 'Age', 'Tenure', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'CreditScore']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

X = data_prep.drop('Exited', axis=1)
y = data_prep['Exited']
X_preprocessed = preprocessor.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.3, random_state=42)

In [20]:
def evaluate_model_auc(params, X_train, y_train, X_test, y_test, device):
    model = NeuralNetwork(X_train.shape[1], params['hidden_size']).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])
    
    X_train_tensor = torch.FloatTensor(X_train).to(device)
    y_train_tensor = torch.FloatTensor(y_train.to_numpy()).to(device)
    X_test_tensor = torch.FloatTensor(X_test).to(device)
    
    dataset = TensorDataset(X_train_tensor, y_train_tensor)
    loader = DataLoader(dataset, batch_size=params['batch_size'], shuffle=True)
    
    num_epochs = params['num_epochs']
    for epoch in range(num_epochs):
        for inputs, targets in loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            loss.backward()
            optimizer.step()
    
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor).squeeze()
        y_pred_proba = outputs.cpu().numpy()
    
    auc_score = roc_auc_score(y_test, y_pred_proba)
    return auc_score


In [21]:
# Define the grid of hyperparameters to search
param_grid = {
    'hidden_size': [50, 100, 150],
    'learning_rate': [0.1, 0.01, 0.001],
    'batch_size': [128, 256],
    'num_epochs': [5]  
}

# Generate all combinations of hyperparameters
param_combinations = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

# Search for the best hyperparameters
best_score = 0
best_params = None
for params in param_combinations:
    score = evaluate_model_auc(params, X_train, y_train, X_test, y_test, device)
    if score > best_score:
        best_score = score
        best_params = params

print(f"Best AUC: {best_score}")
print(f"Best hyperparameters: {best_params}")

Best AUC: 0.859242464619432
Best hyperparameters: {'hidden_size': 100, 'learning_rate': 0.1, 'batch_size': 128, 'num_epochs': 5}


In [22]:
best_params = {
    'hidden_size': 100, 
    'learning_rate': 0.1,
    'batch_size': 128,
    'num_epochs': 5 
}

# Train the model with the best hyperparameters and evaluate on the test set
test_auc = evaluate_model_auc(best_params, X_train, y_train, X_test, y_test, device)

print(f"Test AUC with Best Hyperparameters: {test_auc}")

Test AUC with Best Hyperparameters: 0.8576095153315795
