## Bank Churners

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import constants
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from graphs import plot_learning_curve

# Read the raw Bank Churners table from the project's datasets folder.
data = pd.read_csv('./datasets/BankChurners.csv')

# Name of the label column, plus the seed and hold-out fraction shared across
# the project (both come from the project-local `constants` module).
target_column = 'Attrition_Flag'
random_state = constants.RANDOM_STATE
test_size = constants.TEST_SIZE

# Feature selection: only these demographic columns are used. The four
# categorical ones are one-hot encoded below; Customer_Age is passed through
# untouched (presumably already numeric -- confirm against the CSV).
categorical_cols = ["Gender", "Education_Level", "Marital_Status", "Income_Category"]
columns_to_keep = ["Customer_Age", *categorical_cols]

# Feature matrix (X) and target vector (y).
X = data[columns_to_keep]
y = data[target_column]

# Quick size report: rows in the raw table, columns in the selected features.
num_entries, num_features = len(data), len(X.columns)
print("Number of entries:", num_entries)
print("Number of features:", num_features)

# One-hot encode the categorical features; drop_first=True omits one dummy
# level per column.
X_encoded = pd.get_dummies(
    X,
    columns=categorical_cols,
    drop_first=True,
)

# Balance the classes by randomly discarding rows of the majority class only.
# NOTE: this happens before the train/test split, so the test set is balanced
# as well.
undersampler = RandomUnderSampler(
    sampling_strategy='majority',
    random_state=random_state,
)
X_resampled, y_resampled = undersampler.fit_resample(X_encoded, y)

# Bar chart of the class counts after undersampling.
sns.countplot(
    data=pd.DataFrame({'Attrition_Flag': y_resampled}),
    x='Attrition_Flag',
)
plt.gca().set(
    xlabel='Attrition_Flag',
    ylabel='Count',
    title='Attrition Flag Counts (After Undersampling)',
)
plt.show()

# Same information in numbers: rows per class label (the original notebook
# names the labels as 'Existing Customer' and 'Attrited Customer').
counts = pd.Series(y_resampled).value_counts()
print(counts)

# Hold out a test set from the balanced data.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=test_size,
    random_state=random_state,
)


### Decision Trees

In [None]:
# `dt` is a project-local helper; it is handed the train/test split and the
# CV setting and returns a (model, params) pair -- presumably the best
# estimator found by a hyper-parameter search (confirm in dt.py).
from dt import dt

best_dt_model, best_dt_params = dt(
    X_train,
    X_test,
    y_train,
    y_test,
    constants.CV,
)

In [None]:
# Re-imported so this cell does not rely on the import in the first cell.
from graphs import plot_learning_curve

# Learning curve for the decision tree on the training split, scored by
# accuracy with the project-wide CV setting.
plot_learning_curve(
    best_dt_model,
    X_train,
    y_train,
    cv=constants.CV,
    scoring='accuracy',
    title="Decision Tree Learning Curve",
)

### Neural Networks

In [None]:
# `nn` is a project-local helper; it is handed the train/test split and the
# CV setting and returns a (model, params) pair -- presumably the best
# estimator found by a hyper-parameter search (confirm in nn.py).
from nn import nn

best_nn_model, best_nn_params = nn(
    X_train,
    X_test,
    y_train,
    y_test,
    constants.CV,
)

In [None]:

# Learning curve for the neural-network model returned by nn(...), on the
# training split, scored by accuracy with the project-wide CV setting.
# The title names the model being plotted so the figure is not mistaken for
# the decision-tree curve produced earlier in the notebook.
plot_learning_curve(
    best_nn_model,
    X_train,
    y_train,
    cv=constants.CV,
    scoring='accuracy',
    title="Neural Network Learning Curve",
)