In [38]:
import pandas as pd

def load_data(filepath):
    """Read the CSV file at ``filepath`` into a pandas DataFrame.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, read with ``pd.read_csv`` default settings.
    """
    return pd.read_csv(filepath)

# Read the BankChurners CSV; the path is relative to the notebook's working directory.
df = load_data('../data/BankChurners.csv')


# Preprocessing

### Cleaning: dropping unused columns

In [39]:
def clean_df(df):
    """Return a new frame without the last two columns and without ``CLIENTNUM``.

    The last two columns are removed by position, not by name, so calling this
    on an already-cleaned frame slices off two further columns.
    (Presumably the trailing columns are auxiliary fields of the source CSV —
    TODO confirm against the dataset.)

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to all but its last two columns, minus ``CLIENTNUM``.

    Raises
    ------
    KeyError
        If ``CLIENTNUM`` is not among the columns left after the slice.
    """
    kept_columns = df.columns[:-2]
    trimmed = df.loc[:, kept_columns]
    return trimmed.drop(columns=['CLIENTNUM'])

# Rebinds `df` to the cleaned frame. Not idempotent: re-running this cell alone
# applies clean_df to the already-cleaned frame (two more columns are sliced off
# and the CLIENTNUM drop then raises KeyError) — re-run the load cell first.
df = clean_df(df)

### Rebalancing the target variable (SMOTE) — TODO: not applied yet

### Splitting

In [40]:
from sklearn.model_selection import train_test_split
# Hold-out split: pull the target column out of the frame, then reserve 20% of
# the rows for testing (fixed seed so the split is reproducible).
# NOTE(review): the split is not stratified although the classes are imbalanced
# (327 vs 1699 rows in the test report) — consider passing stratify=y.
y = df['Attrition_Flag']
X = df.drop(columns=['Attrition_Flag'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Logistic Regression Model

In [41]:
# Identifying categoricals and numericals
# Columns are partitioned by dtype on the training features: object/category
# columns go to the categorical list, every other dtype to the numerical list.
# The two selections use the same dtype set (include vs. exclude), so together
# they cover every column of X_train exactly once.
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
numerical_cols = X_train.select_dtypes(exclude=['object', 'category']).columns

Pipeline creation

In [42]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

# Final estimator: logistic regression with the solver iteration cap set to 1000.
logreg = LogisticRegression(max_iter=1000)

# Numeric branch: fill missing values with the column mean, then standardize.
numerical_pipeline = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())

# Route each column group to its own transformer. Categories not seen during
# fit are ignored by the encoder instead of raising; any column not listed in
# either group is passed through unchanged.
preprocessor = ColumnTransformer(
    [
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', numerical_pipeline, numerical_cols),
    ],
    remainder='passthrough',
)

# Full model: preprocessing followed by the classifier, fitted as one unit.
logpipe = make_pipeline(preprocessor, logreg)



Fitting

In [44]:
# Fit the preprocessing steps and the logistic regression on the training split only.
logpipe.fit(X_train, y_train)

### Evaluation

In [45]:
# Hard class predictions for the held-out test split (reused by the classification report).
y_pred = logpipe.predict(X_test)

Accuracy (`score` on a classifier pipeline returns mean accuracy, not R²)

In [46]:
# The pipeline's score() delegates to the classifier, so both numbers are mean
# accuracies. Both are rounded to 3 decimals so that train and test values are
# directly comparable (the test score was previously rounded to 2).
print(
    "train_score: ", round(logpipe.score(X_train, y_train), 3),
    "\ntest_score: ", round(logpipe.score(X_test, y_test), 3),
)

train_score:  0.905 
test_score:  0.9


ROC Area Under Curve

In [47]:
from sklearn.metrics import roc_auc_score

# ROC AUC on the test split, scored with column 1 of predict_proba, i.e. the
# predicted probability of the second entry in `logpipe.classes_`.
second_class_proba = logpipe.predict_proba(X_test)[:, 1]
test_roc_auc = roc_auc_score(y_test, second_class_proba)
print("ROC_AUC: ", round(test_roc_auc, 3))


ROC_AUC:  0.917


Classification report

In [48]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 on the test split, computed from the hard
# predictions in y_pred; printed so the text table keeps its alignment.
report = classification_report(y_test, y_pred)
print(report)

                   precision    recall  f1-score   support

Attrited Customer       0.76      0.54      0.63       327
Existing Customer       0.92      0.97      0.94      1699

         accuracy                           0.90      2026
        macro avg       0.84      0.75      0.79      2026
     weighted avg       0.89      0.90      0.89      2026



Cross-validation check

In [49]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the full pipeline on the training split, scored by
# accuracy; train scores are requested too so both can be compared.
cv_results = cross_validate(
    logpipe, X_train, y_train, cv=5, scoring='accuracy', return_train_score=True
)

# Report the mean over folds of each per-fold array in the results.
for label, key in (
    ("Mean Test Accuracy:", 'test_score'),
    ("Mean Train Accuracy:", 'train_score'),
    ("Mean Fit Time:", 'fit_time'),
    ("Mean Score Time:", 'score_time'),
):
    print(label, round(cv_results[key].mean(), 3))


Mean Test Accuracy: 0.905
Mean Train Accuracy: 0.906
Mean Fit Time: 0.073
Mean Score Time: 0.011
