In [None]:
import pandas as pd 
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import ADASYN

In [None]:
# Render every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

## Dataset Info

In [None]:
# Download the churn dataset and save it as churn.csv in the working directory.
!wget https://raw.githubusercontent.com/ivtipm/ML/main/datasets/churn.csv -O churn.csv

In [None]:
# Load the downloaded CSV into a DataFrame and show it as the cell output.
df = pd.read_csv("churn.csv")
df

In [None]:
# Print the column list with non-null counts and dtypes.
df.info()

In [None]:
# Show descriptive statistics of the frame.
df.describe()

In [None]:
# Count rows that exactly repeat an earlier row.
n_duplicates = df.duplicated().sum()
print(f"Duplicates: №{n_duplicates}")

## Graphs

Классы несбалансированны

In [None]:
# Distribution of the target column.
sb.histplot(data=df, x='churn', discrete=True)

In [None]:
# Distribution of the income category column.
sb.histplot(data=df, x='Income_Category', discrete=True)

In [None]:
# Customer age histogram, coloured by churn.
fig, ax = plt.subplots(figsize=(15, 7))
sb.histplot(data=df, x='Customer_Age', hue='churn', ax=ax)

In [None]:
# Customer age histogram, coloured by income category.
fig, ax = plt.subplots(figsize=(15, 7))
sb.histplot(data=df, x='Customer_Age', hue='Income_Category', ax=ax)

Клиенты с малым числом транзакций (Total_Trans_Ct) уходят чаще

In [None]:
# Transaction count histogram, coloured by churn.
fig, ax = plt.subplots(figsize=(15, 7))
sb.histplot(data=df, x='Total_Trans_Ct', hue='churn', ax=ax)

In [None]:
def get_numerical_cols(df: pd.DataFrame):
    """Return the labels of all numeric columns of ``df``.

    Uses the generic ``'number'`` dtype selector instead of a hand-written
    dtype list, so narrow or unsigned integer columns (``int8``, ``uint8``,
    ...) are recognised as numeric too.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pd.Index
        Column labels with a numeric dtype, in their original order.
    """
    return df.select_dtypes(include="number").columns

def get_obj_cols(df: pd.DataFrame):
    """Return the labels of all non-numeric columns of ``df``.

    Excludes every numeric dtype through the generic ``'number'`` selector,
    so narrow or unsigned integer columns (``int8``, ``uint8``, ...) are no
    longer reported as categorical.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pd.Index
        Column labels whose dtype is not numeric (strings, booleans, ...),
        in their original order.
    """
    return df.select_dtypes(exclude="number").columns

In [None]:
from math import ceil

# One box plot per numeric column, laid out on a grid that is `ncols` wide.
num_cols = get_numerical_cols(df)
num_colsn_n = len(num_cols)

ncols = 3
nrows = ceil(num_colsn_n / ncols)

# squeeze=False keeps `axs` two-dimensional even when the grid has a single row.
fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(16, 16), squeeze=False)
grid_axes = axs.ravel()  # row-major: the i-th column goes to cell (i // ncols, i % ncols)
for ax, c in zip(grid_axes, num_cols):
    sb.boxplot(df[c].values, ax=ax).set_title(c)

# Hide the cells of the last row that received no plot.
for ax in grid_axes[num_colsn_n:]:
    ax.set_visible(False)

In [None]:
# Total transaction amount per churn value, split by gender.
sb.boxplot(
    data=df,
    x='churn',
    y='Total_Trans_Amt',
    hue='Gender',
)

In [None]:
# Average utilization ratio per churn value, split by gender.
sb.boxplot(
    data=df,
    x='churn',
    y='Avg_Utilization_Ratio',
    hue='Gender',
)

In [None]:
#sb.violinplot(x=df["Total_Revolving_Bal"])

In [None]:
# Pairwise relationship plots over the frame, coloured by the churn column.
sb.pairplot(df, hue='churn')

In [None]:
#sb.pairplot(df, hue='Income_Category')

Корреляция признаков

In [None]:
# Correlate numeric columns only: DataFrame.corr() raises on string columns
# in pandas >= 2.0, and this frame contains categorical text columns.
corr_matrix = df.select_dtypes(include='number').corr()

plt.figure(figsize=(8, 8))
sb.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")

In [None]:
# List the column labels of the raw frame.
df.columns

## Clean dataset

In [None]:
# Work on a copy so the raw frame `df` stays untouched by the cleaning steps.
df_cl = df.copy()

In [None]:
# Print the distinct values of every non-numeric column.
obj_cols = get_obj_cols(df_cl)
for col_name in obj_cols:
    distinct_values = df_cl[col_name].unique()
    print(f"Unique values of {col_name}: {distinct_values}")

In [None]:
# Disabled row filters that would remove rows with an 'Unknown' category:
# # drop unknown income categories
# df_cl = df_cl[df_cl['Income_Category'] != 'Unknown']

# # drop unknown marital status
# df_cl = df_cl[df_cl['Marital_Status'] != 'Unknown']

# # drop unknown education level
# df_cl = df_cl[df_cl['Education_Level'] != 'Unknown']

# Drop the Education_Level column entirely.
df_cl = df_cl.drop(columns=['Education_Level'])

# Print the distinct values of the remaining non-numeric columns.
obj_cols = get_obj_cols(df_cl)
for col_name in obj_cols:
    remaining_values = df_cl[col_name].unique()
    print(f"Unique values of {col_name}: {remaining_values}")

In [None]:
# Wider alternative that is currently disabled:
#cols_to_drop=['Months_on_book', 'Credit_Limit', 'Total_Revolving_Bal', 'Total_Trans_Ct', 'Avg_Open_To_Buy']
cols_to_drop = ['Months_on_book', 'Credit_Limit', 'Total_Revolving_Bal']
# Reassign instead of mutating in place so the lineage of df_cl stays explicit.
df_cl = df_cl.drop(columns=cols_to_drop)

# Correlate numeric columns only: DataFrame.corr() raises on string columns
# in pandas >= 2.0.
plt.figure(figsize=(8, 8))
sb.heatmap(df_cl.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm', fmt=".2f")

## Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer, LabelBinarizer, LabelEncoder, Normalizer, RobustScaler, QuantileTransformer, minmax_scale

In [None]:
# Encode on a copy so the cleaned frame `df_cl` stays unchanged.
df_enc = df_cl.copy()

In [None]:
# Encode the multi-class income category as integer labels.
# NOTE(review): the integer codes presumably do not follow the income ranking — confirm this is acceptable.
le = LabelEncoder()
df_enc['Income_Category'] = le.fit_transform(df_enc['Income_Category'])    

# Encode the two-class target column.
lb = LabelBinarizer()
df_enc['churn'] = lb.fit_transform(df_enc['churn'])    

In [None]:
obj_cols = get_obj_cols(df_enc)

# One-hot encode every remaining non-numeric feature and remember the names
# of the generated indicator columns in `ohe_cols`.
ohe_cols = []
for c in obj_cols:
    ohe = OneHotEncoder()
    encoded = ohe.fit_transform(df_enc[[c]]).toarray()

    # A category value is used as the column name, as before, unless that
    # name is already taken (e.g. the same 'Unknown' value in two features);
    # then it is prefixed with the source column so nothing is overwritten.
    new_names = []
    for category in ohe.categories_[0]:
        name = category
        if name in df_enc.columns or name in new_names:
            name = f"{c}_{category}"
        new_names.append(name)

    df_enc[new_names] = encoded
    ohe_cols += new_names

# Remove the original text columns now that they are encoded.
df_enc.drop(columns=obj_cols, inplace=True)
df_enc

In [None]:
# Add the label-encoded columns to the list of already-encoded columns.
# dict.fromkeys de-duplicates while keeping order, so re-running this cell
# does not append the same names a second time.
ohe_cols = list(dict.fromkeys(ohe_cols + ['churn', 'Income_Category']))
ohe_cols

In [None]:
# Columns that were not encoded above. Built in the frame's column order so
# the list is the same on every run (a set difference has no stable order).
encoded_cols = set(ohe_cols)
transform_cols = [col for col in df_enc.columns if col not in encoded_cols]
transform_cols

In [None]:
# Map the non-encoded columns through a quantile transform.
# random_state is fixed because the transformer can subsample its input when
# estimating quantiles, which would otherwise vary between runs.
# NOTE(review): the transformer is fitted on the whole frame before any
# train/test split, so held-out rows influence the transform.
qt = QuantileTransformer(random_state=42)
df_enc[transform_cols] = qt.fit_transform(df_enc[transform_cols])

In [None]:
# rs = RobustScaler()      
# df_enc[transform_cols] = rs.fit_transform(df_enc[transform_cols])      

In [None]:
# n = Normalizer(norm='max')      
# df_enc[ohe_cols] = n.fit_transform(df_enc[ohe_cols])      

In [None]:
# Min-max scale the encoded columns.
# NOTE(review): ohe_cols also contains 'churn', so the target column is passed through the scaler as well.
df_enc[ohe_cols] = minmax_scale(df_enc[ohe_cols])

In [None]:
# scaler = StandardScaler()  
# df_enc[transform_cols] = scaler.fit_transform(df_enc[transform_cols])      

In [None]:
# Show the encoded and scaled frame.
df_enc

## Datasets

In [None]:
from sklearn.model_selection import train_test_split
from collections import Counter

### Base

In [None]:
# Target column name, reused by the sampling cells.
prediction_label = 'churn'

Y = df_enc[prediction_label]
X = df_enc.drop(columns=[prediction_label])

# The classes are imbalanced, so stratify on the target to keep the same
# class ratio in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, shuffle=True, stratify=Y
)

In [None]:
# Class counts of the full target column.
base_class_counts = Counter(Y)
print(f"Original dataset shape {base_class_counts}")

### Undersampling

In [None]:
# Balance the classes by down-sampling the over-represented class.
# The majority and minority labels are taken from the actual counts rather
# than assumed, so the cell works whichever label the encoder gave the
# larger class.
class_counts = df_enc[prediction_label].value_counts()  # sorted, largest first
majority_label = class_counts.index[0]
minority_label = class_counts.index[-1]

msk_majority = df_enc[prediction_label] == majority_label
msk_minority = df_enc[prediction_label] == minority_label

# Draw as many majority rows as there are minority rows (without replacement).
df_majority_undersample = df_enc[msk_majority].sample(n=msk_minority.sum(), random_state=888)
df_undersample = pd.concat([df_majority_undersample, df_enc[msk_minority]])


In [None]:
# Features/target of the undersampled frame and their 80/20 split.
uX = df_undersample.drop(columns=[prediction_label])
uY = df_undersample[prediction_label]

uX_train, uX_test, uy_train, uy_test = train_test_split(
    uX,
    uY,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
# Class counts after undersampling.
undersampled_class_counts = Counter(uY)
print(f"Undersampled dataset shape {undersampled_class_counts}")

### Oversampling

In [None]:
# Oversample with ADASYN on the full feature/target set, then split 80/20.
# NOTE(review): resampling happens before the split, so synthetic rows in the
# test part may be derived from rows in the train part — confirm this is intended.
ada = ADASYN(random_state=42)
X_res, y_res = ada.fit_resample(X, Y)

resX_train, resX_test, resy_train, resy_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42, shuffle=True)   

In [None]:
# Class counts after oversampling.
resampled_class_counts = Counter(y_res)
print(f"Resampled dataset shape {resampled_class_counts}")

In [None]:
def split(X, y, test_size=0.2, random_state=42, stratify=False):
    """Shuffle and split features/target into train and test parts.

    Called with only ``X`` and ``y`` it behaves as before: a shuffled 80/20
    split with ``random_state=42``.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values aligned with ``X``.
    test_size : float, default 0.2
        Fraction of samples placed in the test part.
    random_state : int, default 42
        Seed for the shuffle, for reproducible splits.
    stratify : bool, default False
        When true, preserve the class proportions of ``y`` in both parts
        (useful for imbalanced classification targets).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
        stratify=y if stratify else None,
    )
    return X_train, X_test, y_train, y_test

In [None]:
# (features, target, display name) for every dataset variant under comparison.
pairs = (
    (X, Y, "Base"),
    (uX, uY, "Undersampled"),
    (X_res, y_res, "Oversampled"),
)

## Models

### LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, fbeta_score, roc_auc_score, confusion_matrix, accuracy_score

In [None]:
# Set to True to also report metrics on the training part of each dataset.
show_train_metrics = False


def _report_metrics(title, y_true, y_hat):
    """Print accuracy, F1, the confusion matrix and the classification report."""
    print(f"{'='*10} {title} {'='*10}")
    print(f"Accuracy {accuracy_score(y_true, y_hat)} F1: {f1_score(y_true, y_hat)}")
    print("Confusion Matrix \n", confusion_matrix(y_true, y_hat))
    print("Classification Report \n", classification_report(y_true, y_hat))


# Fit and evaluate a logistic regression on every dataset variant.
for features, target, dataset_name in pairs:
    X_train, X_test, y_train, y_test = split(features, target)
    lr_model = LogisticRegression(max_iter=200)
    lr_model.fit(X_train, y_train)

    # Train predictions are only computed when they are going to be shown.
    if show_train_metrics:
        y_pred_train = lr_model.predict(X_train)
        _report_metrics(f"{dataset_name} Train", y_train, y_pred_train)

    y_pred = lr_model.predict(X_test)
    _report_metrics(f"{dataset_name} Test", y_test, y_pred)

### Regularization

In [None]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std
from numpy import absolute
from sklearn.metrics import r2_score
from numpy import arange
from sklearn.model_selection import GridSearchCV

In [None]:
def model_cross_val_score(model, scorings, datasets=None):
    """Cross-validate ``model`` on several datasets and print mean/std per metric.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style estimator; it is cloned by
        ``cross_val_score`` for every fold.
    scorings : iterable
        Scoring names (or callables) accepted by ``cross_val_score``.
    datasets : iterable of (X, y, name), optional
        Datasets to evaluate on. Defaults to the notebook-level ``pairs``.

    Notes
    -----
    Only ``neg_*`` scorers are sign-flipped, so that errors print as positive
    numbers. Other metrics keep their sign: a negative ``r2`` means the model
    is worse than predicting the mean and must not be shown as positive.
    """
    if datasets is None:
        datasets = pairs

    # The seeded splitter yields the same folds on every use, so one instance
    # is shared by all datasets and metrics.
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=0)

    for X_data, y_data, name in datasets:
        print(f"{'='*5} Result on {name} dataset:")
        for scoring in scorings:
            scores = cross_val_score(model, X_data, y_data, scoring=scoring, cv=cv, n_jobs=-1)
            if isinstance(scoring, str) and scoring.startswith("neg_"):
                scores = -scores
            print(f"{scoring}: mean {mean(scores):1.4f} std {std(scores):1.4f}")

In [None]:
def grid_search(model_class, X, y, scoring, grid):
    """Grid-search ``grid`` for a default-constructed ``model_class``.

    Uses 10-fold cross-validation repeated 3 times (seeded), prints the best
    score and the matching parameters, and returns those parameters.

    Parameters
    ----------
    model_class : type
        Estimator class; instantiated with no arguments.
    X, y : array-like
        Features and target passed to the search.
    scoring : str
        Scoring name passed to ``GridSearchCV``.
    grid : dict
        Parameter grid to explore.

    Returns
    -------
    dict
        The best parameter combination found.
    """
    estimator = model_class()
    folds = RepeatedKFold(n_splits=10, n_repeats=3, random_state=0)
    fitted_search = GridSearchCV(
        estimator, grid, scoring=scoring, cv=folds, n_jobs=-1
    ).fit(X, y)

    best_params = fitted_search.best_params_
    print(f"Best {scoring} score: {fitted_search.best_score_:1.4f}")
    print(f"Config: {best_params}")
    return best_params

In [None]:
from itertools import chain, combinations

def powerset(iterable):
    """Yield every subset of ``iterable`` as a tuple, smallest subsets first.

    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    items = list(iterable)  # materialise once; duplicate elements are allowed
    subset_sizes = range(len(items) + 1)
    per_size = (combinations(items, size) for size in subset_sizes)
    return chain.from_iterable(per_size)

def grid_search_drops(model_class, X, y, scoring, grid=None):
    """Search jointly over feature subsets and a hyper-parameter grid.

    For every subset of columns to drop from ``X`` a grid search is run on the
    remaining columns; the best-scoring (drop set, parameters) pair is printed
    and the parameters are returned.

    The number of subsets grows as 2**n_columns, so this is only practical
    for a small number of columns.

    Parameters
    ----------
    model_class : type
        Estimator class; instantiated with no arguments for each subset.
    X : pd.DataFrame
        Feature frame.
    y : array-like
        Target values.
    scoring : str
        Scoring name passed to ``GridSearchCV``.
    grid : dict, optional
        Parameter grid. Defaults to ``{'alpha': arange(0.001, 1, 0.2)}``.

    Returns
    -------
    dict
        Best parameters found; empty if no subset could be evaluated.
    """
    if grid is None:
        grid = {'alpha': arange(0.001, 1, 0.2)}

    all_cols = list(X.columns)
    # The seeded splitter yields the same folds for every subset.
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=0)

    # -inf rather than -1: scores such as r2 can be lower than -1.
    best_score = float("-inf")
    best_conf = {}
    best_drop = []
    for i, combo in enumerate(powerset(all_cols), 1):
        if len(combo) == len(all_cols):
            # Dropping every column leaves nothing to fit on.
            continue
        if i % 50 == 0:
            print('Testing combo #{}: {}'.format(i, list(combo)))
        _X = X.drop(columns=list(combo))
        search = GridSearchCV(model_class(), grid, scoring=scoring, cv=cv, n_jobs=-1)
        results = search.fit(_X, y)
        if best_score < results.best_score_:
            best_score = results.best_score_
            best_conf = results.best_params_
            best_drop = list(combo)
    print(f"Best {scoring} score: {best_score:1.4f}")
    # Report the parameters of the best subset, not of the last one tried.
    print(f"Config: {best_conf}")
    print(f"Best Drop: {best_drop}")
    return best_conf

In [None]:
# Search grid and metrics shared by the regularised linear models below.
reg_grid = {'alpha': arange(0.001, 1, 0.05)}
reg_scorings = ['neg_mean_absolute_error', 'r2']  # metrics reported by cross-validation
reg_scoring = 'r2'  # metric optimised by the grid search

### L1
$ L_{reg} = L(b) + \lambda \sum \limits_{j=1}^{n} |b_j| $


In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Tune Lasso on each dataset variant, then cross-validate the tuned model.
for features, target, dataset_name in pairs:
    print(f"{'*'*10} Grid Search on {dataset_name} dataset {'*'*10}")
    params = grid_search(Lasso, features, target, scoring=reg_scoring, grid=reg_grid)
    tuned_model = Lasso(**params)
    model_cross_val_score(tuned_model, scorings=reg_scorings)

### L2
$ L_{reg} = L(b) + \lambda \sum \limits_{j=1}^{n} b_j^2 $

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Tune Ridge on each dataset variant, then cross-validate the tuned model.
for features, target, dataset_name in pairs:
    print(f"{'*'*10} Grid Search on {dataset_name} dataset {'*'*10}")
    params = grid_search(Ridge, features, target, scoring=reg_scoring, grid=reg_grid)
    tuned_model = Ridge(**params)
    model_cross_val_score(tuned_model, scorings=reg_scorings)

### ElasticNet
$ L_{reg} = L(b) + \lambda_1 \sum \limits_{j=1}^{n} |b_j| + \lambda_2 \sum \limits_{j=1}^{n} b_j^2 $

In [None]:
from sklearn.linear_model import ElasticNet

In [None]:
# Tune ElasticNet on each dataset variant, then cross-validate the tuned model.
for features, target, dataset_name in pairs:
    print(f"{'*'*10} Grid Search on {dataset_name} dataset {'*'*10}")
    params = grid_search(ElasticNet, features, target, scoring=reg_scoring, grid=reg_grid)
    tuned_model = ElasticNet(**params)
    model_cross_val_score(tuned_model, scorings=reg_scorings)

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Search grid and metrics for the k-nearest-neighbours classifier.
knn_grid = {'n_neighbors': arange(1, 50, 2)}  # odd neighbour counts 1..49
knn_scorings = ['f1', 'accuracy']  # metrics reported by cross-validation
knn_scoring = 'f1'  # metric optimised by the grid search

In [None]:
# Tune KNN on each dataset variant, then cross-validate the tuned model.
for features, target, dataset_name in pairs:
    print(f"{'*'*10} Grid Search on {dataset_name} dataset {'*'*10}")
    params = grid_search(KNeighborsClassifier, features, target, scoring=knn_scoring, grid=knn_grid)
    tuned_model = KNeighborsClassifier(**params)
    model_cross_val_score(tuned_model, knn_scorings)
    

### SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Search grid and metrics for the support vector classifier.
svc_grid = {
    'C': arange(2, 6, 0.5),
    'kernel': ['linear'],  # other kernels not searched: 'poly', 'rbf', 'sigmoid', 'precomputed'
}
svc_scorings = ['f1', 'accuracy']  # metrics reported by cross-validation
svc_scoring = 'f1'  # metric optimised by the grid search

In [None]:
# Tune the SVC on each dataset variant, then cross-validate the tuned model.
for features, target, dataset_name in pairs:
    print(f"{'*'*10} Grid Search on {dataset_name} dataset {'*'*10}")
    params = grid_search(SVC, features, target, scoring=svc_scoring, grid=svc_grid)
    tuned_model = SVC(**params)
    model_cross_val_score(tuned_model, svc_scorings)
    