In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix

In [3]:
df = pd.read_csv('churn_data.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [5]:
df.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
df = pd.get_dummies(df, columns=['Geography', 'Gender'])
df.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,1,15634602,Hargrave,619,42,2,0.0,1,1,1,101348.88,1,1,0,0,1,0
1,2,15647311,Hill,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1,1,0
2,3,15619304,Onio,502,42,8,159660.8,3,1,0,113931.57,1,1,0,0,1,0
3,4,15701354,Boni,699,39,1,0.0,2,0,0,93826.63,0,1,0,0,1,0
4,5,15737888,Mitchell,850,43,2,125510.82,1,1,1,79084.1,0,0,0,1,1,0


In [7]:
scaler = StandardScaler()
cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
scaled = scaler.fit_transform(df[cols])
df[cols] = scaled
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,-4.824585e-16,2.318146e-16,-1.078249e-16,-6.252776000000001e-17,1.634248e-17,0.7055,0.5151,-2.8776980000000004e-17,0.2037,0.5014,0.2509,0.2477,0.4543,0.5457
std,2886.89568,71936.19,1.00005,1.00005,1.00005,1.00005,1.00005,0.45584,0.499797,1.00005,0.402769,0.500023,0.433553,0.431698,0.497932,0.497932
min,1.0,15565700.0,-3.109504,-1.994969,-1.733315,-1.225848,-0.9115835,0.0,0.0,-1.740268,0.0,0.0,0.0,0.0,0.0,0.0
25%,2500.75,15628530.0,-0.6883586,-0.6600185,-0.6959818,-1.225848,-0.9115835,0.0,0.0,-0.8535935,0.0,0.0,0.0,0.0,0.0,0.0
50%,5000.5,15690740.0,0.01522218,-0.1832505,-0.004425957,0.3319639,-0.9115835,1.0,1.0,0.001802807,0.0,1.0,0.0,0.0,0.0,1.0
75%,7500.25,15753230.0,0.6981094,0.4842246,0.6871299,0.8199205,0.8077366,1.0,1.0,0.8572431,0.0,1.0,1.0,0.0,1.0,1.0
max,10000.0,15815690.0,2.063884,5.061197,1.724464,2.795323,4.246377,1.0,1.0,1.7372,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
def get_metrics(y_test, y_score, b=1):
    metrics = {}
    
    precision, recall, thresholds = precision_recall_curve(y_test.values, y_score)
    fscore = (1+b**2)*(precision * recall) / (b**2*precision + recall)
    ix = np.argmax(fscore)
    
    metrics['threshold'] = thresholds[ix]
    metrics['fscore'] = fscore[ix]
    metrics['precision'] = precision[ix]
    metrics['recall'] = recall[ix]
    metrics['roc_auc'] = roc_auc_score(y_test, y_score)
    return metrics

def run_classifier(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    y_score = clf.predict_proba(X_test)[:, 1]
    return y_score, get_metrics(y_test, y_score)

In [9]:
drop_cols = ['Surname', 'RowNumber', 'CustomerId', 'Exited']
X_train, X_test, y_train, y_test = train_test_split(df.drop(drop_cols, axis=1),
                                                    df['Exited'], random_state=0)

In [10]:
clfs = {}
clfs['LogisticRegression'] = LogisticRegression(random_state=42)
clfs['RandomForestClassifier'] = RandomForestClassifier(max_depth=7, random_state=42)
clfs['KNeighborsClassifier'] = KNeighborsClassifier(n_neighbors=10)
clfs['GradientBoostingClassifier'] = GradientBoostingClassifier(n_estimators=300, max_depth=7)

results = []
max_roc_auc = 0
for name, clf in clfs.items():
    proba, metrics = run_classifier(clf, X_train, X_test, y_train, y_test)
    
    d = {'classifier': name}
    d.update(metrics)
    results.append(pd.DataFrame(d, index=[0]))
    
    if metrics['roc_auc'] > max_roc_auc:
        max_roc_auc = metrics['roc_auc']
        best_proba = proba
        best_metrics = metrics
    
pd.concat(results, ignore_index=True).sort_values(by='roc_auc', ascending=False)

Unnamed: 0,classifier,threshold,fscore,precision,recall,roc_auc
1,RandomForestClassifier,0.30867,0.636719,0.63301,0.640472,0.865019
3,GradientBoostingClassifier,0.482754,0.621622,0.728232,0.54224,0.853293
2,KNeighborsClassifier,0.4,0.588123,0.573832,0.603143,0.820844
0,LogisticRegression,0.254466,0.505529,0.422721,0.628684,0.772029


In [11]:
cnf_matrix = confusion_matrix(y_test, best_proba > best_metrics['threshold'])
TP = cnf_matrix[1][1]
FP = cnf_matrix[0][1]

# сделать оценку экономической эффективности (1 доллар на привлечение, 2 доллара - с каждого правильно классифицированного удержанного)
2*TP - 1*(TP + FP)

136