In [57]:
import pandas as pd
import numpy as np

from joblib import dump, load

from sklearn.metrics import precision_score, f1_score, accuracy_score

def evaluate(y_test, y_pred):

    print('accuracy:', accuracy_score(y_test, y_pred))
    print('precision:', precision_score(y_test, y_pred))
    print('f1 score:', f1_score(y_test, y_pred))

# np.random.seed(332)
np.random.seed(566)

In [2]:
df = pd.read_csv(r'churn_prediction_data\churn_prediction_data\train.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,551,15806307,Trevisano,720,Spain,Male,38,5,114051.97,2,0,1,107577.29,0
1,6897,15709621,Martin,682,France,Female,54,4,62397.41,1,1,0,113088.6,1
2,4588,15619340,Palmer,672,France,Female,31,5,119903.67,1,1,1,132925.17,0
3,291,15620746,Napolitani,592,Spain,Female,40,4,104257.86,1,1,0,110857.33,0
4,1673,15646372,Yao,753,Spain,Male,42,5,120387.73,1,0,1,126378.57,0


In [3]:
from matplotlib import pyplot as plt

# plt.plot(df[df.Exited == 0].Gender, df[df.Exited == 0].Tenure, 'g.')
# plt.plot(df[df.Exited == 1].Gender, df[df.Exited == 1].Tenure, 'r.')

pd.crosstab(df.Gender, df.Tenure)

Tenure,2,3,4,5,6,7
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,167,760,737,1078,694,193
Male,172,888,848,1308,953,202


In [4]:
from sklearn.preprocessing import LabelEncoder

drop_feature_names = ['CustomerId', 'RowNumber', 'Surname']
df = df.drop(drop_feature_names, axis=1)

geo_le = LabelEncoder()
gen_le = LabelEncoder()
df['Geography'] = geo_le.fit_transform(df['Geography'])
df['Gender'] = gen_le.fit_transform(df['Gender'])


In [169]:
from sklearn.model_selection import train_test_split

X, y = df.iloc[:, :-1], df.iloc[:, -1]

# stratified split on lable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

print(f'y_train ratio {y_train.mean()}')
print(f'y_test ration {y_test.mean()}')

y_train ratio 0.20402777777777778
y_test ration 0.20375


In [None]:
# Load upload file
df_upload = pd.read_csv(r'churn_prediction_data\churn_prediction_data\test.csv')

rownum = df_upload['RowNumber']
df_upload = df_upload.drop(drop_feature_names, axis=1)
df_upload['Geography'] = geo_le.transform(df_upload['Geography'])
df_upload['Gender'] = gen_le.transform(df_upload['Gender'])

In [177]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# select model
# clf = RandomForestClassifier(n_estimators = 350, class_weight={0:0.85, 1:0.15}, n_jobs=-1)
# clf = XGBClassifier()
clf = LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.018, class_weight={0:0.85, 1:0.15}, random_state=6417)
# clf = KNeighborsClassifier(n_neighbors=10)
# clf = SVC()

# train and get result
clf.fit(X_train, y_train)

# show accuracy
y_pred = clf.predict(X_test)

# dump(clf, './models/stratified_lgbm.joblib')
# np.save('./outputs/stratified_lgbm.npy', clf.predict_proba(df_upload))

evaluate(y_test, y_pred)

accuracy: 0.84375
precision: 0.9318181818181818
f1 score: 0.3961352657004831


In [151]:
def trans(df):
    df['BdP'] = df.Balance / df.NumOfProducts

trans(X_train)
trans(X_test)

In [178]:
pd.crosstab(y_test, y_pred, rownames=['label'], colnames=['pred'])

pred,0,1
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,634,3
1,122,41


In [124]:
upload_pred = clf.predict(df_upload)
pd.concat([rownum, pd.Series(upload_pred, name='Exited')], axis=1).to_csv(r'class_weight_lgbm.csv')

In [10]:
# from sklearn.model_selection import GridSearchCV

# param_grid = [
#   {'n_estimators': [100, 200, 300, 400, 500], 'criterion': ['gini', 'entropy']},
#  ]

# clf = GridSearchCV(RandomForestClassifier(), param_grid)
# clf.fit(X_train, y_train)
# sorted(clf.cv_results_.keys())

In [11]:
(clf.predict(X_test) == y_test).mean()



0.7878787878787878

# StratifiedKFold

In [12]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=3)

clfs = []

for clf_type in range(2,3):

    for train_index, test_index in skf.split(X, y):

        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        if clf_type == 0:
            clf = RandomForestClassifier(n_estimators = 500)
        elif clf_type == 1:
            clf = XGBClassifier(n_estimators = 300)
        else:
           clf = LGBMClassifier(max_bin=500, num_leaves=40, learning_rate=0.05)

        clf.fit(X_train, y_train)
        print((clf.predict(X_test) == y_test).mean())

        clfs.append(clf)

0.8672665916760405
0.8590176227971503
0.8555888972243061


In [13]:
pd.crosstab(clf.predict(X_test), y_test)

Exited,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2034,297
1,88,247


In [14]:
from functools import reduce

upload_prob = reduce(lambda x, y: x+y, [clf.predict_proba(df_upload) for clf in clfs])
upload_pred = np.argmax(upload_prob, axis=1)
pd.concat([rownum, pd.Series(upload_pred, name='Exited')], axis=1).to_csv(r'output.csv')

# Oversampling

In [89]:
minor_idxs = np.where(y_train==1)[0]
n = (y_train==0).sum()
sample_minor = np.random.choice(minor_idxs, n, replace=True)

overX, overy = np.r_[np.array(X_train)[sample_minor], X_train[y==0]], np.r_[np.ones(len(sample_minor)), np.zeros((y_train==0).sum())]

  overX, overy = np.r_[np.array(X_train)[sample_minor], X_train[y==0]], np.r_[np.ones(len(sample_minor)), np.zeros((y_train==0).sum())]


In [96]:
# clf = RandomForestClassifier()
clf = LGBMClassifier(max_bin=550, num_leaves=75, learning_rate=0.03, class_weight={0:0.8, 1:0.2})
clf.fit(overX, overy)

y_pred = clf.predict(np.array(X_test))

# dump(clf, './models/over_rfc.joblib')
# np.save('./outputs/over_rfc.npy', clf.predict_proba(df_upload))

# dump(clf, './models/over_lgbm2.joblib')
# np.save('./outputs/over_lgbm2.npy', clf.predict_proba(df_upload))

evaluate(y_test, y_pred)

accuracy: 0.8458364591147787
precision: 0.7138263665594855
f1 score: 0.519298245614035


In [17]:
upload_pred = clf.predict(np.array(df_upload))
pd.concat([rownum, pd.Series(upload_pred, name='Exited')], axis=1).to_csv(r'output.csv')

# Feature Combination

In [18]:
df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,720,2,1,38,5,114051.97,2,0,1,107577.29,0
1,682,0,0,54,4,62397.41,1,1,0,113088.60,1
2,672,0,0,31,5,119903.67,1,1,1,132925.17,0
3,592,2,0,40,4,104257.86,1,1,0,110857.33,0
4,753,2,1,42,5,120387.73,1,0,1,126378.57,0
...,...,...,...,...,...,...,...,...,...,...,...
7995,568,0,0,35,6,121079.60,2,1,1,124890.50,1
7996,602,1,0,45,7,145846.07,1,1,0,99276.02,0
7997,679,2,0,43,5,132810.01,1,1,0,130780.85,1
7998,715,0,1,38,4,118729.45,1,0,0,95484.52,0


In [103]:
class Transformer():

    def __init__(self):
        self.GenderTargetDict = None
        self.BalanceParameter = None
        self.EstimatedSalaryParameter = None
        self.AgeParameter = None
        self.CreditScoreParameter = None

    def fit_transform(self, df, y):

        newdf = df.copy()
        # newdf['MemberAndCard'] = (df.IsActiveMember ^ df.HasCrCard)
        newdf['BdS'] = (df.Balance / df.EstimatedSalary)
    
        # normalize
        newdf.Balance = (df.Balance - df.Balance.mean()) / df.Balance.std()
        self.BalanceParameter = (df.Balance.mean(), df.Balance.std())
        newdf.EstimatedSalary = (df.EstimatedSalary - df.EstimatedSalary.mean()) / df.EstimatedSalary.std()
        self.EstimatedSalaryParameter = (df.EstimatedSalary.mean(), df.EstimatedSalary.std())
        newdf.Age = (df.Age - df.Age.mean()) / df.Age.std()
        self.AgeParameter = (df.Age.mean(), df.Age.std())
        newdf.CreditScore = (df.CreditScore - df.CreditScore.mean()) / df.CreditScore.std()
        self.CreditScoreParameter = (df.CreditScore.mean(), df.CreditScore.std())

        # target encoding
        target_dict = dict(zip( [_ for _ in df['Gender']], [y[df['Gender'] == _].mean() for _ in df['Gender']] ) )
        newdf['TargetGender'] = df['Gender'].apply(lambda x: target_dict[x])
        self.GenderTargetDict = target_dict

        return newdf


    def transform(self, df):

        newdf = df.copy()
        # newdf['MemberAndCard'] = (df.IsActiveMember ^ df.HasCrCard)
        newdf['BdS'] = (df.Balance / df.EstimatedSalary)
        newdf.Balance = (df.Balance - self.BalanceParameter[0]) / self.BalanceParameter[1]
        newdf.EstimatedSalary = (df.EstimatedSalary - self.EstimatedSalaryParameter[0]) / self.EstimatedSalaryParameter[1]
        newdf.Age = (df.Age - df.Age.mean()) / df.Age.std()
        
        newdf['TargetGender'] = df['Gender'].apply(lambda x: self.GenderTargetDict[x])

        return newdf


In [102]:
transformer = Transformer()

newX_train = transformer.fit_transform(X_train, y_train)
newX_test = transformer.transform(X_test)
newdf_upload = transformer.transform(df_upload)

clf = LGBMClassifier(max_bin=500, num_leaves=85, learning_rate=0.05, class_weight={0:0.8, 1:0.2})
# clf = SVC(C=1000, kernel='linear', probability=True)
# clf = SVC(C=1, kernel='rbf', probability=True)
clf.fit(newX_train, y_train)

y_pred = clf.predict(newX_test)
evaluate(y_test, y_pred)

upload_pred = clf.predict(newdf_upload)
pd.concat([rownum, pd.Series(upload_pred, name='Exited')], axis=1).to_csv(r'output.csv')


# dump(clf, './models/featComb_svc_linear4.joblib')
# np.save('./outputs/featComb_svc_linear4.npy', clf.predict_proba(np.array(newdf_upload)))

# dump(clf, './models/featComb_lgbm2.joblib')
# np.save('./outputs/featComb_lgbm2.npy', clf.predict_proba(np.array(newdf_upload)))

# dump(clf, './models/featComb_svc_rbf.joblib')
# np.save('./outputs/featComb_svc_rbf.npy', clf.predict_proba(np.array(newdf_upload)))

# pd.concat([rownum, pd.Series(upload_pred, name='Exited')], axis=1).to_csv(r'svm_output.csv')

accuracy: 0.8480870217554388
precision: 0.8527918781725888
f1 score: 0.45344129554655865


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

# Dimensional Technique

In [None]:
from sklearn.decomposition import  PCA

pcaX_train = PCA().fit_transform(X_train)
pcaX_test = PCA().fit_transform(X_test)
pcadf_upload = PCA().fit_transform(df_upload)

# clf = LGBMClassifier(max_bin=500, num_leaves=80, learning_rate=0.02)
# clf = SVC(kernel='linear')
clf = XGBClassifier(eta=0.05, max_depth=10, min_child_weight = 0.8)
clf.fit(pcaX_train, y_train)

y_pred = clf.predict(pcaX_test)
print((y_pred == y_test).mean())

upload_pred = clf.predict(np.array(pcadf_upload))
pd.concat([rownum, pd.Series(upload_pred, name='Exited')], axis=1).to_csv(r'output.csv')

# dump(clf, './models/pca_lgbm.joblib')
# np.save('./outputs/pac_lgbm.npy', clf.predict_proba(np.array(pcadf_upload)))

# dump(clf, './models/pca_svc_linear.joblib')
# np.save('./outputs/pac_svc_linear.npy', clf.predict_proba(np.array(pcadf_upload)))

# dump(clf, './models/pca_xgb2.joblib')
# np.save('./outputs/pac_xgb2.npy', clf.predict_proba(np.array(pcadf_upload)))

# Ensemble

In [50]:
import os

emptyArr = np.zeros((2000, 2))

for fn in os.listdir('./outputs'):

    fp = os.path.join('outputs', fn)
    print(np.load(fp).shape)

    # if 'svc' in fn:
    #     w = 0
    # elif 'lgbm' in fn:
    #     w = 1.5
    # else:
        # w = 1

    w = 1

    emptyArr += w * np.load(fp)

emptyArr[:, 1] *= 1.5

upload_pred = np.argmax(emptyArr, axis=1)
pd.concat([rownum, pd.Series(upload_pred, name='Exited')], axis=1).to_csv(r'ensemble_output.csv')

(2000, 2)
(2000, 2)
(2000, 2)
(2000, 2)
(2000, 2)
(2000, 2)
(2000, 2)
(2000, 2)
(2000, 2)
(2000, 2)
(2000, 2)
(2000, 2)
(2000, 2)
(2000, 2)
(2000, 2)


In [51]:
(upload_pred == 0).sum()

1680

In [52]:
(upload_pred==1).sum()

320