In [74]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler


import Helpers as hlp

In [85]:
class PreprocessingTransformer(BaseEstimator, TransformerMixin):
    # def __init__(self):
    #     pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_ = X.copy()
        X_ = hlp.Onehotencoding(X_)
        # replace missing values with mean values
        X_['customer_since_all'], _ = hlp.missingtomean(X_['customer_since_all'])
        X_['customer_since_bank'], _ = hlp.missingtomean(X_['customer_since_bank'])
        # normalizing integer values
        cols_to_norm = list(X_.select_dtypes(include=['int64', 'float64']))
        # print(fields_to_normalize)
        X_[cols_to_norm] = StandardScaler().fit_transform(X_[cols_to_norm])
        # X_[fields_to_normalize] = X_[fields_to_normalize].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
        # X_[integer_fields]=(X_[integer_fields]-X_[integer_fields].min())/(X_[integer_fields].max()-X_[integer_fields].min())
        return X_

In [65]:
churn_now = hlp.load_data('train_month_3_with_target.csv')
# churn_1mback = hlp.load_data('train_month_2.csv')
# churn_2mback = hlp.load_data('train_month_1.csv')

In [20]:
# Drop integers for now (until we rescale them properly)

# integers = churn_now.select_dtypes(include="int64").columns
# for i in range(len(integers)):
#     churn_now = churn_now.drop(columns = [integers[i]])

In [66]:
# Create x and y variables.

x = churn_now.drop(columns=['target'])
y = churn_now['target']

In [86]:
pipeline = Pipeline(steps=[
    ('preprocessing', PreprocessingTransformer()),
    ('knn_classifier', KNeighborsClassifier(n_neighbors=4))
])

In [68]:
# Split in train, validation and test set

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=9)
x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=9)

In [69]:
# Still missing values in 2 features

x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40765 entries, 050687917679384bab2f3a5966569e13 to f142ee164d725c3722da45dbd70daff9
Data columns (total 38 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   homebanking_active                 40765 non-null  bool    
 1   has_homebanking                    40765 non-null  bool    
 2   has_insurance_21                   40765 non-null  bool    
 3   has_insurance_23                   40765 non-null  bool    
 4   has_life_insurance_fixed_cap       40765 non-null  bool    
 5   has_life_insurance_decreasing_cap  40765 non-null  bool    
 6   has_fire_car_other_insurance       40765 non-null  bool    
 7   has_personal_loan                  40765 non-null  bool    
 8   has_mortgage_loan                  40765 non-null  bool    
 9   has_current_account                40765 non-null  bool    
 10  has_pension_saving                 40765 non-null  bo

In [87]:
pipeline.fit(x_train, y_train)

<class 'pandas.core.frame.DataFrame'>
Index: 40765 entries, 050687917679384bab2f3a5966569e13 to f142ee164d725c3722da45dbd70daff9
Data columns (total 72 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   homebanking_active                 40765 non-null  bool   
 1   has_homebanking                    40765 non-null  bool   
 2   has_insurance_21                   40765 non-null  bool   
 3   has_insurance_23                   40765 non-null  bool   
 4   has_life_insurance_fixed_cap       40765 non-null  bool   
 5   has_life_insurance_decreasing_cap  40765 non-null  bool   
 6   has_fire_car_other_insurance       40765 non-null  bool   
 7   has_personal_loan                  40765 non-null  bool   
 8   has_mortgage_loan                  40765 non-null  bool   
 9   has_current_account                40765 non-null  bool   
 10  has_pension_saving                 40765 non-null  bool   
 11  h



Pipeline(steps=[('preprocessing', PreprocessingTransformer()),
                ('knn_classifier', KNeighborsClassifier(n_neighbors=4))])

In [83]:
y_pred = pipeline.predict(x_validation)



In [84]:
tn, fp, fn, tp = confusion_matrix(y_validation, y_pred).ravel()
print((tn, fp, fn, tp))

(9879, 7, 306, 0)


In [26]:
precision_recall_fscore_support(y_validation, y_pred)

(array([0.97007163, 1.        ]),
 array([1.        , 0.00326797]),
 array([0.98480849, 0.00651466]),
 array([9886,  306], dtype=int64))

In [29]:
# import warnings
# warnings.filterwarnings('ignore')

# for i in range(1, 6):
#     knn = KNeighborsClassifier(n_neighbors=i)
#     knn.fit(x_train, y_train)
#     predictions = knn.predict(x_validation)
#     tn, fp, fn, tp = confusion_matrix(y_validation, predictions).ravel()
#     print((tn, fp, fn, tp))

(9638, 248, 288, 18)
(9869, 17, 306, 0)
(9850, 36, 304, 2)
(9886, 0, 306, 0)
(9881, 5, 305, 1)
