# Dataset cleaning

## Libraries and datasets

In [None]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn
!pip install xgboost
!pip install catboost
!pip install lightgbm


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
import xgboost as xgb
import catboost as cat_
import lightgbm as lgb
import time


from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import HalvingGridSearchCV

In [None]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('Train.csv', 'Test.csv'))

## Feature engineering

Drop MRG because it has the same value in every row.

In [None]:
# Remove the MRG column from both tables.
train = train.drop(columns=['MRG'])
test = test.drop(columns=['MRG'])

Drop user_id from train and test

In [None]:
# The row identifier is not a feature; remove it from both tables.
train = train.drop(columns=['user_id'])
test = test.drop(columns=['user_id'])

Drop top packs for the first iteration 

In [None]:
# TOP_PACK is left out of this first modelling iteration.
train = train.drop(columns=['TOP_PACK'])
test = test.drop(columns=['TOP_PACK'])

Convert TENURE from its range label to a number (the lower bound of the range, in months)

In [None]:
# Replace each TENURE range label with the lower bound of the range, in months.
# Labels that are not keys of the mapping become NaN.
# Every other label ends in " month", so the bare 'F 9-12' key looks like a typo
# that would silently turn the whole 9-12 bucket into NaN; both spellings are
# accepted here. NOTE(review): confirm the exact label against Train.csv.
train['TENURE'] = train['TENURE'].map({
    'D 3-6 month': 3,
    'E 6-9 month': 6,
    'F 9-12': 9,
    'F 9-12 month': 9,
    'G 12-15 month': 12,
    'H 15-18 month': 15,
    'I 18-21 month': 18,
    'J 21-24 month': 21,
    'K > 24 month': 24,
})

In [None]:
# Replace each TENURE range label with the lower bound of the range, in months.
# Labels that are not keys of the mapping become NaN.
# Every other label ends in " month", so the bare 'F 9-12' key looks like a typo
# that would silently turn the whole 9-12 bucket into NaN; both spellings are
# accepted here. NOTE(review): confirm the exact label against Test.csv.
test['TENURE'] = test['TENURE'].map({
    'D 3-6 month': 3,
    'E 6-9 month': 6,
    'F 9-12': 9,
    'F 9-12 month': 9,
    'G 12-15 month': 12,
    'H 15-18 month': 15,
    'I 18-21 month': 18,
    'J 21-24 month': 21,
    'K > 24 month': 24,
})

In [None]:
# Fill the remaining NaN tenures with 1.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is deprecated chained assignment and leaves the frame
# unchanged once pandas Copy-on-Write is active.
train['TENURE'] = train['TENURE'].fillna(1)
test['TENURE'] = test['TENURE'].fillna(1)

X / y samples

In [None]:
# Detach the CHURN target from the feature table (pop removes the column in place).
y = train.pop('CHURN')

In [None]:
# X is another name for the same DataFrame object as `train` (no copy is made).
X = train

Train-val split

In [None]:
# Hold out 20% of the rows for validation; random_state=1 fixes the shuffle.
# NOTE(review): X and y are rebound to the training portion, so re-running this
# cell without re-running the cells above splits the already-reduced set again.
X, X_val, y, y_val = train_test_split(X,y,test_size = 0.2,random_state=1)

Encoding of categorical features

In [None]:
# Treat a missing REGION as its own category, 'other'.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is deprecated chained assignment and leaves the frame
# unchanged once pandas Copy-on-Write is active.
X['REGION'] = X['REGION'].fillna('other')
X_val['REGION'] = X_val['REGION'].fillna('other')
test['REGION'] = test['REGION'].fillna('other')

In [None]:
# Learn the REGION -> integer code mapping on the training split only, then
# apply that same mapping to the training, validation and test frames.
encoder = LabelEncoder().fit(X['REGION'])
for frame in (X, X_val, test):
    frame['REGION'] = encoder.transform(frame['REGION'])

Scaling

In [None]:
# Columns that are scaled and imputed in the cells below.
num_cols = [
    'MONTANT',
    'FREQUENCE_RECH',
    'REVENUE',
    'ARPU_SEGMENT',
    'FREQUENCE',
    'DATA_VOLUME',
    'ON_NET',
    'ORANGE',
    'TIGO',
    'ZONE1',
    'ZONE2',
    'REGULARITY',
    'FREQ_TOP_PACK',
]

In [None]:
# Standardising scaler with default settings; it is fitted in the next cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training split only, then apply the identical
# transform to the training, validation and test frames.
scaler.fit(X[num_cols])
for frame in (X, X_val, test):
    frame[num_cols] = scaler.transform(frame[num_cols])

Imputing missing values

In [None]:
# Constant imputer that replaces NaN with 0 in the numeric columns.
# The columns were standardised beforehand, so 0 is each column's training mean.
imp = SimpleImputer(strategy='constant', fill_value=0, missing_values=np.nan).fit(X[num_cols])

In [None]:
# Apply the fitted imputer to the numeric columns of every split.
for frame in (X, X_val, test):
    frame[num_cols] = imp.transform(frame[num_cols])

## Models

In [None]:
def eval_metrics(y_test, y_pred):
    """Print classification metrics for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels the predictions are scored against.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        The metrics are printed, rounded to three decimals.
    """
    # Score against the `y_test` argument. The previous version ignored it and
    # always read the global `y_val`, so any other label set was scored wrongly.
    print('Precision Score: ', round(precision_score(y_test, y_pred), 3))
    print('Recall Score: ', round(recall_score(y_test, y_pred), 3))
    print('F1 Score: ', round(f1_score(y_test, y_pred), 3))
    print('Accuracy Score: ', round(accuracy_score(y_test, y_pred), 3))
    # NOTE(review): callers in this notebook pass hard labels from .predict(),
    # so this AUC is computed from 0/1 predictions, not probability scores.
    print('ROC AUC: ', round(roc_auc_score(y_test, y_pred), 3))

Submission preparation

In [None]:
# Template frame that prepare_submission() fills with predictions.
submission = pd.read_csv('SampleSubmission.csv')

In [None]:
def prepare_submission(submission, test, model, name):
    """Write a submission CSV containing ``model``'s predictions for ``test``.

    Parameters
    ----------
    submission : pandas.DataFrame
        Template frame; its columns are written out with CHURN set to the
        predictions. The frame itself is left untouched.
    test : pandas.DataFrame
        Feature table passed to ``model.predict``.
    model : object
        Fitted estimator exposing ``predict``.
    name : str
        Path of the CSV file to write.

    Returns
    -------
    None
        ``DataFrame.to_csv`` returns None when given a path.
    """
    sub_pred = model.predict(test)
    # Fill a copy so the caller's template frame is not mutated as a side effect.
    filled = submission.copy()
    filled['CHURN'] = sub_pred
    return filled.to_csv(name, index=False)

### Decision tree

In [None]:
# Hyper-parameter grid for the decision tree.
param_grid = {'max_features': [0.1, 0.2, 0.3, 0.4, 0.5],
              'ccp_alpha': [0.1, .01, .001],
              'min_samples_leaf': [3, 4, 5, 6, 7],
              'max_depth' : [5, 6, 7],
              'criterion' :['gini', 'entropy']
             }

tree_clas = DecisionTreeClassifier(random_state=124)
# Successive-halving search scored by ROC AUC with 3-fold CV.
# random_state seeds the search's internal row subsampling so that a re-run
# selects the same candidates; without it best_params_ can vary between runs.
grid_search = HalvingGridSearchCV(estimator=tree_clas, param_grid=param_grid, scoring='roc_auc',
                                  cv=3, verbose=0, random_state=124)
grid_search.fit(X, y)

In [None]:
# Parameter combination selected by the decision-tree search.
grid_search.best_params_  

In [None]:
# Cross-validated ROC AUC of the selected combination.
grid_search.best_score_

In [None]:
# Train a decision tree with fixed hyper-parameters on the training split
# and predict labels for the validation split.
tree_params = dict(
    criterion='entropy',
    max_depth=7,
    max_features=0.5,
    min_samples_leaf=7,
    ccp_alpha=0.001,
    random_state=124,
)
tree_clas = DecisionTreeClassifier(**tree_params).fit(X, y)
tree_ypred = tree_clas.predict(X_val)

In [None]:
# Print validation metrics for the decision tree.
eval_metrics(y_val, tree_ypred)

In [None]:
# Write the decision tree's test-set predictions to a submission CSV.
prepare_submission(submission, test, tree_clas, '1_tree_submission.csv')

### Random forest 

In [None]:
# Hyper-parameter grid for the random forest.
param_grid = {'n_estimators': [100, 200, 500],
              'max_features': [0.1, 0.2, 0.3, 0.4, 0.5],
              'max_depth' : [4,5,6,7,8,10],
              'criterion' :['gini', 'entropy']
             }

rand = RandomForestClassifier(random_state=124)
# Successive-halving search scored by ROC AUC with 3-fold CV.
# random_state seeds the search's internal row subsampling so that a re-run
# selects the same candidates; without it best_params_ can vary between runs.
grid_search_rfc = HalvingGridSearchCV(estimator=rand, param_grid=param_grid, scoring='roc_auc',
                                      cv=3, verbose=0, random_state=124)
grid_search_rfc.fit(X, y)
grid_search_rfc.best_score_

In [None]:
# Parameter combination selected by the random-forest search.
grid_search_rfc.best_params_  

In [None]:
# Train a random forest with fixed hyper-parameters on the training split,
# predict labels for the validation split and print the metrics.
rf_params = dict(
    n_estimators=500,
    criterion='entropy',
    max_depth=8,
    max_features=0.5,
    random_state=124,
)
rf_clf = RandomForestClassifier(**rf_params).fit(X, y)
rf_ypred = rf_clf.predict(X_val)
eval_metrics(y_val, rf_ypred)

In [None]:
# Write the random forest's test-set predictions to a submission CSV.
prepare_submission(submission, test, rf_clf, '2_randforest_submission.csv')

### Gradient boosting

In [None]:
# Hyper-parameter grid for gradient boosting.
# 'mae' is no longer an accepted criterion for GradientBoostingClassifier
# (deprecated in scikit-learn 0.24, removed in 1.1) and makes every candidate
# using it fail; 'squared_error' is the supported alternative to 'friedman_mse'.
param_grid = {'learning_rate': [0.01, 0.05, 0.1, 0.2],
              'min_samples_split': [0.05, 0.1, 0.2, 0.5],
              'min_samples_leaf': [0.05, 0.1, 0.2, 0.5],
              'max_depth':[3,5,7,8],
              'max_features':['log2','sqrt'],
              'criterion': ['friedman_mse', 'squared_error'],
              'subsample':[0.5, 0.6, 0.8, 0.9, 1.0],
              'n_estimators':[10, 100, 200]
             }

gb = GradientBoostingClassifier(random_state=124)
# Successive-halving search scored by ROC AUC with 3-fold CV.
# random_state seeds the search's internal row subsampling so that a re-run
# selects the same candidates; without it best_params_ can vary between runs.
grid_search_gb = HalvingGridSearchCV(estimator=gb, param_grid=param_grid, scoring='roc_auc',
                                     cv=3, verbose=True, random_state=124)
grid_search_gb.fit(X, y)
grid_search_gb.best_score_

In [None]:
# Parameter combination selected by the gradient-boosting search.
grid_search_gb.best_params_  

In [None]:
# Train gradient boosting with the parameters selected by the search above.
# The original `...` placeholder was not valid Python; unpacking best_params_
# fills it in directly from grid_search_gb.
gb_clf = GradientBoostingClassifier(**grid_search_gb.best_params_, random_state=124)
gb_clf.fit(X, y)

gb_ypred = gb_clf.predict(X_val)
# Score this model's own predictions (the cell previously re-scored rf_ypred).
eval_metrics(y_val, gb_ypred)

In [None]:
# Write the gradient-boosting model's test-set predictions to their own file.
# The cell previously re-submitted rf_clf and overwrote the random-forest CSV.
prepare_submission(submission, test, gb_clf, '6_gradboost_submission.csv')

### K-nearest neighbours (KNeighborsClassifier)

In [None]:
# Hyper-parameter grid for k-nearest neighbours.
param_grid = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 10, 15],
              'weights': ['uniform', 'distance'],
              'metric': ['euclidean', 'minkowski', 'manhattan', 'chebyshev']
              }


knn = KNeighborsClassifier()
# Successive-halving search scored by ROC AUC with 5-fold CV on all cores.
# random_state seeds the search's internal row subsampling so that a re-run
# selects the same candidates; without it best_params_ can vary between runs.
grid_search_knn = HalvingGridSearchCV(estimator=knn, param_grid=param_grid, scoring='roc_auc',
                                      cv=5, verbose=0, n_jobs=-1, random_state=124)
grid_search_knn.fit(X, y)
grid_search_knn.best_score_

In [None]:
# Parameter combination selected by the KNN search.
grid_search_knn.best_params_  

In [None]:
# Train KNN with the parameters selected by the search above.
# The original `...` placeholder was not valid Python, and KNeighborsClassifier
# does not accept a random_state argument, so that argument is dropped.
knn_clf = KNeighborsClassifier(**grid_search_knn.best_params_)
knn_clf.fit(X, y)

knn_ypred = knn_clf.predict(X_val)
# Score this model's own predictions (the cell previously re-scored rf_ypred).
eval_metrics(y_val, knn_ypred)

In [None]:
# Write the KNN model's test-set predictions to a submission CSV.
prepare_submission(submission, test, knn_clf, '3_knn_submission.csv')

### Naive bayes 

In [None]:
# 20 log-spaced var_smoothing candidates between 1 and 1e-9.
param_grid_nb = {'var_smoothing': np.logspace(0,-9, num=20)}

nb = GaussianNB()
# Successive-halving search scored by ROC AUC with 5-fold CV on all cores.
# random_state seeds the search's internal row subsampling so that a re-run
# selects the same candidates; without it best_params_ can vary between runs.
grid_search_nb = HalvingGridSearchCV(estimator=nb, param_grid=param_grid_nb, scoring='roc_auc',
                                     cv=5, verbose=0, n_jobs=-1, random_state=124)
grid_search_nb.fit(X, y)
grid_search_nb.best_score_

In [None]:
# Parameter combination selected by the naive Bayes search.
grid_search_nb.best_params_

In [None]:
# Train Gaussian naive Bayes with a fixed smoothing value on the training
# split, predict labels for the validation split and print the metrics.
gnb_model = GaussianNB(var_smoothing=0.00428).fit(X, y)
gnb_pred = gnb_model.predict(X_val)
eval_metrics(y_val, gnb_pred)

In [None]:
# Write the naive Bayes model's test-set predictions to a submission CSV.
prepare_submission(submission, test, gnb_model, '5_naivebayes_submission.csv')

### Logistic Regression

In [None]:
# Hyper-parameter grid for logistic regression.
# "No penalty" is spelled None: the string 'none' was deprecated in
# scikit-learn 1.2 and removed in 1.4, where it raises a parameter error.
param_grid = {'solver': ['newton-cg', 'sag', 'lbfgs'],
              'penalty': ['l2', None],
              'C': np.logspace(-3,3,7)
              }

lr = LogisticRegression()
# Successive-halving search scored by ROC AUC with 5-fold CV on all cores.
# random_state seeds the search's internal row subsampling so that a re-run
# selects the same candidates; without it best_params_ can vary between runs.
grid_search_lr = HalvingGridSearchCV(estimator=lr, param_grid=param_grid, scoring='roc_auc',
                                     cv=5, verbose=0, n_jobs=-1, random_state=124)
grid_search_lr.fit(X, y)
grid_search_lr.best_score_

In [None]:
# Parameter combination selected by the logistic-regression search.
grid_search_lr.best_params_

In [None]:
# Train logistic regression with fixed hyper-parameters on the training
# split, predict labels for the validation split and print the metrics.
lr_params = dict(solver='newton-cg', penalty='l2', C=0.01)
lr_model = LogisticRegression(**lr_params).fit(X, y)
lr_pred = lr_model.predict(X_val)
eval_metrics(y_val, lr_pred)

In [None]:
# Write the logistic-regression model's test-set predictions to a submission CSV.
prepare_submission(submission, test, lr_model, '4_logisticregression_submission.csv')

### SVM (SVC)

This will probably perform poorly, because the dataset is large and the classes are imbalanced.

In [None]:
from sklearn.kernel_approximation import Nystroem
from sklearn import svm

In [None]:
# Project the training features through a 300-component Nystroem kernel
# approximation, then train a linear SVM on the projected features.
feature_map_nystroem = Nystroem(n_components=300, gamma=.2, random_state=1)
data_transformed = feature_map_nystroem.fit_transform(X)

clf_svc = svm.LinearSVC()
clf_svc.fit(data_transformed, y)