# Initialisation

In [2]:
###############################################################################
#
#Importing libraries
#
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from numpy import random
from sklearn.compose import ColumnTransformer as CT
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import warnings
import pickle
###############################################################################
#
#Notebook options
#
# Let pandas render up to 100 columns when a frame is displayed.
pd.set_option("display.max_columns", 100)
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")
###############################################################################
#
#Reading the data
#
# Load the raw CSV and discard the customerID column in the same expression,
# so `df` is produced in one step instead of being mutated in place.
df = pd.read_csv(r"../Data/Telco-Customer-Churn.csv").drop(columns=["customerID"])
###############################################################################

# Pipelining

In [4]:
# Store SeniorCitizen as strings so it is handled as a non-numeric column.
df["SeniorCitizen"] = df["SeniorCitizen"].apply(str)

# Columns with exactly two distinct values (still includes 'Churn' here).
binary_feat = [name for name in df.columns if df[name].nunique() == 2]

# float64 / int64 columns that are not two-valued.
numeric_feat = [
    name
    for name in df.select_dtypes(include=[np.float64, np.int64]).columns
    if name not in binary_feat
]

# Object-typed columns that are neither binary nor numeric.
categorical_feat = [
    name
    for name in df.select_dtypes(include='object').columns
    if name not in binary_feat and name not in numeric_feat
]

# 'Churn' is the label, not a feature: take it out of the binary list only
# after the lists above were built, so it never lands in categorical_feat.
binary_feat.remove('Churn')
target = df["Churn"]

In [17]:
# Categories handed to the categorical OneHotEncoder as its `drop` argument:
# one entry per categorical column, in column order (10 entries in total).
nos = [
    "No phone service",
    "No",
    *["No internet service"] * 6,
    "Two year",
    "Mailed check",
]

In [6]:
# Column-wise preprocessing:
#   * numeric columns      -> MinMaxScaler
#   * categorical columns  -> one-hot encoding, dropping the category listed
#                             in `nos` for each column
#   * binary columns       -> one-hot encoding collapsed to a single column
#                             (drop='if_binary')
# Columns not named in any list are discarded (remainder='drop').
# NOTE(review): `drop=nos` is matched to `categorical_feat` by position, so
# both lists must have the same length and order - confirm against the data.
preprocessing = CT(
    [
        ('numeric_scaling', MinMaxScaler(), numeric_feat),
        ('categorical_dummies', OneHotEncoder(drop=nos), categorical_feat),
        ('binary_binarizing', OneHotEncoder(drop='if_binary'), binary_feat),
    ],
    remainder='drop',
    n_jobs=-1,
)

In [21]:
# Serialize the `preprocessing` transformer to ../binaries/preprocessing so
# it can be loaded elsewhere with pickle.
with open(r'../binaries/preprocessing', 'wb') as pickle_file:
    pickle.dump(preprocessing, pickle_file)

# Data splitting

In [22]:
from sklearn.model_selection import train_test_split

# Split the features only. `df` still contains the 'Churn' label, so splitting
# `df` itself and then concatenating `y_*` back on would write the label
# column twice into train.csv / test.csv.
features = df.drop(columns=["Churn"])

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=123456
)

# Re-attach the label so each exported file holds features + 'Churn' once.
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

# Forward slashes match the other paths in this notebook and work on every
# OS; a backslash path would create a file literally named '..\Data\train.csv'
# outside Windows.
train.to_csv(r'../Data/train.csv')
test.to_csv(r'../Data/test.csv')