# IMPORTS

## Libraries

In [1]:
import warnings

import numpy  as np
import pandas as pd

import seaborn           as sns
import matplotlib.pyplot as plt

from scipy import stats as ss

from IPython.display      import Image
from IPython.core.display import HTML

from sklearn.preprocessing   import RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split


warnings.filterwarnings("ignore")

## Load Dataset

In [2]:
# Read the feather file 'df04' into the raw working frame.
# NOTE(review): presumably the output of an earlier pipeline step — confirm.
dfRaw = pd.read_feather('00-Data/FeatherData/df04.feather')

In [3]:
# Preview the first rows of the loaded frame.
dfRaw.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,AgeGroup,Origin
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,True,True,101348.88,True,Adult,Latin
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,False,True,112542.58,False,Adult,Latin
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,True,False,113931.57,True,Adult,Latin
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,False,False,93826.63,False,Adult,Latin
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,True,True,79084.1,False,Adult,Latin


## Helper Functions

#### Jupyter Settings

In [4]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use('bmh')
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display( HTML('<style>.container { width:100% !important; }</style>'))
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option('display.expand_frame_repr', False)
    
    seed = 0
    np.random.seed(seed)
        
    sns.set()

In [5]:
# Apply the notebook-wide plotting/display/seed configuration defined above.
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


#### Functions

# DATA PREPARATION

In [6]:
# Work on a copy so dfRaw stays untouched by the preparation steps below.
df05 = dfRaw.copy()

## Change Dtypes

In [7]:
# Cast every boolean column to integers (True -> 1, False -> 0)
boolCols = df05.select_dtypes(include=['bool']).columns
df05[boolCols] = df05[boolCols].astype(int)

## Columns Filter 

In [8]:
# Columns removed from the frame before encoding/scaling
toDrop = ['Surname', 'RowNumber', 'CustomerId']
df05 = df05.drop(columns=toDrop)

## Transformation

### Encoding

In [9]:
# AgeGroup, Geography -> one-hot encoding (first level dropped), one column at a time
for oneHotCol in ('AgeGroup', 'Geography'):
    df05 = pd.get_dummies(df05, prefix=[oneHotCol], columns=[oneHotCol], drop_first=True)

# Gender -> binary flag: 1 for 'Female', 0 otherwise
df05['Gender'] = (df05['Gender'] == 'Female').astype(int)

# Origin -> binary flag: 1 for 'Latin', 0 otherwise
df05['Origin'] = (df05['Origin'] == 'Latin').astype(int)

## Data Split

In [10]:
# Separate the target ('Exited') from the predictors
Y = df05['Exited']
X = df05.drop(columns=['Exited'])

# Hold out 10% as the test set (stratified on the target) and save it as-is
x, XTest, y, yTest = train_test_split(
    X, Y,
    test_size=0.1,
    random_state=42,
    stratify=Y,
)
testDataset = pd.concat([XTest, yTest], axis=1).reset_index()
testDataset.to_feather('00-Data/FeatherData/testDataset.feather')

# Split the remaining rows 80/20 into train and validation sets (stratified)
XTrain, XValid, yTrain, yValid = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## ~~Normalization~~

## Rescaling

In [13]:
# Numeric columns of the training split, previewed to pick rescaling candidates
numericDtypes = ['int32', 'int64', 'float64']
numAttributes = XTrain.select_dtypes(include=numericDtypes)
numAttributes.head(5)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Origin
6908,733,1,38,5,0.0,2,1,1,1271.51,1
600,601,1,43,8,0.0,3,0,1,110916.15,1
4385,651,0,35,2,86911.8,1,1,0,174094.24,1
114,721,0,28,9,154475.54,2,0,1,101300.94,0
3536,534,0,29,7,174851.9,1,1,1,79178.31,1


In [14]:
rs = RobustScaler()
mms = MinMaxScaler()

# (column, scaler) pairs, processed in order.
# Balance uses the robust scaler because of its well defined outliers;
# every other column is min-max scaled.
scalingPlan = [
    ('Balance',         rs),
    ('CreditScore',     mms),
    ('Age',             mms),
    ('Tenure',          mms),
    ('NumOfProducts',   mms),
    ('EstimatedSalary', mms),
]

for column, scaler in scalingPlan:
    # Fit on the training split only, then apply the SAME fitted scaler to the
    # validation and test splits. Previously the validation/test values of the
    # min-max columns were transformed with `rs` (fitted on Balance), which put
    # them on a different scale from the training data.
    # `mms` is refitted for each column, so its transform calls must stay
    # inside the same iteration as the fit.
    XTrain[column] = scaler.fit_transform(XTrain[[column]].values)
    XValid[column] = scaler.transform(XValid[[column]].values)
    testDataset[column] = scaler.transform(testDataset[[column]].values)


# Convert to .feather

In [15]:
# Train split: join features with the target, reset the index, then write to feather
trainDataset = pd.concat([XTrain, yTrain], axis='columns').reset_index()
trainDataset.to_feather('00-Data/FeatherData/trainDatasetScaling.feather')

# Validation split: same treatment
ValidDataset = pd.concat([XValid, yValid], axis='columns').reset_index()
ValidDataset.to_feather('00-Data/FeatherData/validDatasetScaling.feather')

# Test split: already assembled during the data split, only written out here
testDataset.to_feather('00-Data/FeatherData/testDatasetScaling.feather')