# Import LIBS

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Imports
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Test_Data = Test

In [93]:
# Import Data
test_data = pd.read_csv("C:/Users/bruno/projeto_churn_predict/files/test_data.csv")

In [94]:
# Check
test_data

Unnamed: 0,CustomerID,Surname,NetworkScore,Region,Gender,Age,Tenure,MonthlyCharge,NumOfProducts,HasInternetService,IsActiveMember,EstimatedMonthlyUsage,Exited
0,5081,Smith,31.0,South,Other,24.0,10,141.98,3,1,0.0,543.23,
1,3273,Walters,34.0,East,Other,22.0,19,40.45,3,1,0.0,,
2,3565,Vaughan,85.0,East,Other,48.0,20,54.95,3,0,1.0,312.02,
3,2735,Miller,,South,Female,69.0,4,97.21,1,1,1.0,,
4,2163,Macias,27.0,West,Male,59.0,10,101.76,2,1,0.0,864.17,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1975,2588,Moore,52.0,South,Female,54.0,14,28.87,4,1,0.0,729.91,
1976,3324,George,84.0,East,Male,35.0,16,83.13,4,0,0.0,622.73,
1977,1901,Smith,64.0,East,Female,24.0,10,44.44,3,1,1.0,533.30,
1978,4629,Horne,82.0,West,Female,38.0,14,167.51,1,1,1.0,917.44,


In [95]:
# Check
test_data.describe()

Unnamed: 0,CustomerID,NetworkScore,Age,Tenure,MonthlyCharge,NumOfProducts,HasInternetService,IsActiveMember,EstimatedMonthlyUsage,Exited
count,1980.0,1879.0,1888.0,1980.0,1980.0,1980.0,1980.0,1878.0,1877.0,0.0
mean,3031.292929,50.031932,43.674788,9.843939,109.61847,2.476263,0.494444,0.495208,537.918151,
std,1736.890327,28.539792,15.366367,6.000791,51.256648,1.118516,0.500095,0.50011,260.774018,
min,1.0,1.0,18.0,0.0,20.1,1.0,0.0,0.0,100.65,
25%,1524.5,26.0,30.0,5.0,66.12,1.0,0.0,0.0,311.81,
50%,3023.5,50.0,43.0,10.0,108.1,2.0,0.0,0.0,535.33,
75%,4534.5,75.0,57.0,15.0,154.03,3.0,1.0,1.0,764.76,
max,5999.0,100.0,70.0,20.0,199.88,4.0,1.0,1.0,999.99,


In [96]:
# Check
test_data.isnull().sum()

CustomerID                  0
Surname                     0
NetworkScore              101
Region                      0
Gender                      0
Age                        92
Tenure                      0
MonthlyCharge               0
NumOfProducts               0
HasInternetService          0
IsActiveMember            102
EstimatedMonthlyUsage     103
Exited                   1980
dtype: int64

In [80]:
# Fill null values with mean
test_data['NetworkScore'].fillna(test_data['NetworkScore'].mean(), inplace = True)
test_data['Age'].fillna(test_data['Age'].mean(), inplace = True)
test_data['EstimatedMonthlyUsage'].fillna(test_data['EstimatedMonthlyUsage'].mean(), inplace = True)

In [81]:
# Check
test_data.isnull().sum()

CustomerID                  0
Surname                     0
NetworkScore              101
Region                      0
Gender                      0
Age                        92
Tenure                      0
MonthlyCharge               0
NumOfProducts               0
HasInternetService          0
IsActiveMember            102
EstimatedMonthlyUsage     103
Exited                   1980
dtype: int64

In [82]:
# Drop null lines
test_data.dropna(subset=['IsActiveMember'], inplace=True)

In [83]:
# Drop unwanted columns
test_data = test_data.drop(['HasInternetService'], axis=1)

In [84]:
# Round and convert types
test_data['NetworkScore'] = np.ceil(test_data['NetworkScore']).astype(int)
test_data['Age'] = np.ceil(test_data['Age']).astype(int)
test_data['MonthlyCharge'] = np.ceil(test_data['MonthlyCharge']).astype(int)
test_data['EstimatedMonthlyUsage'] = np.ceil(test_data['EstimatedMonthlyUsage']).astype(int)
test_data['IsActiveMember'] = np.ceil(test_data['IsActiveMember']).astype(int)

In [86]:
test_data

Unnamed: 0,CustomerID,Surname,NetworkScore,Region,Gender,Age,Tenure,MonthlyCharge,NumOfProducts,IsActiveMember,EstimatedMonthlyUsage,Exited
0,5081,Smith,31,South,Other,24,10,142,3,0,544,
1,3273,Walters,34,East,Other,22,19,41,3,0,538,
2,3565,Vaughan,85,East,Other,48,20,55,3,1,313,
3,2735,Miller,51,South,Female,69,4,98,1,1,538,
4,2163,Macias,27,West,Male,59,10,102,2,0,865,
...,...,...,...,...,...,...,...,...,...,...,...,...
1975,2588,Moore,52,South,Female,54,14,29,4,0,730,
1976,3324,George,84,East,Male,35,16,84,4,0,623,
1977,1901,Smith,64,East,Female,24,10,45,3,1,534,
1978,4629,Horne,82,West,Female,38,14,168,1,1,918,


In [101]:
# Select the values
X_test = test_data.iloc[:, 1:11].values

In [102]:
X_test

array([['Smith', 31.0, 'South', ..., 3, 1, 0.0],
       ['Walters', 34.0, 'East', ..., 3, 1, 0.0],
       ['Vaughan', 85.0, 'East', ..., 3, 0, 1.0],
       ...,
       ['Smith', 64.0, 'East', ..., 3, 1, 1.0],
       ['Horne', 82.0, 'West', ..., 1, 1, 1.0],
       ['Everett', 95.0, 'South', ..., 1, 0, 0.0]], dtype=object)

In [103]:
# Select the values
Y_test = test_data.iloc[:, 11].values

In [104]:
Y_test

array([543.23,    nan, 312.02, ..., 533.3 , 917.44,    nan])

In [105]:
# Instance
label_encoder_surname = LabelEncoder()
label_encoder_gender = LabelEncoder()
label_encoder_region = LabelEncoder()

In [106]:
if isinstance(X_test, np.ndarray):
    X_test = pd.DataFrame(X_test) 

X_test.iloc[:, 0] = label_encoder_surname.fit_transform(X_test.iloc[:, 0])
X_test.iloc[:, 2] = label_encoder_region.fit_transform(X_test.iloc[:, 2])
X_test.iloc[:, 3] = label_encoder_gender.fit_transform(X_test.iloc[:, 3])

In [107]:
scaler_train = StandardScaler()
X_test = scaler_train.fit_transform(X_test)

In [108]:
X_test

array([[ 1.1302389 , -0.66703352,  0.4174068 , ...,  0.46836128,
         1.01117353, -0.99046083],
       [ 1.44594258, -0.56188915, -1.35422335, ...,  0.46836128,
         1.01117353, -0.99046083],
       [ 1.37578621,  1.22556522, -1.35422335, ...,  0.46836128,
        -0.98894994,  1.00963104],
       ...,
       [ 1.1302389 ,  0.4895546 , -1.35422335, ...,  0.46836128,
         1.01117353,  1.00963104],
       [-0.3129779 ,  1.12042084,  1.30322187, ..., -1.3201736 ,
         1.01117353,  1.00963104],
       [-0.90930707,  1.57604647,  0.4174068 , ..., -1.3201736 ,
        -0.98894994, -0.99046083]])

# Salvar variáveis

In [None]:
# import pickle

In [None]:
# with open('customer.pkl', mode = 'wb') as f:
  # pickle.dump([X_credit_treinamento, Y_credit_treinamento, X_credit_teste, Y_credit_teste], f)