In [1]:
import pandas as pd
import numpy as np
import sklearn.preprocessing as skl
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import wrangle

In [2]:
# Load the telco frame through the project-local `wrangle` module. The rendered
# output below shows customer_id, monthly_charges, tenure and total_charges.
df = wrangle.wrangle_telco()
df

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0013-SMEOE,109.70,71,7904.25
1,0014-BMAQU,84.65,63,5377.80
2,0016-QLJIS,90.45,65,5957.90
3,0017-DINOC,45.20,54,2460.55
4,0017-IUDMW,116.80,72,8456.75
...,...,...,...,...
1690,9964-WBQDJ,24.40,71,1725.40
1691,9972-EWRJS,19.25,67,1372.90
1692,9975-GPKZU,19.75,46,856.50
1693,9993-LHIEB,67.85,67,4627.65


In [3]:
def split_my_data(X, y, train_pct):
    """Split features and target into train and test partitions.

    Parameters
    ----------
    X : features, handed unchanged to ``train_test_split``.
    y : target, handed unchanged to ``train_test_split``.
    train_pct : forwarded to ``train_test_split`` as ``train_size``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` in that order.
    """
    # The seed is pinned so repeated runs produce the same partitions.
    pieces = train_test_split(X, y, random_state=42, train_size=train_pct)
    features_train, features_test, target_train, target_test = pieces
    return features_train, features_test, target_train, target_test

In [4]:
# Features are monthly_charges and tenure; the target is total_charges.
X = df[['monthly_charges', 'tenure']]
y = df['total_charges']
# Passed to split_my_data, which forwards it as train_size.
train_pct = .8

# split_my_data pins random_state=42, so re-running this cell gives the same split.
X_train, X_test, y_train, y_test = split_my_data(X, y, train_pct)
X_train

Unnamed: 0,monthly_charges,tenure
700,66.25,52
1341,90.95,66
629,93.20,71
1034,80.60,70
1447,20.05,54
...,...,...
1130,105.05,67
1294,20.35,54
860,19.85,41
1459,25.40,12


In [5]:
def standard_scaler(train, test):
    """Scale ``train`` and ``test`` with a ``StandardScaler`` fit on ``train``.

    The scaler is fit on ``train`` only; the same fitted object then
    transforms both frames.

    Parameters
    ----------
    train, test : DataFrames; their ``columns`` and ``index`` are reused
        for the scaled results.

    Returns
    -------
    tuple
        ``(scaler, train_scaled, test_scaled)``.
    """
    fitted = skl.StandardScaler()
    fitted.fit(train)

    def _as_frame(frame):
        # transform() output is re-wrapped so the original labels are kept.
        return pd.DataFrame(fitted.transform(frame),
                            index=frame.index, columns=frame.columns)

    return fitted, _as_frame(train), _as_frame(test)

In [6]:
# standard_scaler fits on X_train only, then transforms both X_train and X_test.
scaler, train_scaled, test_scaled = standard_scaler(X_train, X_test)
train_scaled

Unnamed: 0,monthly_charges,tenure
700,0.180357,-0.253416
1341,0.889548,0.518047
629,0.954151,0.793570
1034,0.592377,0.738465
1447,-1.146148,-0.143207
...,...,...
1130,1.294391,0.573152
1294,-1.137534,-0.143207
860,-1.151890,-0.859565
1459,-0.992538,-2.457596


In [7]:
def scale_inverse(scaler, train_scaled, test_scaled):
    """Map scaled frames back through ``scaler.inverse_transform``.

    Parameters
    ----------
    scaler : object exposing ``inverse_transform``.
    train_scaled, test_scaled : DataFrames; their ``columns`` and ``index``
        are reused for the restored results.

    Returns
    -------
    tuple
        ``(train, test)`` as DataFrames. The scaler is not returned.
    """
    restored = []
    for scaled in (train_scaled, test_scaled):
        values = scaler.inverse_transform(scaled)
        restored.append(
            pd.DataFrame(values, index=scaled.index, columns=scaled.columns)
        )
    train, test = restored
    return train, test

In [8]:
# scale_inverse(scaler, train_scaled, test_scaled) needs the fitted scaler and
# both scaled frames, and returns only the two restored frames. Pass all three
# arguments and unpack two values; the scaler is already bound above.
train, test = scale_inverse(scaler, train_scaled, test_scaled)
test

Unnamed: 0,monthly_charges,tenure
931,73.35,71.0
1398,25.00,39.0
1561,90.50,47.0
1006,105.35,69.0
506,25.80,68.0
...,...,...
1170,25.20,72.0
1128,85.95,70.0
350,65.15,71.0
887,61.45,37.0


In [9]:
def uniform_scaler(train, test):
    """Scale ``train`` and ``test`` with a ``QuantileTransformer`` fit on ``train``.

    The transformer is built with ``n_quantiles=100`` and
    ``output_distribution='uniform'``, fit on ``train`` only, and then
    applied to both frames.

    Parameters
    ----------
    train, test : DataFrames; their ``columns`` and ``index`` are reused
        for the scaled results.

    Returns
    -------
    tuple
        ``(scaler, train_scaled, test_scaled)``.
    """
    quantile_tf = skl.QuantileTransformer(output_distribution='uniform',
                                          n_quantiles=100)
    quantile_tf.fit(train)

    # Train is transformed first, then test; both keep their original labels.
    train_scaled, test_scaled = (
        pd.DataFrame(quantile_tf.transform(part),
                     index=part.index, columns=part.columns)
        for part in (train, test)
    )
    return quantile_tf, train_scaled, test_scaled

In [10]:
# Rebinds scaler / train_scaled / test_scaled to the QuantileTransformer
# results; the values assigned by earlier cells are no longer reachable.
scaler, train_scaled, test_scaled = uniform_scaler(X_train, X_test)
test_scaled

Unnamed: 0,monthly_charges,tenure
931,0.573077,0.762626
1398,0.313131,0.166667
1561,0.750107,0.237374
1006,0.864304,0.651515
506,0.375849,0.611111
...,...,...
1170,0.333333,1.000000
1128,0.707406,0.696970
350,0.521513,0.762626
887,0.497274,0.151515


In [11]:
def gaussian_scaler(train, test):
    """Scale ``train`` and ``test`` with a ``PowerTransformer`` fit on ``train``.

    The transformer uses ``method='yeo-johnson'``, is fit on ``train`` only,
    and is then applied to both frames.

    Parameters
    ----------
    train, test : DataFrames; their ``columns`` and ``index`` are reused
        for the scaled results.

    Returns
    -------
    tuple
        ``(scaler, train_scaled, test_scaled)``.
    """
    power_tf = skl.PowerTransformer(method='yeo-johnson')
    power_tf.fit(train)

    def _relabel(frame):
        # Wrap the transformed values back into a frame with the same labels.
        transformed = power_tf.transform(frame)
        return pd.DataFrame(transformed, index=frame.index,
                            columns=frame.columns)

    train_scaled = _relabel(train)
    test_scaled = _relabel(test)
    return power_tf, train_scaled, test_scaled

In [12]:
# Rebinds scaler / train_scaled / test_scaled to the PowerTransformer results.
scaler, train_scaled, test_scaled = gaussian_scaler(X_train, X_test)
test_scaled

Unnamed: 0,monthly_charges,tenure
931,0.542697,0.927528
1398,-0.992661,-1.239274
1561,0.907740,-0.835478
1006,1.186868,0.747848
506,-0.954836,0.660292
...,...,...
1170,-0.983129,1.019660
1128,0.815909,0.836925
350,0.346981,0.927528
887,0.253081,-1.326531


In [13]:
def min_max_scaler(train, test):
    """Scale ``train`` and ``test`` with a ``MinMaxScaler`` fit on ``train``.

    The scaler is fit on ``train`` only; the same fitted object then
    transforms both frames.

    Parameters
    ----------
    train, test : DataFrames; their ``columns`` and ``index`` are reused
        for the scaled results.

    Returns
    -------
    tuple
        ``(scaler, train_scaled, test_scaled)``.
    """
    minmax = skl.MinMaxScaler()
    minmax.fit(train)

    # Build both scaled frames in one pass, train first and test second.
    scaled_frames = [
        pd.DataFrame(minmax.transform(part),
                     index=part.index, columns=part.columns)
        for part in (train, test)
    ]
    return minmax, scaled_frames[0], scaled_frames[1]

In [14]:
# Rebinds scaler / train_scaled / test_scaled to the MinMaxScaler results.
scaler, train_scaled, test_scaled = min_max_scaler(X_train, X_test)
test_scaled

Unnamed: 0,monthly_charges,tenure
931,0.547583,0.986111
1398,0.065770,0.541667
1561,0.718485,0.652778
1006,0.866467,0.958333
506,0.073742,0.944444
...,...,...
1170,0.067763,1.000000
1128,0.673144,0.972222
350,0.465869,0.986111
887,0.428999,0.513889


In [15]:
def iqr_robust_scaler(train, test):
    """Scale ``train`` and ``test`` with a ``RobustScaler`` fit on ``train``.

    The scaler is configured with ``quantile_range=(25.0, 75.0)``, fit on
    ``train`` only, and then applied to both frames.

    Parameters
    ----------
    train, test : DataFrames; their ``columns`` and ``index`` are reused
        for the scaled results.

    Returns
    -------
    tuple
        ``(scaler, train_scaled, test_scaled)``.
    """
    robust = skl.RobustScaler(quantile_range=(25.0, 75.0))
    robust.fit(train)

    # Transform first, then re-attach the labels that transform() drops.
    train_values = robust.transform(train)
    test_values = robust.transform(test)

    train_scaled = pd.DataFrame(train_values, index=train.index,
                                columns=train.columns)
    test_scaled = pd.DataFrame(test_values, index=test.index,
                               columns=test.columns)
    return robust, train_scaled, test_scaled

In [16]:
# Rebinds scaler / train_scaled / test_scaled to the RobustScaler results.
scaler,train_scaled,test_scaled = iqr_robust_scaler(X_train,X_test)
test_scaled

Unnamed: 0,monthly_charges,tenure
931,0.179531,0.304348
1398,-0.546854,-1.086957
1561,0.437183,-0.739130
1006,0.660282,0.217391
506,-0.534836,0.173913
...,...,...
1170,-0.543850,0.347826
1128,0.368826,0.260870
350,0.056338,0.304348
887,0.000751,-1.173913
