In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler

from wrangle import wrangle_telco
import split_scale

# Read in Telco df using wrangle_telco() function

**<font color=purple>Ok, so look at the Telco dataframe. See that object column? Yeah, ask me about my rabbit hole trying to be super specific with this exact DataFrame instead of focussing on the broader use case for a function like this. Just because you CAN do something doesn't always mean you should waste your time doing it. ; )</font>**

In [2]:
df = wrangle_telco()

In [3]:
df.head(1)

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0013-SMEOE,109.7,71,7904.25


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1695 entries, 0 to 1694
Data columns (total 4 columns):
customer_id        1695 non-null object
monthly_charges    1695 non-null float64
tenure             1695 non-null int64
total_charges      1695 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 53.1+ KB


# Split and Scale data using split_my_data() Function

- A little function that does one thing! Why?


- You will not always need to scale your data! You may not want to split your data in the same way to feed into your scaling functions. **Baby functions can be your friends.**

In [5]:
def split_my_data(df, train_pct=0.70, seed=123):
    """
    Partition df into a train piece and a test piece.

    df:        the data to split; handed to train_test_split unchanged
    train_pct: forwarded as train_size (default 0.70)
    seed:      forwarded as random_state so the same split can be
               reproduced on a re-run (default 123)

    Returns a (train, test) tuple.
    """
    train_part, test_part = train_test_split(
        df,
        random_state=seed,
        train_size=train_pct,
    )
    return train_part, test_part

In [6]:
train, test = split_my_data(df)

In [7]:
train.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
1469,8661-BOYNW,84.4,72,6096.45
163,0960-HUWBM,104.1,65,6700.05
392,2346-LOCWC,20.5,58,1191.4
1546,9114-DPSIA,81.0,72,5750.0
797,4891-NLUBA,61.45,61,3751.15


In [8]:
test.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
252,1494-EJZDW,20.15,10,220.8
632,3795-GWTRD,75.55,63,4707.85
472,2900-PHPLN,19.55,70,1462.05
1029,6211-WWLTF,99.7,63,6330.4
910,5494-WOZRZ,82.0,71,5999.85


In [9]:
# Both pieces should keep all 4 columns; only the row counts should differ

print(train.shape, test.shape, sep='\n')

(1186, 4)
(509, 4)


# Create a standard_scaler function

- After splitting your df into train and test, decide on the values that will be your independent variables, X, and dependent variable, y.


- Create your X_train, X_test, y_train, y_test dfs.


- We will only scale `X_train` and `X_test` right now, but you could feed `y_train` and `y_test` through these functions if you wanted to.

In [10]:
# Independent variable (X) and dependent variable (y), each kept as a
# single-column DataFrame (double brackets) so index and column names survive.
X_train = train[['tenure']]
X_test = test[['tenure']]
# Select the target column from the split frames; a bare [['total_charges']]
# is just a nested Python list, not data.
y_train = train[['total_charges']]
y_test = test[['total_charges']]

In [11]:
X_train.head()

Unnamed: 0,tenure
1469,72
163,65
392,58
1546,72
797,61


In [12]:
X_test.head()

Unnamed: 0,tenure
252,10
632,63
472,70
1029,63
910,71


In [13]:
def standard_scaler(X_train, X_test):
    """
    Standardize X_train and X_test with a StandardScaler.

    The scaler is fit on X_train only and then applied to both frames,
    so X_test is scaled with the statistics learned from X_train.
    Both inputs must be DataFrames holding numeric values only.

    Returns scaler, X_train_scaled, X_test_scaled; the scaled frames keep
    the column names and index of their unscaled counterparts.
    """
    scaler = StandardScaler()
    scaler.fit(X_train)

    def _as_frame(source):
        # transform() hands back a bare array, so re-attach the labels
        return pd.DataFrame(
            scaler.transform(source),
            index=source.index,
            columns=source.columns,
        )

    X_train_scaled = _as_frame(X_train)
    X_test_scaled = _as_frame(X_test)
    return scaler, X_train_scaled, X_test_scaled

In [14]:
# test the function defined in this notebook (the copy that lives in
# split_scale.py is exercised separately in the validation notebook)

scaler, X_train_scaled, X_test_scaled = standard_scaler(X_train, X_test)

In [15]:
# validate that it can return the scaler if I need to revert to unscaled

scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [16]:
# validate X_train_scaled

X_train_scaled.head()

Unnamed: 0,tenure
1469,0.838946
163,0.441745
392,0.044543
1546,0.838946
797,0.214772


In [17]:
# validate X_test_scaled

X_test_scaled.head()

Unnamed: 0,tenure
252,-2.679126
632,0.328258
472,0.72546
1029,0.328258
910,0.782203


### Pretty, pretty scaled data...

# Validate my standard_scaler() function in validation notebook

### Could I attach my X_train_scaled df to my original train df?

In [18]:
# Index-on-index inner merge; the shared 'tenure' column picks up the
# default _x / _y suffixes.
train_all = pd.merge(
    train,
    X_train_scaled,
    how='inner',
    left_index=True,
    right_index=True,
)
train_all.columns

Index(['customer_id', 'monthly_charges', 'tenure_x', 'total_charges',
       'tenure_y'],
      dtype='object')

In [19]:
train_all.head()

Unnamed: 0,customer_id,monthly_charges,tenure_x,total_charges,tenure_y
1469,8661-BOYNW,84.4,72,6096.45,0.838946
163,0960-HUWBM,104.1,65,6700.05,0.441745
392,2346-LOCWC,20.5,58,1191.4,0.044543
1546,9114-DPSIA,81.0,72,5750.0,0.838946
797,4891-NLUBA,61.45,61,3751.15,0.214772


In [20]:
# Rename by explicit mapping rather than by position, so a change in the
# merged column order cannot silently mislabel a column. Re-running the cell
# is harmless: once the old names are gone the mapping matches nothing.
train_all = train_all.rename(columns={'tenure_x': 'tenure',
                                      'tenure_y': 'tenure_scaled'})

In [21]:
train_all.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges,tenure_scaled
1469,8661-BOYNW,84.4,72,6096.45,0.838946
163,0960-HUWBM,104.1,65,6700.05,0.441745
392,2346-LOCWC,20.5,58,1191.4,0.044543
1546,9114-DPSIA,81.0,72,5750.0,0.838946
797,4891-NLUBA,61.45,61,3751.15,0.214772


### Looks like I can by merging my train and X_train_scaled on their shared index. Good to know.

# Yes!! Now let's do the inverse!

In [22]:
def scale_inverse(scaler, X_train_scaled, X_test_scaled):
    """Undo a scaling step.

       Takes in a fitted scaler plus the X_train_scaled and X_test_scaled
       dfs it produced, runs each through scaler.inverse_transform, and
       returns X_train_unscaled, X_test_unscaled as dfs carrying the same
       columns and index as the scaled inputs.
    """
    restored = [
        pd.DataFrame(
            scaler.inverse_transform(scaled),
            index=scaled.index,
            columns=scaled.columns,
        )
        for scaled in (X_train_scaled, X_test_scaled)
    ]
    return restored[0], restored[1]

In [23]:
X_train_unscaled, X_test_unscaled = scale_inverse(scaler, X_train_scaled, X_test_scaled)

In [24]:
# Validate our DFs are unscaled

X_train_unscaled.head()

Unnamed: 0,tenure
1469,72.0
163,65.0
392,58.0
1546,72.0
797,61.0


In [25]:
X_test_unscaled.head()

Unnamed: 0,tenure
252,10.0
632,63.0
472,70.0
1029,63.0
910,71.0


## Validate that my scale_inverse() function works in the validation notebook

# Create and test uniform_scaler() function

In [26]:
def uniform_scaler(X_train, X_test):
    """Quantile transformer: a non-linear transformation to a uniform
       output distribution. Reduces the impact of outliers and smooths
       out unusual distributions.

       Takes in X_train and X_test dfs; the transformer is fit on X_train
       only and then applied to both.
       Returns the scaler, X_train_scaled, X_test_scaled (dfs with the
       original columns and index).
    """
    quantile_settings = {
        'n_quantiles': 100,
        'output_distribution': 'uniform',
        'random_state': 123,
        'copy': True,
    }
    scaler = QuantileTransformer(**quantile_settings)
    scaler = scaler.fit(X_train)

    train_values = scaler.transform(X_train)
    X_train_scaled = pd.DataFrame(train_values,
                                  index=X_train.index,
                                  columns=X_train.columns)

    test_values = scaler.transform(X_test)
    X_test_scaled = pd.DataFrame(test_values,
                                 index=X_test.index,
                                 columns=X_test.columns)

    return scaler, X_train_scaled, X_test_scaled

In [27]:
scaler, X_train_scaled, X_test_scaled = uniform_scaler(X_train, X_test)

In [28]:
scaler

QuantileTransformer(copy=True, ignore_implicit_zeros=False, n_quantiles=100,
                    output_distribution='uniform', random_state=123,
                    subsample=100000)

In [29]:
X_train_scaled.head()

Unnamed: 0,tenure
1469,1.0
163,0.510101
392,0.373737
1546,1.0
797,0.424242


In [30]:
X_test_scaled.head()

Unnamed: 0,tenure
252,0.025253
632,0.469697
472,0.691919
1029,0.469697
910,0.757576


## Validate my uniform_scaler() function in my Validation notebook

# Create and test the gaussian_scaler() function

In [31]:
X_train.head(1)

Unnamed: 0,tenure
1469,72


In [32]:
X_test.head(1)

Unnamed: 0,tenure
252,10


In [33]:
def gaussian_scaler(X_train, X_test, method='yeo-johnson', standardize=False):
    """Applies a power transform to make the data more Gaussian-like.

       Takes in X_train and X_test dfs; the transformer is fit on X_train
       only and then applied to both.

       method:      'yeo-johnson' (default) allows negative data,
                    'box-cox' allows positive data only.
       standardize: passed straight to PowerTransformer. With the default,
                    False, the output is only power-transformed and is NOT
                    rescaled to zero mean / unit variance (this is the
                    behavior the function has always had). Pass True to
                    also get zero-mean, unit-variance output.

       Returns the scaler, X_train_scaled, X_test_scaled (dfs with the
       original columns and index).
    """
    scaler = (PowerTransformer(method=method,
                               standardize=standardize,
                               copy=True)
                              .fit(X_train))
    X_train_scaled = pd.DataFrame(scaler.transform(X_train),
                                  columns=X_train.columns,
                                  index=X_train.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                                 columns=X_test.columns,
                                 index=X_test.index)
    return scaler, X_train_scaled, X_test_scaled

In [34]:
scaler, X_train_scaled, X_test_scaled = gaussian_scaler(X_train, X_test)

In [35]:
scaler

PowerTransformer(copy=True, method='yeo-johnson', standardize=False)

In [36]:
X_train_scaled.head()

Unnamed: 0,tenure
1469,9950.70254
163,7856.438822
392,6040.584127
1546,9950.70254
797,6785.375569


In [37]:
X_test_scaled.head()

Unnamed: 0,tenure
252,117.379965
632,7309.661082
472,9323.346607
1029,7309.661082
910,9634.096705


## Validate my gaussian_scaler() function in my Validation notebook

# Create and test the min_max_scaler() function

In [38]:
X_train.head(1)

Unnamed: 0,tenure
1469,72


In [39]:
def min_max_scaler(X_train, X_test):
    """Rescales each feature into the range 0 to 1.

       Takes in X_train and X_test dfs; the scaler is fit on X_train only
       and then applied to both, so X_test values can land outside the
       range when they fall outside what X_train contained.
       Sensitive to outliers.

       Returns the scaler, X_train_scaled, X_test_scaled (dfs with the
       original columns and index).
    """
    scaler = MinMaxScaler(feature_range=(0,1), copy=True).fit(X_train)

    scaled_frames = []
    for frame in (X_train, X_test):
        scaled_frames.append(
            pd.DataFrame(scaler.transform(frame),
                         index=frame.index,
                         columns=frame.columns)
        )

    X_train_scaled, X_test_scaled = scaled_frames
    return scaler, X_train_scaled, X_test_scaled

In [40]:
scaler, X_train_scaled, X_test_scaled = min_max_scaler(X_train, X_test)

In [41]:
scaler

MinMaxScaler(copy=True, feature_range=(0, 1))

In [42]:
X_train_scaled.head()

Unnamed: 0,tenure
1469,1.0
163,0.901408
392,0.802817
1546,1.0
797,0.84507


In [43]:
X_test_scaled.head()

Unnamed: 0,tenure
252,0.126761
632,0.873239
472,0.971831
1029,0.873239
910,0.985915


## Validate my min_max_scaler() function in my Validation notebook

# Create and test the iqr_robust_scaler() function

In [44]:
X_train.head(1)

Unnamed: 0,tenure
1469,72


In [45]:
def iqr_robust_scaler(X_train, X_test):
    """Scales features with statistics that are robust to outliers:
       the median is removed and the data is scaled to the IQR
       (25th to 75th percentile).

       Takes in X_train and X_test dfs; the scaler is fit on X_train only
       and then applied to both.
       Returns the scaler, X_train_scaled, X_test_scaled (dfs with the
       original columns and index).
    """
    robust = RobustScaler(
        with_centering=True,
        with_scaling=True,
        quantile_range=(25.0,75.0),
        copy=True,
    )
    scaler = robust.fit(X_train)

    X_train_scaled, X_test_scaled = (
        pd.DataFrame(scaler.transform(part),
                     columns=part.columns,
                     index=part.index)
        for part in (X_train, X_test)
    )
    return scaler, X_train_scaled, X_test_scaled

In [46]:
scaler, X_train_scaled, X_test_scaled = iqr_robust_scaler(X_train, X_test)

In [47]:
scaler

RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True)

In [48]:
X_train_scaled.head()

Unnamed: 0,tenure
1469,0.363636
163,0.045455
392,-0.272727
1546,0.363636
797,-0.136364


In [49]:
X_test_scaled.head()

Unnamed: 0,tenure
252,-2.454545
632,-0.045455
472,0.272727
1029,-0.045455
910,0.318182


## Validate my iqr_robust_scaler() function in my Validation notebook

# Boom! Ready for Data Science!

- Want more detailed information about how each scaler works and the best use cases for each? Here's the link to an article I found useful!

https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02