In [None]:
import pandas as pd
import numpy as np
import env
import wrangle
import matplotlib.pyplot as plt
import sklearn.preprocessing
from sklearn.model_selection import train_test_split
# np.set_printoptions(suppress=True)

## Record the lesson!!

In [None]:
# data wrangling
df = wrangle.wrangle_telco()

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# 60/20/20 split: hold out 20% of df as test, then carve 25% of the remaining
# 80% (i.e. 20% of the whole) off as validate. The seed is fixed so the split
# is repeatable.
train, test = train_test_split(df, test_size=0.2, random_state=123)
train, validate = train_test_split(train, test_size=0.25, random_state=123)

In [None]:
#check the shape
train.shape, validate.shape, test.shape

In [None]:
train.head()

### 1. Apply the scalers we talked about in this lesson to your data and visualize the results

In [None]:
# Create the scaler and fit it on the train column only, so the scaling
# parameters come from train and nothing else.
scaler = sklearn.preprocessing.MinMaxScaler().fit(train[['monthly_charges']])

# Apply the fitted scaler to that same column; the result is a NumPy array.
scaled = scaler.transform(train[['monthly_charges']])
scaled

In [None]:
#you can make a new 'scaled' column in original dataframe if you wish
    

### 2. Apply the .inverse_transform method to your scaled data. Is the resulting dataset the exact same as the original data?

In [None]:
train.head()

In [None]:
# Scale monthly_charges, then undo the scaling with inverse_transform.
scaler = sklearn.preprocessing.MinMaxScaler()
scaled = scaler.fit_transform(train[['monthly_charges']])

# inverse_transform maps the scaled values back to the original units.
unscaled = scaler.inverse_transform(scaled)

# Compare with a tolerance rather than ==, because the round trip goes through
# floating-point arithmetic and may differ in the last few bits.
np.allclose(unscaled, train[['monthly_charges']].to_numpy())

### 3. Read the documentation for sklearn's QuantileTransformer. Use normal for the output_distribution and apply this scaler to your data. Visualize the result of your data scaling.

### Effect of outliers on scaling 

In [None]:
# Helper that assembles the MySQL connection URL
def get_connection(db, user=env.user, host=env.host, password=env.password):
    """Return a ``mysql+pymysql://`` connection URL for the database ``db``.

    Parameters
    ----------
    db : name of the database placed at the end of the URL.
    user, host, password : connection details; each defaults to the value of
        the same name in the local ``env`` module (read once, when this
        function is defined).

    Returns
    -------
    str : URL of the form ``mysql+pymysql://user:password@host/db``.
    """
    url = f'mysql+pymysql://{user}:{password}@{host}/{db}'
    return url

# Properties joined to their predictions on parcelid, limited to transactions
# dated between 2017-05-01 and 2017-06-30.
query = """
select * 
from properties_2017
join predictions_2017 using(parcelid)
where transactiondate between "2017-05-01" and "2017-06-30";
"""

# Run the query against the 'zillow' database and load the result into a DataFrame.
df = pd.read_sql(sql=query, con=get_connection('zillow'))
df.head()

In [None]:
# Some prep: swap the verbose source column names for short, readable ones
df = df.rename(
    columns={
        "bedroomcnt": "bedrooms",
        "bathroomcnt": "bathrooms",
        "calculatedfinishedsquarefeet": "square_feet",
        "taxamount": "taxes",
        "taxvaluedollarcnt": "tax_value",
    }
)

In [None]:
# Columns kept for the outlier demo; parcelid is included because it becomes
# the index on the next line.
features = ["parcelid", "bedrooms", "bathrooms", "square_feet", "tax_value"]

# Narrow the frame to those columns and index it by parcelid in one step.
df = df[features].set_index("parcelid")
df.head()


In [None]:
# Keep only the rows that have a value in every remaining column
df = df.dropna(axis=0, how="any")
df.head()

In [None]:
# 60/20/20 split: hold out 20% of df as test, then carve 25% of the remaining
# 80% (i.e. 20% of the whole) off as validate. The seed is fixed so the split
# is repeatable.
train, test = train_test_split(df, test_size=0.2, random_state=123)
train, validate = train_test_split(train, test_size=0.25, random_state=123)

In [None]:
# Copy of train without the extreme tax values (anything above 2,000,000)
train_no_outliers = train.loc[train['tax_value'] <= 2_000_000]

### Min-Max Scaler

In [None]:
# Min-max scale tax_value on the full train set, outliers included
scaler = sklearn.preprocessing.MinMaxScaler().fit(train[['tax_value']])
scaled = scaler.transform(train[['tax_value']])
scaled

In [None]:
# Min-max scale tax_value on the train set with the outliers removed.
# This must be fit on scaler1: calling fit_transform on `scaler` would refit
# the with-outliers scaler on this data and leave scaler1 unfitted.
scaler1 = sklearn.preprocessing.MinMaxScaler()
scaled1 = scaler1.fit_transform(train_no_outliers[['tax_value']])
scaled1

In [None]:
# Three side-by-side histograms: raw tax_value next to the two min-max results
fig, (ax_raw, ax_all, ax_trimmed) = plt.subplots(1, 3, figsize=(18, 5))

ax_raw.hist(train.tax_value, bins=30)
ax_raw.set_title('Unscaled')

ax_all.hist(scaled, bins=30)
ax_all.set_title('Min-Max with outliers')

# No bins argument here, so matplotlib's default bin count is used.
ax_trimmed.hist(scaled1)
ax_trimmed.set_title('Min-Max without outliers');

### Robust Scaler

In [None]:
# Robust-scale tax_value on the full train set, outliers included
scaler = sklearn.preprocessing.RobustScaler()
scaled = scaler.fit_transform(train[['tax_value']])

# Same scaler type on the train set with the outliers removed.
# This must be fit on scaler1: calling fit_transform on `scaler` would refit
# the with-outliers scaler on this data and leave scaler1 unfitted.
scaler1 = sklearn.preprocessing.RobustScaler()
scaled1 = scaler1.fit_transform(train_no_outliers[['tax_value']])


In [None]:
# Three side-by-side histograms: raw tax_value next to the two robust-scaled results
fig = plt.figure(figsize = (12,5))

plt.subplot(131)
plt.hist(train.tax_value, bins = 30)
plt.title('Unscaled')

plt.subplot(132)
plt.hist(scaled, bins = 500)
plt.title('Robust with outliers')
# Zoom in on the bulk of the distribution; the long tail extends past this range.
plt.xlim(-1,5)

plt.subplot(133)
plt.hist(scaled1, bins = 100)
# The ';' has to trail the final statement to suppress its text output;
# a ';' on a line of its own is a SyntaxError.
plt.title('Robust without outliers');

### Quantile Transformer

In [None]:
# Quantile-transform tax_value to a normal output distribution, outliers included.
# random_state pins the subsampling QuantileTransformer may do on large inputs,
# so reruns give the same result.
scaler = sklearn.preprocessing.QuantileTransformer(output_distribution='normal', random_state=123)
scaled = scaler.fit_transform(train[['tax_value']])

# Same transformer type on the train set with the outliers removed.
# This must be fit on scaler1: calling fit_transform on `scaler` would refit
# the with-outliers transformer on this data and leave scaler1 unfitted.
scaler1 = sklearn.preprocessing.QuantileTransformer(output_distribution='normal', random_state=123)
scaled1 = scaler1.fit_transform(train_no_outliers[['tax_value']])



In [None]:
# Three side-by-side histograms: raw tax_value next to the two quantile-transformed results
fig = plt.figure(figsize = (12,5))

plt.subplot(131)
plt.hist(train.tax_value, bins = 30)
plt.title('Unscaled')

# Titles name the scaler actually plotted (QuantileTransformer, not RobustScaler).
plt.subplot(132)
plt.hist(scaled, bins = 500)
plt.title('Quantile with outliers')

plt.subplot(133)
plt.hist(scaled1, bins = 100)
# The ';' has to trail the final statement to suppress its text output;
# a ';' on a line of its own is a SyntaxError.
plt.title('Quantile without outliers');

#### Takeaways:

1. Handle outliers first (unless you know that you want to use a non-linear scaler, e.g. QuantileTransformer).


2. MinMaxScaler will transform each value in the column proportionally within the desired range (usually [0, 1]). Use this as your first choice of scaler. It preserves the shape of the distribution (no distortion).


3. StandardScaler() will transform each value in the column so that the column has mean 0 and standard deviation 1. Use StandardScaler if you know the data distribution is normal.


4. If there are outliers (which you don't want to discard), use RobustScaler(). 
    Alternatively you could remove the outliers and use either of the above 2 scalers


5. Good practice to visualize the distribution of variables after scaling (make sure the transformation you were hoping for actually happened)


6. Use non-linear scalers only when you really have to (e.g. QuantileTransformer when you must have normally distributed data).