# Classification Module
## Corey Solitaire
### 9.24.2020

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt
from scipy import stats

from sklearn.model_selection import train_test_split
import warnings
# Silence every warning category for the rest of the session to keep cell output tidy.
# NOTE(review): this also hides deprecation/future warnings raised by libraries used below.
warnings.filterwarnings("ignore")

# Import Data
from acquire import get_telco_data
# Transform Data
from wrangle import wrangle_telco
# Split Data
from split_scale import train_valid_test
# Scale Data
from split_scale import standard_scaler, uniform_scaler, gaussian_scaler, min_max_scaler, iqr_robust_scaler 
# Inverse Scale
from split_scale import scale_inverse

# Acquisition and Prep (testing my acquire file)

In [None]:
# Load the telco data through the project's acquire helper and echo the frame as
# the cell output. `cached=True` presumably reads a local copy instead of
# re-querying the source — confirm in acquire.py.
df = get_telco_data(cached = True)
df

In [None]:
# (rows, columns) of the freshly acquired frame
df.shape

In [None]:
# Preview the first rows of the acquired frame
df.head()

In [None]:
# Summary statistics for the acquired frame
df.describe()

In [None]:
# Column dtypes and non-null counts of the acquired frame
df.info()

In [None]:
# Number of missing values in each column
df.isnull().sum()

### Takeaway:
1. Change total_charges to float, and replace NaN values with 0

In [None]:
# Coerce total_charges to numeric (entries that cannot be parsed become NaN), then
# replace those NaN values with 0. The result is assigned back to the column rather
# than calling fillna(inplace=True) on a column selection, because pandas does not
# guarantee that an in-place call on such a selection writes through to df.
df['total_charges'] = pd.to_numeric(df['total_charges'], errors='coerce').fillna(0)

In [None]:
# Re-check dtypes and non-null counts after the total_charges conversion
df.info()

### Added changes to wrangle file, time to test if they work (Testing my wrangle file)

In [None]:
# Rebuild df from a fresh acquire call passed through the wrangle helper, so the
# frame reflects wrangle.py rather than the manual edits made earlier in this notebook.
df = wrangle_telco(get_telco_data(cached = True))

In [None]:
# (rows, columns) of the wrangled frame
df.shape

In [None]:
# Preview the first rows of the wrangled frame
df.head()

In [None]:
# Summary statistics for the wrangled frame
df.describe()

In [None]:
# Column dtypes and non-null counts of the wrangled frame
df.info()

In [None]:
# Number of missing values in each column of the wrangled frame
df.isnull().sum()

### Data Visualization

In [None]:
# Histograms

# Visualize the distribution of each continuous variable, one panel per column.
columns = ['monthly_charges', 'tenure', 'total_charges']

plt.figure(figsize=(16, 3))

# Count from 1 because subplot positions are 1-based.
for plot_number, col in enumerate(columns, start=1):
    # Size the grid from the column list so no empty trailing panel is drawn.
    plt.subplot(1, len(columns), plot_number)
    plt.title(col)
    df[col].hist(bins=5)

### Takeaway:
1. For customers with two year contracts: Monthly charges are right-skewed (mass concentrated at the low end), with the majority of the charges between 20 - 40 dollars
2. For customers with two year contracts: Tenure is left-skewed (mass concentrated at the high end), with the majority of customers > 60 months
3. For customers with two year contracts: Total charges are right-skewed (mass concentrated at the low end), with the majority of the charges between 0 and 2000 dollars

**When it is time to split data we will need to stratify on total_charges because the distribution of charges is NOT balanced**

In [None]:
# Boxplots

# Plot every column except `customer_id` on a single set of axes.
plt.figure(figsize=(8, 4))
plot_data = df.drop(columns=['customer_id'])
sns.boxplot(data=plot_data)

### Takeaway:
1. The variation in price (0 - 8,600 dollars) is too high to make a boxplot useful

## Cutting/Binning

Binning lets us turn a continuous variable into a categorical one by putting the numerical feature's values into bins
- `bins` = the number of bins (or an explicit list of bin edges)
- **pd.cut** creates bins of equal width
- **pd.qcut** creates bins with same number of observations in each

In both cases the range of the bins is determined from the data (but we can also specify it ourselves)

**This can be helpful for initial exploration, interpretation, and visualization.**

In [None]:
# Ten equal-width tenure bins (each interval is closed on the right),
# tallied by how many rows fall into each.
tenure_equal_width = pd.cut(df['tenure'], bins=10)
tenure_equal_width.value_counts()

In [None]:
# Four quantile-based tenure bins (closed on the right): similar row counts
# per bin, so the interval widths differ. Counts are listed in bin order.
tenure_quartiles = pd.qcut(df['tenure'], 4)
tenure_quartiles.value_counts().sort_index()

# Splitting and Scaling Numeric Data

In [None]:
# Split the wrangled frame into three partitions and report the shape of each
train, validate, test = train_valid_test(df)
for label, part in (('train', train), ('validate', validate), ('test', test)):
    print(label, part.shape)

In [None]:
# Preview the first rows of the training split
train.head()

In [None]:
# Histograms of the unscaled training split, as a baseline for the scaled versions below
train.hist()
plt.show()

 ## Standard Scaler (Linear)

In [None]:
# Fresh split, then run all three partitions through standard_scaler.
# The call hands back the scaler object plus the scaled train/validate/test.
train, validate, test = train_valid_test(df)
scaler1, train1, validate1, test1 = standard_scaler(train, validate, test)
# Report the scaler and the scaled training shape, one per line
print(scaler1, train1.shape, sep='\n')
train1.head()

In [None]:
# Histograms of the training split after standard_scaler
train1.hist()
plt.show()

In [None]:
# Inverse Scale

# Pass train1 back through scale_inverse with scaler1 (expected to recover the
# original units — confirm in split_scale.py). Note that this rebinds `train`.
train = scale_inverse(scaler1, train1)
train

## Uniform Scaler (Non Linear)

In [None]:
# Fresh split, then run all three partitions through uniform_scaler.
# The call hands back the scaler object plus the scaled train/validate/test.
train, validate, test = train_valid_test(df)
scaler2, train2, validate2, test2 = uniform_scaler(train, validate, test)
# Report the scaler and the scaled training shape, one per line
print(scaler2, train2.shape, sep='\n')
train2.head()

In [None]:
# Histograms of the training split after uniform_scaler
train2.hist()
plt.show()

In [None]:
# Inverse Scale

# Pass train2 back through scale_inverse with scaler2 (expected to recover the
# original units — confirm in split_scale.py). No re-split is done here: `train`
# is rebound by this call, and the next scaling cell performs its own split
# before validate/test are used again.
train = scale_inverse(scaler2, train2)
train

## Gaussian Scaler (Non Linear)

In [None]:
# Fresh split, then run all three partitions through gaussian_scaler.
# The call hands back the scaler object plus the scaled train/validate/test.
train, validate, test = train_valid_test(df)
scaler3, train3, validate3, test3 = gaussian_scaler(train, validate, test)
# Report the scaler and the scaled training shape, one per line
print(scaler3, train3.shape, sep='\n')
train3.head()

In [None]:
# Histograms of the training split after gaussian_scaler
train3.hist()
plt.show()

In [None]:
# Inverse Scale

# Pass train3 back through scale_inverse with scaler3 (expected to recover the
# original units — confirm in split_scale.py). Note that this rebinds `train`.
train = scale_inverse(scaler3, train3)
train

## Min_Max Scaler (Linear)
**Should be used as default to mimic range of boolean values (0-1)**

In [None]:
# Fresh split, then run all three partitions through min_max_scaler.
# The call hands back the scaler object plus the scaled train/validate/test.
train, validate, test = train_valid_test(df)
scaler4, train4, validate4, test4 = min_max_scaler(train, validate, test)
# Report the scaler and the scaled training shape, one per line
print(scaler4, train4.shape, sep='\n')
train4.head()

In [None]:
# Histograms of the training split after min_max_scaler
train4.hist()
plt.show()

In [None]:
# Inverse Scale

# Pass train4 back through scale_inverse with scaler4 (expected to recover the
# original units — confirm in split_scale.py). Note that this rebinds `train`.
train = scale_inverse(scaler4, train4)
train

## Robust Scaler (Linear)

In [None]:
# Fresh split, then run all three partitions through iqr_robust_scaler.
# The call hands back the scaler object plus the scaled train/validate/test.
train, validate, test = train_valid_test(df)
scaler5, train5, validate5, test5 = iqr_robust_scaler(train, validate, test)
# Report the scaler and the scaled training shape, one per line
print(scaler5, train5.shape, sep='\n')
train5.head()

In [None]:
# Histograms of the training split after iqr_robust_scaler
train5.hist()
plt.show()

In [None]:
# Inverse Scale

# Pass train5 back through scale_inverse with scaler5 (expected to recover the
# original units — confirm in split_scale.py). Note that this rebinds `train`.
train = scale_inverse(scaler5, train5)
train

## Takeaway:
**For this dataset, the uniform scaler provides the best output distribution**

### Setup for All Inclusive Wrangle File

In [1]:
import pandas as pd
import numpy as np
import scipy as sp 
import os
from env import host, user, password
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler
from wrangle import wrangle_telco, get_telco_data

In [2]:
# Here wrangle_telco's result is unpacked into three splits, whereas earlier cells
# bound it to a single frame.
# NOTE(review): this relies on the updated all-inclusive wrangle.py — confirm its return value.
train, validate, test = wrangle_telco(get_telco_data(cached = True))

In [3]:
# Show the first row of the training split returned by the wrangle helper
train.head(1)

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges,total_charges_scaled,monthly_charges_scaled,tenure_scaled
1256,7501-IWUNG,73.8,61,4616.05,0.53241,0.552618,0.847222
