# Classification Module
## Corey Solitaire
### 9.24.2020

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt
from scipy import stats

from sklearn.model_selection import train_test_split
import warnings
# NOTE(review): this silences every warning category for the whole kernel session,
# including pandas deprecation / FutureWarning notices raised by cells further down.
# Consider narrowing the filter to a specific category or module.
warnings.filterwarnings("ignore")

# Import Data
from acquire import get_telco_data
# Transform Data
from wrangle import wrangle_telco
# Split Data
from split_scale import train_valid_test
# Scale Data
from split_scale import standard_scaler, uniform_scaler, gaussian_scaler, min_max_scaler, iqr_robust_scaler 
# Inverse Scale
from split_scale import scale_inverse

# Acquisition and Prep (testing my acquire file)

In [2]:
# Load the telco data through the project's acquire helper.
# cached=True is passed straight to get_telco_data; presumably it reads a local copy
# instead of re-querying the source — TODO confirm in acquire.py.
df = get_telco_data(cached = True)
# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0013-SMEOE,109.70,71,7904.25
1,0014-BMAQU,84.65,63,5377.8
2,0016-QLJIS,90.45,65,5957.9
3,0017-DINOC,45.20,54,2460.55
4,0017-IUDMW,116.80,72,8456.75
...,...,...,...,...
1690,9964-WBQDJ,24.40,71,1725.4
1691,9972-EWRJS,19.25,67,1372.9
1692,9975-GPKZU,19.75,46,856.5
1693,9993-LHIEB,67.85,67,4627.65


In [3]:
# (rows, columns) of the frame returned by get_telco_data.
df.shape

(1695, 4)

In [4]:
# Preview the first five rows.
df.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0013-SMEOE,109.7,71,7904.25
1,0014-BMAQU,84.65,63,5377.8
2,0016-QLJIS,90.45,65,5957.9
3,0017-DINOC,45.2,54,2460.55
4,0017-IUDMW,116.8,72,8456.75


In [5]:
# Summary statistics. Only numeric-dtype columns are summarized by default, so a
# column stored as strings (object dtype) will be missing from this table.
df.describe()

Unnamed: 0,monthly_charges,tenure
count,1695.0,1695.0
mean,60.770413,56.735103
std,34.678865,18.209363
min,18.4,0.0
25%,24.025,48.0
50%,64.35,64.0
75%,90.45,71.0
max,118.75,72.0


In [6]:
# Column dtypes and non-null counts; used here to spot columns read in with the wrong dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1695 entries, 0 to 1694
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customer_id      1695 non-null   object 
 1   monthly_charges  1695 non-null   float64
 2   tenure           1695 non-null   int64  
 3   total_charges    1695 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 53.1+ KB


In [7]:
# Count missing values per column. This only detects real NaN/None values; blank or
# otherwise non-numeric strings inside an object column are not counted as null.
df.isnull().sum()

customer_id        0
monthly_charges    0
tenure             0
total_charges      0
dtype: int64

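### Takeaway:
1. `total_charges` was read in as `object` (strings), not as a number. Change total_charges to float; any entry that cannot be parsed as a number becomes NaN during the conversion, so replace those NaN values with 0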

In [8]:
# Convert total_charges from strings to float. errors='coerce' turns any value that
# cannot be parsed as a number into NaN, and those NaN values are then set to 0.
# The result is assigned back to the column rather than calling
# df[...].fillna(inplace=True): the in-place form mutates an intermediate object,
# which pandas warns about and which leaves df unchanged under Copy-on-Write.
df['total_charges'] = pd.to_numeric(df['total_charges'], errors='coerce').fillna(0)

In [9]:
# Re-check the dtypes: total_charges should now be float64 with no nulls.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1695 entries, 0 to 1694
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customer_id      1695 non-null   object 
 1   monthly_charges  1695 non-null   float64
 2   tenure           1695 non-null   int64  
 3   total_charges    1695 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 53.1+ KB


### Added changes to wrangle file, time to test if they work (testing my wrangle file)

In [None]:
# Re-acquire the data and pass it through wrangle_telco, rebinding `df`. This replaces
# the frame that was cleaned by hand in the cells above, so everything below uses the
# output of the wrangle module.
df = wrangle_telco(get_telco_data(cached = True))

In [None]:
# Shape after wrangle_telco; compare with the raw acquire shape to see whether rows or columns changed.
df.shape

In [None]:
# Preview the first five rows of the wrangled frame.
df.head()

In [None]:
# Summary statistics of the wrangled frame. total_charges should show up here if
# wrangle_telco converted it to a numeric dtype — confirm in the output.
df.describe()

In [None]:
# Check dtypes and non-null counts after wrangle_telco.
df.info()

In [None]:
# Count nulls per column in the wrangled frame.
df.isnull().sum()

### Data Visualization

In [None]:
# Histograms

# Visualize the distribution for continuous variables.
# The subplot grid is sized from the column list so there is no empty trailing
# panel, and each histogram is drawn on its own explicit Axes.
hist_cols = ['monthly_charges', 'tenure', 'total_charges']

# squeeze=False keeps `axes` two-dimensional even if the list holds a single column.
fig, axes = plt.subplots(1, len(hist_cols), figsize=(16, 3), squeeze=False)

for ax, col in zip(axes[0], hist_cols):
    df[col].hist(bins=5, ax=ax)
    ax.set_title(col)

fig.tight_layout()
plt.show()

### Takeaway:
1. For customers with two year contracts: Monthly charges are concentrated at the low end, with the largest group of charges between 20 - 40 dollars
2. For customers with two year contracts: Tenure is concentrated at the high end, with the majority of customers > 60 months
3. For customers with two year contracts: Total charges are concentrated at the low end, with the majority of the charges between 0 and 2000 dollars

**When it is time to split the data we will need to stratify on total_charges because the distribution of charges is NOT balanced**

In [None]:
# Boxplots

# `customer_id` is dropped before plotting; every remaining column gets one box
# on a shared axis.
plt.figure(figsize=(8, 4))
sns.boxplot(data=df.drop('customer_id', axis=1))

### Takeaway:
1. The variation in price (0 - 8,600 dollars for total_charges) is too high compared with the other columns to make a boxplot on one shared axis useful

## Cutting/Binning

Lets us turn a continuous variable into a categorical one by putting the numerical values into bins
- **bins** = the number of bins, or the explicit bin edges
- **pd.cut** creates bins of equal width
- **pd.qcut** creates bins with (approximately) the same number of observations in each

In both cases the range of the bins is determined from the data (but we can also specify it ourselves)

**This can be helpful for initial exploration, interpretation, and visualization.**

In [None]:
# Equal-width bins; the right edge of each bin is inclusive.
# sort_index() lists the bins in interval order instead of by descending count,
# which keeps the output readable and consistent with the qcut cell below.

pd.cut(df.tenure, bins=10).value_counts().sort_index()

In [None]:
# Quantile bins (q=4): similar counts per bin, so the bin widths differ.
# The right edge of each bin is inclusive; results are listed in interval order.

pd.qcut(df['tenure'], q=4).value_counts().sort_index()

# Splitting and Scaling Numeric Data

In [None]:
# Split the wrangled frame into train / validate / test and report each split's shape.
# NOTE(review): confirm train_valid_test fixes a random seed so the split is reproducible.
train, validate, test = train_valid_test(df)
for split_name, split_frame in (('train', train), ('validate', validate), ('test', test)):
    print(split_name, split_frame.shape)

In [None]:
# Peek at the test split returned by train_valid_test.
test.head()

In [None]:
# Histograms of the test split before any scaling: the baseline to compare the
# scaled histograms against.
test.hist()
plt.show()

## Standard Scaler

In [None]:
# Scale Data
# standard_scaler hands back the scaler object followed by the three transformed splits.
scaler1, train1, validate1, test1 = standard_scaler(train, validate, test)
print(scaler1, test1.shape, sep='\n')
test1.head()

In [None]:
# Histograms of the test split as returned by standard_scaler.
test1.hist()
plt.show()

In [None]:
# Inverse Scale
# Store the round-trip result under its own name. Rebinding `test` here would replace
# the original split with inverse-transformed data, and every later cell that scales
# `test` would silently start from that instead of the real split.
test1_inverse = scale_inverse(scaler1, test1)
test1_inverse

## Uniform Scaler

In [None]:
# Scale Data
# uniform_scaler hands back the scaler object followed by the three transformed splits.
scaler2, train2, validate2, test2 = uniform_scaler(train, validate, test)
print(scaler2, test2.shape, sep='\n')
test2.head()

In [None]:
# Histograms of the test split as returned by uniform_scaler.
test2.hist()
plt.show()

In [None]:
# Inverse Scale
# Store the round-trip result under its own name. Rebinding `test` here would replace
# the original split with inverse-transformed data, and every later cell that scales
# `test` would silently start from that instead of the real split.
test2_inverse = scale_inverse(scaler2, test2)
test2_inverse

## Gaussian Scaler

In [None]:
# Scale Data
# gaussian_scaler hands back the scaler object followed by the three transformed splits.
scaler3, train3, validate3, test3 = gaussian_scaler(train, validate, test)
print(scaler3, test3.shape, sep='\n')
test3.head()

In [None]:
# Histograms of the test split as returned by gaussian_scaler.
test3.hist()
plt.show()

In [None]:
# Inverse Scale
# Store the round-trip result under its own name. Rebinding `test` here would replace
# the original split with inverse-transformed data, and every later cell that scales
# `test` would silently start from that instead of the real split.
test3_inverse = scale_inverse(scaler3, test3)
test3_inverse

## Min_Max Scaler

In [None]:
# Scale Data
# min_max_scaler hands back the scaler object followed by the three transformed splits.
scaler4, train4, validate4, test4 = min_max_scaler(train, validate, test)
print(scaler4, test4.shape, sep='\n')
test4.head()

In [None]:
# Histograms of the test split as returned by min_max_scaler.
test4.hist()
plt.show()

In [None]:
# Inverse Scale
# Store the round-trip result under its own name. Rebinding `test` here would replace
# the original split with inverse-transformed data, and every later cell that scales
# `test` would silently start from that instead of the real split.
test4_inverse = scale_inverse(scaler4, test4)
test4_inverse

## Robust Scaler

In [None]:
# Scale Data
# iqr_robust_scaler hands back the scaler object followed by the three transformed splits.
scaler5, train5, validate5, test5 = iqr_robust_scaler(train, validate, test)
print(scaler5, test5.shape, sep='\n')
test5.head()

In [None]:
# Histograms of the test split as returned by iqr_robust_scaler.
test5.hist()
plt.show()

In [None]:
# Inverse Scale
# Store the round-trip result under its own name. Rebinding `test` here would replace
# the original split with inverse-transformed data, so re-running any scaler cell
# afterwards would silently start from that instead of the real split.
test5_inverse = scale_inverse(scaler5, test5)
test5_inverse