In [22]:
import pandas as pd
import env
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pydataset import data
import acquire as a
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 
from sklearn.model_selection import train_test_split

# NEW EXERCISE QUESTIONS

## The end product of this exercise should be the specified functions in a python script named prepare.py. Do these in your classification_exercises.ipynb first, then transfer to the prepare.py file.

### This work should all be saved in your local classification-exercises repo. Then add, commit, and push your changes.


#### **Using the Iris Data:**

#### - Use the function defined in acquire.py to load the iris data.

#### - Drop the species_id and measurement_id columns.

#### - Rename the species_name column to just species.
#### - Rename the species_name column to just species
#### - Create dummy variables of the species name and concatenate onto the iris dataframe. (This is for practice, we don't always have to encode the target, but if we used species as a feature, we would need to encode it).
#### - Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

## Use the function defined in acquire.py to load the iris data.

In [267]:
df = a.get_iris_data()

## Drop the species_id and measurement_id columns.

In [268]:
df = df.drop(columns= df[['species_id', 'measurement_id']])

## Rename the species_name column to just species.

In [269]:
df = df.rename(columns={'species_name': 'species'})


## Create dummy variables of the species name and concatenate onto the iris dataframe.

In [270]:
new_df = pd.get_dummies(df['species'])

## Create dummy variables of the species name and concatenate onto the iris dataframe. (This is for practice, we don't always have to encode the target, but if we used species as a feature, we would need to encode it).

In [272]:

df = pd.concat((df,new_df))
df

Unnamed: 0.1,Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,setosa,versicolor,virginica
0,0.0,5.1,3.5,1.4,0.2,setosa,1,0,0
1,1.0,4.9,3.0,1.4,0.2,setosa,1,0,0
2,2.0,4.7,3.2,1.3,0.2,setosa,1,0,0
3,3.0,4.6,3.1,1.5,0.2,setosa,1,0,0
4,4.0,5.0,3.6,1.4,0.2,setosa,1,0,0
...,...,...,...,...,...,...,...,...,...
145,,,,,,,0,0,1
146,,,,,,,0,0,1
147,,,,,,,0,0,1
148,,,,,,,0,0,1


In [212]:
# check for nulls in each category 
df.isnull().sum()

Unnamed: 0      0
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
setosa          0
versicolor      0
virginica       0
setosa          0
versicolor      0
virginica       0
dtype: int64

## Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [220]:
def prep_iris (df): 
    if 'measurement_id' in df.columns:
        dropcols = ['species_id','measurement_id']
    else:
        dropcols = ['species_id']
    return df.drop(
        columns=dropcols).rename(
        columns={'species_name': 'species'})

In [237]:
test = prep_iris(df)
test

Unnamed: 0.1,Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,setosa,versicolor,virginica
0,0,5.1,3.5,1.4,0.2,setosa,1,0,0
1,1,4.9,3.0,1.4,0.2,setosa,1,0,0
2,2,4.7,3.2,1.3,0.2,setosa,1,0,0
3,3,4.6,3.1,1.5,0.2,setosa,1,0,0
4,4,5.0,3.6,1.4,0.2,setosa,1,0,0
5,5,5.4,3.9,1.7,0.4,setosa,1,0,0
6,6,4.6,3.4,1.4,0.3,setosa,1,0,0
7,7,5.0,3.4,1.5,0.2,setosa,1,0,0
8,8,4.4,2.9,1.4,0.2,setosa,1,0,0
9,9,4.9,3.1,1.5,0.1,setosa,1,0,0


## **Using the Titanic dataset**
- **Use the function defined in acquire.py to load the Titanic data.**
    - Drop any unnecessary, unhelpful, or duplicated columns.
    - Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.

### **Create a function named prep_titanic that accepts the raw titanic data, and returns the data with the transformations above applied.**

In [238]:
t_db = a.get_titanic_data()

In [None]:
t_db

In [None]:
t_db = t_db.drop(columns= ['Unnamed: 0','passenger_id'])

In [None]:
t_db

In [None]:
# using boolean masking -> which info is gone from which column?
t_db.isna().sum()[t_db.isna().sum() > 0]

In [None]:
# using boolean masking -> which info is gone from which column?
t_db.isna().sum()[t_db.isna().sum() > 0] / len(t_db)

In [None]:
# conclusions from this process:
# deck looks useless, way too many missing values
# age has almost 20% missing, we may or may not want to drop this one
# it could be valuable, but for the sake of MVP we may drop it.
# I would want to investigate more through analysis to see if these values are meaningful
# embark and embark_town still seem to be the same

In [None]:
# let's see if embark_town truly is the same as embark
(t_db['embarked'].dropna() == t_db['embark_town'].dropna().apply(
    lambda x: x[0])).mean()

In [None]:
# we can fill the null values in embark_town with the most common
# value (southhampton) b y using a fillna()
# we can reassign df['embark_town'] to this, or use an inplace=True
# NOTE!!! inplace=True changes the function to RETURN A NONETYPE
t_db.embark_town.fillna('Southampton',inplace=True)

In [None]:
t_db = t_db.drop(columns='embarked')

In [None]:
t_db.head()

In [None]:
def prep_titanic(df):
    df = df.drop(columns=['passenger_id','embarked','deck','class'])
    df['age'] = df['age'].fillna(df.age.mean())
    df['embark_town'] = df['embark_town'].fillna('Southampton')
    df = pd.concat(
    [df, pd.get_dummies(df[['sex', 'embark_town']],
                        drop_first=True)], axis=1)
    return df

In [239]:
# 20% test, 80% train_validate
# then of the 80% train_validate: 30% validate, 70% train. 

train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df.survived)
train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train.survived)


NameError: name 'train_test_split' is not defined

# **Using the Telco dataset**

> Use the function defined in acquire.py to load the Telco data.

In [225]:
df = a.get_telco_data()

> Drop any unnecessary, unhelpful, or duplicated columns. This could mean dropping foreign key columns but keeping the corresponding string values, for example.

In [226]:
#  first we will check for duplicates
df[df.duplicated()]

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn


In [227]:
# Then we will check for null values 
# this will check for null/ nan values # which produces a true values if it contains 
# a null value if it doesnt.
# since it produces a bool value we can run a .sum agg funct. to get the nummber of 
# null/ nan values we have per catagory. 
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [None]:
# So we can conclude that we do not have any blank values that would drasticly change or data

#df = df.drop(columns= ['MultipleLines', 'customerID'])


> Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.


In [None]:
# first we will visually inspect the columns to see which columns hold categorical data
df.columns
cat_cols = []

Create a function named prep_telco that accepts the raw telco data, and returns the data with the transformations above applied.

Split your data

Write a function to split your data into train, test and validate datasets. Add this function to prepare.py.

Run the function in your notebook on the Iris dataset, returning 3 datasets, train_iris, validate_iris and test_iris.

Run the function on the Titanic dataset, returning 3 datasets, train_titanic, validate_titanic and test_titanic.

Run the function on the Telco dataset, returning 3 datasets, train_telco, validate_telco and test_telco.

In [None]:
def prep_telco(telco):
    telco['total_charges'] = (telco.total_charges + '0').astype(float)
    telco = telco.drop(columns=['internet_service_type_id', 'contract_type_id', 'payment_type_id'])

    telco['gender_encoded'] = telco.gender.map({'Female': 1, 'Male': 0})
    telco['partner_encoded'] = telco.partner.map({'Yes': 1, 'No': 0})
    telco['dependents_encoded'] = telco.dependents.map({'Yes': 1, 'No': 0})
    telco['phone_service_encoded'] = telco.phone_service.map({'Yes': 1, 'No': 0})
    telco['paperless_billing_encoded'] = telco.paperless_billing.map({'Yes': 1, 'No': 0})
    telco['churn_encoded'] = telco.churn.map({'Yes': 1, 'No': 0})
    
    dummy_df = pd.get_dummies(telco[['multiple_lines', \
                              'online_security', \
                              'online_backup', \
                              'device_protection', \
                              'tech_support', \
                              'streaming_tv', \
                              'streaming_movies', \
                              'contract_type', \
                              'internet_service_type', \
                              'payment_type'
                            ]],
                              drop_first=True)
    telco = pd.concat( [telco, dummy_df], axis=1 )
    
    return telco

In [242]:
# 20% test, 80% train_validate
# then of the 80% train_validate: 30% validate, 70% train. 

train, test = train_test_split(t_db, test_size=.2, random_state=123, stratify=t_db.survived)
train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train.survived)


In [244]:
# Validate my split.

print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')



train -> (498, 14)
validate -> (214, 14)
test -> (179, 14)


In [252]:
df = a.get_titanic_data()

In [250]:
def split_data(df):
    '''
    take in a DataFrame and return train, validate, and test DataFrames; stratify on survived.
    return train, validate, test DataFrames.
    '''
    train_validate, test = train_test_split(df, test_size=.2, random_state=123, stratify=df.survived)
    train, validate = train_test_split(train_validate, 
                                       test_size=.3, 
                                       random_state=123, 
                                       stratify=train_validate.survived)
    return train, validate, test


In [258]:
#test out my function 
train, validate, test = split_data(df)

print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')

train -> (498, 14)
validate -> (214, 14)
test -> (179, 14)


In [261]:
def split_data(df):
    '''
    take in a DataFrame and return train, validate, and test DataFrames; stratify on survived.
    return train, validate, test DataFrames.
    '''
    train_validate, test = train_test_split(df, test_size=.2, random_state=123, stratify=df.survived)
    train, validate = train_test_split(train_validate, 
                                       test_size=.3, 
                                       random_state=123, 
                                       stratify=train_validate.survived)
    return train, validate, test

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
561,561,561,0,3,male,40.0,0,0,7.8958,S,Third,,Southampton,1
641,641,641,1,1,female,24.0,0,0,69.3000,C,First,B,Cherbourg,1
400,400,400,1,3,male,39.0,0,0,7.9250,S,Third,,Southampton,1
498,498,498,0,1,female,25.0,1,2,151.5500,S,First,C,Southampton,0
875,875,875,1,3,female,15.0,0,0,7.2250,C,Third,,Cherbourg,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,339,339,0,1,male,45.0,0,0,35.5000,S,First,,Southampton,1
841,841,841,0,2,male,16.0,0,0,10.5000,S,Second,,Southampton,1
442,442,442,0,3,male,25.0,1,0,7.7750,S,Third,,Southampton,0
815,815,815,0,1,male,,0,0,0.0000,S,First,B,Southampton,1


In [None]:
def split_data(df):
    train_validate, test = train_test_split(df, test_size=.2, random_state=123, stratify=df.survived)
    train_iris, validate_iris = train_test_split(train_validate, 
                                       test_size=.3, 
                                       random_state=123, 
                                       stratify=train_validate.survived)
    return train, validate, test

# Decision Tree

# Using the titanic data, in your classification-exercises repository, create a notebook, decision_tree.ipynb where you will do the following:



# What is your baseline prediction? What is your baseline accuracy? remember: 
# your baseline prediction for a classification problem is predicting the most prevelant 
# class in the training dataset (the mode).
# When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [3]:
t_db = a.get_titanic_data()

In [5]:
import prepare as p
df = p.prep_titanic(t_db)

> What is your baseline prediction? What is your baseline accuracy?

In [14]:
df = df.drop(columns = ['Unnamed: 0'])

In [15]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,22.000000,1,0,7.2500,Southampton,0,1,0,1
1,1,1,female,38.000000,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,26.000000,0,0,7.9250,Southampton,1,0,0,1
3,1,1,female,35.000000,1,0,53.1000,Southampton,0,0,0,1
4,0,3,male,35.000000,0,0,8.0500,Southampton,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,Southampton,1,1,0,1
887,1,1,female,19.000000,0,0,30.0000,Southampton,1,0,0,1
888,0,3,female,29.699118,1,2,23.4500,Southampton,0,0,0,1
889,1,1,male,26.000000,0,0,30.0000,Cherbourg,1,1,0,0


In [17]:
df['survived'].value_counts()

0    549
1    342
Name: survived, dtype: int64

In [18]:
df['baseline'] = 0

In [None]:
titanic_data = df
for years_old in titanic_data['age']:
    round(years_old,2)
    print(years_old)

In [84]:
titanic_data = titanic_data.drop(columns=['sex','embark_town'] )
t_db.shape

(891, 14)

In [58]:
train_val, test = train_test_split(
titanic_data,
train_size=0.7,
random_state=1349,
stratify=titanic_data['survived']
)

train, validate = train_test_split(
train_val,
train_size = 0.3,
random_state=1349,
stratify=train_val['survived']

)

In [59]:
X_cols = train.columns.to_list()
y_cols = 'survived'
X_cols.remove('survived')



In [60]:
clf = DecisionTreeClassifier()


#what is the thing 
clf

DecisionTreeClassifier()

In [61]:
#fit the thing
clf.fit(train[X_cols], train[y_cols])

DecisionTreeClassifier()

In [72]:
model1_pred = clf.predict(train[X_cols])
model1_pred


array([1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 1])

In [80]:
sns.pairplot(df, corner=True)




<seaborn.axisgrid.PairGrid at 0x7fc724411040>