In [2]:
import numpy as np
import pandas as pd
# grab my custom stuff:
import acquire
# viz:
import matplotlib.pyplot as plt

In [3]:
# grab all three datasets:
# iris
iris = acquire.get_iris_data()
# titanic
titanic = acquire.get_titanic_data()
# telco
telco = acquire.get_telco_data()

Using the Iris Data:
    Use the function defined in acquire.py to load the iris data.

In [None]:
# done and done :)

Drop the species_id and measurement_id columns.

In [4]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   species_id    150 non-null    int64  
 1   species_name  150 non-null    object 
 2   sepal_length  150 non-null    float64
 3   sepal_width   150 non-null    float64
 4   petal_length  150 non-null    float64
 5   petal_width   150 non-null    float64
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [5]:
# I immediately saw measurement_id as redundant and 
# indicitive of a repeated index column,
# so I didnt select in the first place
# If you did a SELECT *, however, you would want to 
# drop that out at this point

In [6]:
# However, I still have a species_id

In [8]:
# we can observe with a crosstab that
# we have a 1:1 relationship
# between species_id and species_name
# species_name is the more useful of these two
# so i'm going to keep that
pd.crosstab(iris.species_id, iris.species_name)

species_name,setosa,versicolor,virginica
species_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,50,0,0
2,0,50,0
3,0,0,50


Species is the thing that I'm going to be predicting in the iris case.  When I'm talking about a target, it may or may not need to be encoded depending on the type of algorithm that I'm using

In [9]:
# reassign iris to the version of iris that does not
# include species id
iris = iris.drop(columns='species_id')

Rename the species_name column to just species.

In [12]:
# rename the column of species name to species
# reassign that value into iris
iris = iris.rename(columns={'species_name': 'species'})

    Create dummy variables of the species name and concatenate onto the iris dataframe. (This is for practice, we don't always have to encode the target, but if we used species as a feature, we would need to encode it).

In [16]:
# note: one-hot encoding not the best for a target variable
# sometimes they need encoding, but we would generally want
# to keep them in a single series.
# for the sake of the exercise:
pd.concat([iris,
 pd.get_dummies(iris['species'], drop_first=True)],
         axis=1)

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,versicolor,virginica
0,setosa,5.1,3.5,1.4,0.2,0,0
1,setosa,4.9,3.0,1.4,0.2,0,0
2,setosa,4.7,3.2,1.3,0.2,0,0
3,setosa,4.6,3.1,1.5,0.2,0,0
4,setosa,5.0,3.6,1.4,0.2,0,0
...,...,...,...,...,...,...,...
145,virginica,6.7,3.0,5.2,2.3,0,1
146,virginica,6.3,2.5,5.0,1.9,0,1
147,virginica,6.5,3.0,5.2,2.0,0,1
148,virginica,6.2,3.4,5.4,2.3,0,1


    Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [17]:
def prep_iris(iris):
    '''
    prep_iris will take in a single pandas DataFrame
    that will presumably match the columns and shape
    that we expect from our acquire module's get_iris_data()
    functional return
    
    args: iris, a single pandas dataframe
    return: a cleaned single iris dataframe
    '''
    if 'measurement_id' in iris.columns:
        dropcols = ['species_id','measurement_id']
    else:
        dropcols = ['species_id']
    return iris.drop(
        columns=dropcols).rename(
        columns={'species_name': 'species'})

In [19]:
# prep_iris(some acquired uncleaned iris dataset)

In [26]:
def wrangle_iris():
    return prep_iris(acquire.get_iris_data())

In [31]:
# inside a wrangle.py:
# we could make a dictionary
# that holds the values of each
# wrangle function call we construct
# with an associated string value
# This allows us to further modularize our
# scripts without needing to call separate functions!
my_functs = {'iris': wrangle_iris,
              'titanic': 'wrangle_titanic',
              'telco': 'wrangle_telco'}
def wrangle_data(some_string):
    '''
    wrangle data requires a single string argument
    of the expected options iris, titanic, or telco.
    it will then reference this string value as the key
    associated with values that hold a wrangle function
    '''
    return my_functs[some_string]()

In [32]:
new_df = wrangle_iris()

In [36]:
new_df = wrangle_data('iris')

In [34]:
new_df

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2
...,...,...,...,...,...
145,virginica,6.7,3.0,5.2,2.3
146,virginica,6.3,2.5,5.0,1.9
147,virginica,6.5,3.0,5.2,2.0
148,virginica,6.2,3.4,5.4,2.3


In [22]:
df = acquire.get_iris_data()
iris = prep_iris(df)

In [23]:
iris

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2
...,...,...,...,...,...
145,virginica,6.7,3.0,5.2,2.3
146,virginica,6.3,2.5,5.0,1.9
147,virginica,6.5,3.0,5.2,2.0
148,virginica,6.2,3.4,5.4,2.3


Using the Titanic dataset

    Use the function defined in acquire.py to load the Titanic data.

    Drop any unnecessary, unhelpful, or duplicated columns.

    Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.

    Create a function named prep_titanic that accepts the raw titanic data, and returns the data with the transformations above applied.

In [53]:
def split_titanic_data(df, target='survived'):
    '''
    split titanic data will split data based on 
    the values present in a cleaned version of titanic
    that is from clean_titanic
    
    '''
    train_val, test = train_test_split(df,
                                   train_size=0.8,
                                   random_state=1349,
                                   stratify=df[target])
    train, validate = train_test_split(train_val,
                                   train_size=0.7,
                                   random_state=1349,
                                   stratify=train_val[target])
    return train, validate, test

In [90]:
from sklearn.model_selection import train_test_split

In [110]:
def clean_titanic(df):
    '''
    clean titanic will take in a single pandas dataframe
    and will proceed to drop redundant columns
    and nonuseful information
    in addition to addressing null values
    and encoding categorical variables
    '''
    #drop out any redundant, excessively empty, or bad columns
    df = df.drop(columns=['passenger_id','embarked','deck','class'])
    # impute average age and most common embark_town:
    train, validate, test = split_titanic_data(df)
    my_age_imputer = SimpleImputer(strategy='mean')
    my_age_imputer.fit(train[['age']])
    train.loc[:,'age'] = my_age_imputer.transform(train[['age']])
    validate.loc[:,'age'] = my_age_imputer.transform(validate[['age']])
    test.loc[:,'age'] = my_age_imputer.transform(test[['age']])                                              
    my_embark_imputer = SimpleImputer(strategy='most_frequent')
    my_embark_imputer.fit(train[['embark_town']])
    train.loc[:,'embark_town'] = my_embark_imputer.transform(train[['embark_town']])
    validate.loc[:,'embark_town'] = my_embark_imputer.transform(validate[['embark_town']])
    test.loc[:,'embark_town'] = my_embark_imputer.transform(test[['embark_town']])
    # encode categorical values:                                      
    train = pd.concat(
    [train, pd.get_dummies(train[['sex', 'embark_town']],
                        drop_first=True)], axis=1)
    validate = pd.concat(
    [validate, pd.get_dummies(validate[['sex', 'embark_town']],
                        drop_first=True)], axis=1)
    test = pd.concat(
    [test, pd.get_dummies(test[['sex', 'embark_town']],
                        drop_first=True)], axis=1)                                                  
    return train, validate, test

In [101]:
titanic.columns

Index(['passenger_id', 'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch',
       'fare', 'embarked', 'class', 'deck', 'embark_town', 'alone'],
      dtype='object')

In [102]:
titanic.loc[:,'embark_town']

0      Southampton
1        Cherbourg
2      Southampton
3      Southampton
4      Southampton
          ...     
886    Southampton
887    Southampton
888    Southampton
889      Cherbourg
890     Queenstown
Name: embark_town, Length: 891, dtype: object

In [111]:
clean_titanic(acquire.get_titanic_data())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


(     survived  pclass     sex       age  sibsp  parch     fare  embark_town  \
 474         0       3  female  22.00000      0      0   9.8375  Southampton   
 370         1       1    male  25.00000      1      0  55.4417    Cherbourg   
 573         1       3  female  30.53944      0      0   7.7500   Queenstown   
 110         0       1    male  47.00000      0      0  52.0000  Southampton   
 167         0       3  female  45.00000      1      4  27.9000  Southampton   
 ..        ...     ...     ...       ...    ...    ...      ...          ...   
 735         0       3    male  28.50000      0      0  16.1000  Southampton   
 163         0       3    male  17.00000      0      0   8.6625  Southampton   
 770         0       3    male  24.00000      0      0   9.5000  Southampton   
 196         0       3    male  30.53944      0      0   7.7500   Queenstown   
 94          0       3    male  59.00000      0      0   7.2500  Southampton   
 
      alone  sex_male  embark_town_Que

In [37]:
from sklearn.impute import SimpleImputer

In [49]:
# sklearn 101 process:
# make the thing
# titanic
# we invoke SimpleImputer, and put it into a variable
my_imputer = SimpleImputer(strategy='mean')
# fit the thing
# fit computes the mean inside of titanic's age
# and stores that value in the object
my_imputer.fit(titanic[['age']])
# use the thing
# transform will use that stored fit value
# to make any fills to our missing data
# titanic['age'] = my_imputer.transform(titanic[['age']])

SimpleImputer()

In [52]:
# sklearn 101 process:
# make the thing
# titanic
# we invoke SimpleImputer, and put it into a variable
my_imputer = SimpleImputer(strategy='most_frequent')
# fit the thing
# fit computes the mean inside of titanic's age
# and stores that value in the object
my_imputer.fit(titanic[['embark_town']])
# use the thing
# transform will use that stored fit value
# to make any fills to our missing data
# my_imputer.transform(titanic[['embark_town']])

SimpleImputer(strategy='most_frequent')

In [85]:
def prep_titanic(df):
    df = clean_titanic(df)
    return split_titanic_data(df)

Using the Telco dataset

    Use the function defined in acquire.py to load the Telco data.

    Drop any unnecessary, unhelpful, or duplicated columns. This could mean dropping foreign key columns but keeping the corresponding string values, for example.

    Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.

    Create a function named prep_telco that accepts the raw telco data, and returns the data with the transformations above applied.

Split your data

    Write a function to split your data into train, test and validate datasets. Add this function to prepare.py.

    Run the function in your notebook on the Iris dataset, returning 3 datasets, train_iris, validate_iris and test_iris.

    Run the function on the Titanic dataset, returning 3 datasets, train_titanic, validate_titanic and test_titanic.

    Run the function on the Telco dataset, returning 3 datasets, train_telco, validate_telco and test_telco.

In [56]:
telco.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [57]:
telco.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   gender                    7043 non-null   object 
 5   senior_citizen            7043 non-null   int64  
 6   partner                   7043 non-null   object 
 7   dependents                7043 non-null   object 
 8   tenure                    7043 non-null   int64  
 9   phone_service             7043 non-null   object 
 10  multiple_lines            7043 non-null   object 
 11  online_security           7043 non-null   object 
 12  online_backup             7043 non-null   object 
 13  device_protection         7043 non-null   object 
 14  tech_sup

In [60]:
# potential issues with this information:
# there appear to be no nulls
telco.isna().sum()[telco.isna().sum() > 0]

Series([], dtype: int64)

In [61]:
# things I want to do from this point:
# i want to establish which are my categorical features
# decide/elect how to encode which

In [64]:
telco.customer_id.nunique()

7043

In [72]:
telco.gender.nunique()

2

In [73]:
maybe_dont_encode = []
binaries = []
categorical_cols = []
for col in telco.columns:
    if telco[col].dtype == 'O':
        if telco[col].nunique() > 5:
            maybe_dont_encode.append(col)
        elif telco[col].nunique() == 2:
            binaries.append(col)
        else:
            categorical_cols.append(col)

In [74]:
maybe_dont_encode

['customer_id', 'total_charges']

In [75]:
categorical_cols

['multiple_lines',
 'online_security',
 'online_backup',
 'device_protection',
 'tech_support',
 'streaming_tv',
 'streaming_movies',
 'contract_type',
 'internet_service_type',
 'payment_type']

In [76]:
binaries

['gender',
 'partner',
 'dependents',
 'phone_service',
 'paperless_billing',
 'churn']

In [77]:
telco.gender.unique()

array(['Female', 'Male'], dtype=object)

In [82]:
telco['is_female'] = telco.gender.map(
    {'Female': 1, 'Male': 0})

In [81]:
pd.get_dummies(telco[['gender']], drop_first=True)

Unnamed: 0,gender
0,1
1,0
2,0
3,0
4,1
...,...
7038,1
7039,0
7040,0
7041,0


In [83]:
pd.get_dummies(telco[categorical_cols], drop_first=True)

Unnamed: 0,multiple_lines_No phone service,multiple_lines_Yes,online_security_No internet service,online_security_Yes,online_backup_No internet service,online_backup_Yes,device_protection_No internet service,device_protection_Yes,tech_support_No internet service,tech_support_Yes,...,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
0,0,0,0,0,0,1,0,0,0,1,...,1,0,0,1,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
3,0,0,0,0,0,1,0,1,0,0,...,1,0,1,0,0,1,0,0,1,0
4,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,0,0,1,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
7039,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
7040,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7041,0,0,0,1,0,0,0,1,0,1,...,0,0,1,0,1,0,0,0,0,1


In [119]:
def prep_telco(telco):
    telco['total_charges'] = (telco.total_charges + '0').astype(float)
    telco = telco.drop(columns=['internet_service_type_id', 'contract_type_id', 'payment_type_id'])

    telco['gender_encoded'] = telco.gender.map({'Female': 1, 'Male': 0})
    telco['partner_encoded'] = telco.partner.map({'Yes': 1, 'No': 0})
    telco['dependents_encoded'] = telco.dependents.map({'Yes': 1, 'No': 0})
    telco['phone_service_encoded'] = telco.phone_service.map({'Yes': 1, 'No': 0})
    telco['paperless_billing_encoded'] = telco.paperless_billing.map({'Yes': 1, 'No': 0})
    telco['churn_encoded'] = telco.churn.map({'Yes': 1, 'No': 0})
    
    dummy_df = pd.get_dummies(telco[['multiple_lines', \
                              'online_security', \
                              'online_backup', \
                              'device_protection', \
                              'tech_support', \
                              'streaming_tv', \
                              'streaming_movies', \
                              'contract_type', \
                              'internet_service_type', \
                              'payment_type'
                            ]],
                              drop_first=True)
    telco = pd.concat( [telco, dummy_df], axis=1 )
    
    return telco

In [120]:
prep_telco(acquire.get_telco_data())

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,No,Yes,...,1,0,0,1,0,0,0,0,0,1
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,No,...,0,0,1,0,0,0,0,0,0,1
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,No,No,...,0,0,0,0,0,1,0,0,1,0
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,No,Yes,...,1,0,1,0,0,1,0,0,1,0
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,No,No,...,1,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,9987-LUTYD,Female,0,No,No,13,Yes,No,Yes,No,...,0,0,0,1,0,0,0,0,0,1
7039,9992-RRAMN,Male,0,Yes,No,22,Yes,Yes,No,No,...,0,0,1,0,0,1,0,0,1,0
7040,9992-UJOEL,Male,0,No,No,2,Yes,No,No,Yes,...,0,0,0,0,0,0,0,0,0,1
7041,9993-LHIEB,Male,0,Yes,Yes,67,Yes,No,Yes,No,...,0,0,1,0,1,0,0,0,0,1
