In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# import splitting and imputing functions
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# turn off pink boxes for demo
import warnings
warnings.filterwarnings("ignore")

# import our own acquire module
import acquire

Using the Iris Data:

1. Use the function defined in acquire.py to load the iris data.

2. Drop the species_id and measurement_id columns.

3. Rename the species_name column to just species.

4. Create dummy variables of the species name and concatenate onto the iris dataframe. (This is for practice, we don't always have to encode the target, but if we used species as a feature, we would need to encode it).

5. Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [None]:
# Use the function defined in acquire.py to load the iris data.
df = acquire.get_iris_data()

In [None]:
# Drop the species_id and measurement_id columns.
df = df.drop(columns = 'species_id')

In [None]:
# Rename the species_name column to just species.
df = df.rename(columns= {'species_name':'species'})

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.info

In [None]:
df.describe()

In [None]:
num_cols = df.select_dtypes(include = 'number').columns
num_cols

In [None]:
for col in num_cols:
    plt.hist(df[col])
    plt.title(col)
    plt.show()

In [None]:
obj_cols = df.columns[[df[col].dtype == 'O' for col in df.columns]]
obj_cols

In [None]:
for col in obj_cols:
    print('/------------------------------\\')
    print(df[col].value_counts())
    print('\n')
    print(df[col].value_counts(normalize = True, dropna = False))
    print('\------------------------------/')

In [None]:
missing = df.isnull().sum()
missing[missing > 0]

In [None]:
print(df.shape)
df = df.drop_duplicates()
print(df.shape)

In [None]:
df.shape

In [None]:
df.head()

In [None]:
dummy_df = pd.get_dummies(df['species'], drop_first=False)
dummy_df

In [None]:
df = pd.concat([df, dummy_df], axis =1)
df

In [None]:
'''
Create a function named prep_iris that accepts 
the untransformed iris data, and 
returns the data with the transformations above applied.
'''
def prep_iris(df):
    
    #drop column speciess id
    df = df.drop(columns = 'species_id')
    
    #rename species name column
    df = df.rename(columns ={'species_name' : 'species'})
    
    #create dummiess dataframe
    dummy_df = pd.get_dummies(df['species'], drop_first=False)
    
    #concat dummy with DF
    df = pd.concat([df, dummy_df], axis = 1)
    
    return df

In [None]:
prep_iris(df)

Using the Titanic dataset

Use the function defined in acquire.py to load the Titanic data.

Drop any unnecessary, unhelpful, or duplicated columns.

Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.

Create a function named prep_titanic that accepts the raw titanic data, and returns the data with the transformations above applied.

In [None]:
df = acquire.get_titanic_data()

In [None]:
df.head()

In [None]:
missing = df.isnull().sum()
missing[missing > 0]

In [None]:
print(df.shape)
df = df.drop_duplicates()
print(df.shape)

In [None]:
df = df.drop(columns = ['age', 'deck', 'embarked', 'class'])
df.shape

In [None]:
df.head()

Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.

In [None]:
df_dummy = pd.get_dummies(df[['sex', 'embark_town']], dummy_na= False, drop_first=[True,True])
df_dummy

In [None]:
df = pd.concat([df, df_dummy], axis =1)
df.head()

In [None]:
def prep_titanic(df):
    df = df.drop_duplicates()
    df = df.drop(columns = ['age', 'deck', 'embarked', 'class'])
    df['embark_town'] = df.embark_town.fillna(value='Southampton')
    
    df_dummy = pd.get_dummies(df[['sex', 'embark_town']], dummy_na= False, drop_first=[True,True])
    df = pd.concat([df, df_dummy], axis= 1)
    
    return df

In [None]:
prep_titanic(df)

Using the Telco dataset

Use the function defined in acquire.py to load the Telco data.

Drop any unnecessary, unhelpful, or duplicated columns. This could mean dropping foreign key columns but keeping the corresponding string values, for example.

Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.

Create a function named prep_telco that accepts the raw telco data, and returns the data with the transformations above applied.

In [2]:
df = acquire.get_telco_data()
df.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [None]:
df.info()

In [None]:
missing = df.isnull().sum()
missing[missing > 0]

In [None]:
df = df.drop_duplicates()

In [None]:
df = df.drop(columns =['payment_type_id', 'internet_service_type_id', 'contract_type_id', 'customer_id'], axis = 1)
df.head()

In [None]:
df.head()

In [None]:
dummy_df = pd.get_dummies(df[['gender', 'partner', 'dependents', 'phone_service', 'multiple_lines','online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies', 'paperless_billing', 'churn', 'internet_service_type', 'payment_type']], dummy_na=False, drop_first=True)



In [None]:
dummy_df.value_counts

In [None]:
df = pd.concat([df, dummy_df], axis =1)

In [None]:
df.head()

In [4]:
def prep_telco(df):
    df = df.drop_duplicates()
    df = df.drop(columns =['payment_type_id', 'internet_service_type_id', 'contract_type_id', 'customer_id'], axis = 1)
    
    df['total_charges'] = df['total_charges'].str.strip()
    df = df[df.total_charges != '']
    
    df['total_charges'] = df.total_charges.astype(float)
    
    dummy_df = pd.get_dummies(df[['gender', 'partner', 'dependents', 'phone_service', 'multiple_lines','online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies', 'paperless_billing', 'churn', 'internet_service_type', 'payment_type']], dummy_na=False, drop_first=True)
    df = pd.concat([df, dummy_df], axis =1)
    
    return df
    

In [5]:
prep_telco(df)

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,...,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,paperless_billing_Yes,churn_Yes,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
0,Female,0,Yes,Yes,9,Yes,No,No,Yes,No,...,1,0,0,1,0,0,0,0,0,1
1,Male,0,No,No,9,Yes,Yes,No,No,No,...,0,0,1,0,0,0,0,0,0,1
2,Male,0,No,No,4,Yes,No,No,No,Yes,...,0,0,0,1,1,1,0,0,1,0
3,Male,1,Yes,No,13,Yes,No,No,Yes,Yes,...,1,0,1,1,1,1,0,0,1,0
4,Female,1,Yes,No,3,Yes,No,No,No,No,...,1,0,0,1,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Female,0,No,No,13,Yes,No,Yes,No,No,...,0,0,0,0,0,0,0,0,0,1
7039,Male,0,Yes,No,22,Yes,Yes,No,No,No,...,0,0,1,1,1,1,0,0,1,0
7040,Male,0,No,No,2,Yes,No,No,Yes,No,...,0,0,0,1,0,0,0,0,0,1
7041,Male,0,Yes,Yes,67,Yes,No,Yes,No,Yes,...,0,0,1,0,0,0,0,0,0,1
