In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data, get_iris_data

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 0 to 149
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   measurement_id  150 non-null    int64  
 1   sepal_length    150 non-null    float64
 2   sepal_width     150 non-null    float64
 3   petal_length    150 non-null    float64
 4   petal_width     150 non-null    float64
 5   species_id      150 non-null    int64  
 6   species_id.1    150 non-null    int64  
 7   species_name    150 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 10.5+ KB
measurement_id
sepal_length
sepal_width
petal_length
petal_width
species_id
species_id.1
species_name


In [3]:
# Load the iris data through the acquire helper and preview the first rows.
iris = get_iris_data()
iris.head()

Unnamed: 0.1,Unnamed: 0,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_id,species_id.1,species_name
0,0,1,5.1,3.5,1.4,0.2,1,1,setosa
1,1,2,4.9,3.0,1.4,0.2,1,1,setosa
2,2,3,4.7,3.2,1.3,0.2,1,1,setosa
3,3,4,4.6,3.1,1.5,0.2,1,1,setosa
4,4,5,5.0,3.6,1.4,0.2,1,1,setosa


In [4]:
# Remove the species_id column; the species_id.1 column is left in place.
iris = iris.drop('species_id', axis=1)
iris.head(2)

Unnamed: 0.1,Unnamed: 0,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_id.1,species_name
0,0,1,5.1,3.5,1.4,0.2,1,setosa
1,1,2,4.9,3.0,1.4,0.2,1,setosa


In [5]:
# Shorten the species_name column label to species.
iris = iris.rename({'species_name': 'species'}, axis='columns')
iris.head(2)

Unnamed: 0.1,Unnamed: 0,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_id.1,species
0,0,1,5.1,3.5,1.4,0.2,1,setosa
1,1,2,4.9,3.0,1.4,0.2,1,setosa


In [6]:
# One indicator column per species; drop_first=True omits the first category
# so the remaining columns are not redundant with each other.
species_dummies = pd.get_dummies(iris['species'], drop_first=True)
species_dummies.head(3)

Unnamed: 0,versicolor,virginica
0,0,0
1,0,0
2,0,0


In [7]:
# Append the indicator columns to the right-hand side of the iris frame.
iris = pd.concat((iris, species_dummies), axis='columns')
iris.head()

Unnamed: 0.1,Unnamed: 0,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_id.1,species,versicolor,virginica
0,0,1,5.1,3.5,1.4,0.2,1,setosa,0,0
1,1,2,4.9,3.0,1.4,0.2,1,setosa,0,0
2,2,3,4.7,3.2,1.3,0.2,1,setosa,0,0
3,3,4,4.6,3.1,1.5,0.2,1,setosa,0,0
4,4,5,5.0,3.6,1.4,0.2,1,setosa,0,0


In [11]:
def prep_iris(cached=True):
    '''
    This function acquires the iris data with get_iris_data() and prepares it.

    Preparation steps:
      - drop the species_id column
      - rename species_name to species
      - append dummy columns encoding species (first category dropped)

    NOTE(review): cached is accepted but is not passed on to get_iris_data()
    in this function, so it currently has no effect here -- confirm whether
    it should be forwarded.

    Returns the prepared iris df.
    '''

    # use my acquire function to read the data into a df
    raw = get_iris_data()

    # drop species_id, then give species_name its shorter label
    without_id = raw.drop(columns='species_id')
    labeled = without_id.rename(columns={'species_name': 'species'})

    # build the dummy columns for species
    dummies = pd.get_dummies(labeled['species'], drop_first=True)

    # attach the dummy columns to the right of the df and hand it back
    return pd.concat([labeled, dummies], axis=1)

In [12]:
iris = prep_iris()
# Seed the sample so the same rows are displayed on every run of the notebook.
iris.sample(7, random_state=123)

Unnamed: 0.1,Unnamed: 0,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_id.1,species,versicolor,virginica
60,60,61,5.0,2.0,3.5,1.0,2,versicolor,1,0
110,110,111,6.5,3.2,5.1,2.0,3,virginica,0,1
63,63,64,6.1,2.9,4.7,1.4,2,versicolor,1,0
149,149,150,5.9,3.0,5.1,1.8,3,virginica,0,1
31,31,32,5.4,3.4,1.5,0.4,1,setosa,0,0
51,51,52,6.4,3.2,4.5,1.5,2,versicolor,1,0
30,30,31,4.8,3.1,1.6,0.2,1,setosa,0,0


In [13]:
## titanic data
# Load the titanic data through the acquire helper and preview the first rows.
titanic = get_titanic_data()
titanic.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [14]:
## inspect the rows where embark_town is missing (display only, nothing is changed)
titanic[titanic['embark_town'].isna()]

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
61,61,61,1,1,female,38.0,0,0,80.0,,First,B,,1
829,829,829,1,1,female,62.0,0,0,80.0,,First,B,,1


In [15]:
# inspect the rows where embarked is missing (display only, nothing is changed)
titanic[titanic['embarked'].isna()]

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
61,61,61,1,1,female,38.0,0,0,80.0,,First,B,,1
829,829,829,1,1,female,62.0,0,0,80.0,,First,B,,1


In [41]:
# Boolean flag column: True where sex is 'male', False otherwise.
titanic['is_male'] = titanic['sex'].eq('male')

In [42]:
# The ~ operator inverts a boolean mask, so this keeps only the rows where
# embarked is NOT null.

titanic = titanic[~titanic['embarked'].isna()]
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    889 non-null    int64  
 1   passenger_id  889 non-null    int64  
 2   survived      889 non-null    int64  
 3   pclass        889 non-null    int64  
 4   sex           889 non-null    object 
 5   age           712 non-null    float64
 6   sibsp         889 non-null    int64  
 7   parch         889 non-null    int64  
 8   fare          889 non-null    float64
 9   embarked      889 non-null    object 
 10  class         889 non-null    object 
 11  embark_town   889 non-null    object 
 12  alone         889 non-null    int64  
 13  Q             889 non-null    uint8  
 14  S             889 non-null    uint8  
 15  is_male       889 non-null    bool   
dtypes: bool(1), float64(2), int64(7), object(4), uint8(2)
memory usage: 99.8+ KB


In [17]:
# Remove the deck column from the frame, then check the remaining columns.
titanic = titanic.drop('deck', axis=1)
titanic.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    889 non-null    int64  
 1   passenger_id  889 non-null    int64  
 2   survived      889 non-null    int64  
 3   pclass        889 non-null    int64  
 4   sex           889 non-null    object 
 5   age           712 non-null    float64
 6   sibsp         889 non-null    int64  
 7   parch         889 non-null    int64  
 8   fare          889 non-null    float64
 9   embarked      889 non-null    object 
 10  class         889 non-null    object 
 11  embark_town   889 non-null    object 
 12  alone         889 non-null    int64  
dtypes: float64(2), int64(7), object(4)
memory usage: 97.2+ KB


In [18]:
# Indicator columns for embarked; drop_first=True omits the first category.
titanic_dummies = pd.get_dummies(titanic.embarked, drop_first=True)
# Seed the sample so the same rows are displayed on every run of the notebook.
titanic_dummies.sample(10, random_state=123)

Unnamed: 0,Q,S
820,0,1
876,0,1
493,0,0
176,0,1
498,0,1
19,0,0
126,1,0
196,1,0
100,0,1
414,0,1


In [19]:
# Append the embarked indicator columns to the right-hand side of the frame.
titanic = pd.concat((titanic, titanic_dummies), axis='columns')
titanic.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,0,1
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,0,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,0,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,0,1
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,0,1


In [20]:
# Hold out 20% of the rows as test, stratified on survived.
train_validate, test = train_test_split(
    titanic,
    test_size=.2,
    random_state=123,
    stratify=titanic['survived'],
)

In [21]:
# Split the remaining rows again: 30% become validate, stratified on survived.
train, validate = train_test_split(
    train_validate,
    test_size=.3,
    random_state=123,
    stratify=train_validate['survived'],
)

In [23]:
# Report the (rows, columns) shape of each split from the manual two-step split.
print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')

train -> (497, 15)
validate -> (214, 15)
test -> (178, 15)


In [24]:
def titanic_split(df):
    '''
    This function splits the titanic data into train, validate, and test dfs.

    The split happens in two passes, both stratified on survived and both
    using random_state=123:
      1. 20% of df is held out as test
      2. 30% of the remaining rows become validate; the rest is train

    Returns train, validate, and test dfs, in that order.
    '''
    # first pass: carve off the test set
    remainder, test = train_test_split(
        df,
        test_size=.2,
        random_state=123,
        stratify=df['survived'],
    )

    # second pass: divide what is left into train and validate
    train, validate = train_test_split(
        remainder,
        test_size=.3,
        random_state=123,
        stratify=remainder['survived'],
    )

    return train, validate, test

In [25]:
# Redo the split with the helper function defined above.
train, validate, test = titanic_split(titanic)

In [26]:
# Report the (rows, columns) shape of each split returned by titanic_split.
print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')

train -> (497, 15)
validate -> (214, 15)
test -> (178, 15)


In [27]:
# Preview the first two rows of the train split.
train.head(2)

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0
337,337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0


In [28]:
# Create the imputer object; strategy='mean' fills missing values with the
# mean of the column it is fit on.

imputer = SimpleImputer(strategy = 'mean')

In [29]:
# Fit the imputer to train and transform.
# train comes out of train_test_split as a slice of titanic; take a copy first
# so the column assignment is made on an independent frame rather than on a
# slice (the pattern that triggers pandas' SettingWithCopyWarning).

train = train.copy()
train['age'] = imputer.fit_transform(train[['age']])

In [30]:
# quick check: count of missing values left in train's age column

train['age'].isnull().sum()

0

In [31]:
# Transform the validate and test df age columns with the imputer that was
# fit on train (no refitting here). Copy each frame first so the assignment
# is made on an independent frame rather than on a slice of another one.

validate = validate.copy()
test = test.copy()
validate['age'] = imputer.transform(validate[['age']])
test['age'] = imputer.transform(test[['age']])

In [32]:
def impute_mean_age(train, validate, test):
    '''
    This function imputes the mean of the age column into
    observations with missing values.

    The imputer is fit on train only; the value it learns there is reused
    to fill validate and test.

    Each df is copied before its age column is overwritten, so the dfs
    passed in by the caller are left unchanged and the assignment is never
    made on a slice of another df (the pattern that triggers pandas'
    SettingWithCopyWarning).

    Parameters:
        train, validate, test: dfs that each contain an age column.

    Returns transformed copies of train, validate, and test df.
    '''
    # create the imputer object with mean strategy
    imputer = SimpleImputer(strategy='mean')

    # work on independent copies instead of the caller's (possibly sliced) dfs
    train = train.copy()
    validate = validate.copy()
    test = test.copy()

    # fit on and transform age column in train
    train['age'] = imputer.fit_transform(train[['age']])

    # transform age column in validate with the imputer fit on train
    validate['age'] = imputer.transform(validate[['age']])

    # transform age column in test with the imputer fit on train
    test['age'] = imputer.transform(test[['age']])

    return train, validate, test

In [35]:
def prep_titanic(cached=True):
    '''
    This function acquires the titanic data with get_titanic_data() and
    prepares it for modeling.

    Preparation steps, in order:
      - drop rows where embarked is null
      - append dummy columns encoding embarked (first category dropped)
      - drop the deck column
      - split into train, validate, test with titanic_split()
      - impute missing age values with impute_mean_age()

    NOTE(review): cached is accepted but is not used anywhere in this
    function -- confirm whether it should be passed to get_titanic_data().

    Returns prepped train, validate, and test dfs
    '''
    # use my acquire function to read the data into a df
    raw = get_titanic_data()

    # keep only the rows that have an embarked value
    has_port = ~raw.embarked.isnull()
    known_port = raw[has_port]

    # encode embarked using dummy columns
    port_dummies = pd.get_dummies(known_port.embarked, drop_first=True)

    # join the dummy columns on, then drop the deck column
    prepared = pd.concat([known_port, port_dummies], axis=1).drop(columns='deck')

    # split into train, validate, test and impute mean age into each split
    train, validate, test = impute_mean_age(*titanic_split(prepared))

    return train, validate, test

In [36]:
# Run the full prep pipeline and unpack the three prepared splits.
train, validate, test = prep_titanic()

In [37]:
# Report the (rows, columns) shape of each split returned by prep_titanic.
print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')

train -> (497, 15)
validate -> (214, 15)
test -> (178, 15)
