Prepare Environment

In [1]:
import pandas as pd
import numpy as np
import acquire
# import warnings
# warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

Acquire Data

In [5]:
df = acquire.get_titanic_data()

Summarize

- info
- describe
- head/tail
- value_counts
- shape
- isnull

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
passenger_id    891 non-null int64
survived        891 non-null int64
pclass          891 non-null int64
sex             891 non-null object
age             714 non-null float64
sibsp           891 non-null int64
parch           891 non-null int64
fare            891 non-null float64
embarked        889 non-null object
class           891 non-null object
deck            203 non-null object
embark_town     889 non-null object
alone           891 non-null int64
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB


In [7]:
df.describe()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,445.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.602694
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,0.489615
min,0.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,222.5,0.0,2.0,20.125,0.0,0.0,7.9104,0.0
50%,445.0,0.0,3.0,28.0,0.0,0.0,14.4542,1.0
75%,667.5,1.0,3.0,38.0,1.0,0.0,31.0,1.0
max,890.0,1.0,3.0,80.0,8.0,6.0,512.3292,1.0


In [8]:
df.dtypes

passenger_id      int64
survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class            object
deck             object
embark_town      object
alone             int64
dtype: object

In [9]:
df.shape

(891, 13)

In [10]:
df.isnull().sum()

passenger_id      0
survived          0
pclass            0
sex               0
age             177
sibsp             0
parch             0
fare              0
embarked          2
class             0
deck            688
embark_town       2
alone             0
dtype: int64

In [11]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
passenger_id,891.0,445.0,257.353842,0.0,222.5,445.0,667.5,890.0
survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
sibsp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292
alone,891.0,0.602694,0.489615,0.0,0.0,1.0,1.0,1.0


In [12]:
# df.age.value_counts(sort=False).sort_index()
df.age.value_counts(bins=10, sort=True)

(16.336, 24.294]    177
(24.294, 32.252]    169
(32.252, 40.21]     118
(40.21, 48.168]      70
(0.339, 8.378]       54
(8.378, 16.336]      46
(48.168, 56.126]     45
(56.126, 64.084]     24
(64.084, 72.042]      9
(72.042, 80.0]        2
Name: age, dtype: int64

In [13]:
df.embarked.value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: embarked, dtype: int64

Prepare Data

- drop columns
- fillna
- split
- impute mean, mode, median: SimpleImputer
- integer encoding: LabelEncoder
- one hot encoding: OneHotEncoder
- scale

In [14]:
df.drop(columns=['deck'], inplace=True)

In [15]:
df.fillna(np.nan, inplace=True)

In [16]:
train, test = train_test_split(df, train_size=.8, random_state=123)



Impute mean, mode, median using SimpleImputer

In [17]:
train.embarked.value_counts(dropna=False)

S      515
C      128
Q       67
NaN      2
Name: embarked, dtype: int64

In [18]:
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

imp_mode.fit(train[['embarked']])

train['embarked'] = imp_mode.transform(train[['embarked']])

test['embarked'] = imp_mode.transform(test[['embarked']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [19]:
train.embarked.value_counts()

S    517
C    128
Q     67
Name: embarked, dtype: int64

In [20]:
imp_median = SimpleImputer(missing_values=np.nan, strategy = 'median')
train['age'] = imp_median.fit_transform(train[['age']])
train.age.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0

Encoding

1. integer encoding
1. one hot encoding

In [21]:
int_encoder = LabelEncoder()
int_encoder.fit(train.embarked)
train.embarked = int_encoder.transform(train.embarked)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [22]:
train.embarked.value_counts()

2    517
0    128
1     67
Name: embarked, dtype: int64

In [23]:
embarked_array = np.array(train.embarked)
embarked_array[0:5]

array([0, 1, 0, 1, 0])

In [24]:
embarked_array = embarked_array.reshape(len(embarked_array), 1)

In [26]:
embarked_array

array([[0],
       [1],
       [0],
       [1],
       [0],
       [2],
       [0],
       [2],
       [2],
       [2],
       [1],
       [2],
       [2],
       [2],
       [2],
       [1],
       [2],
       [2],
       [2],
       [0],
       [2],
       [2],
       [1],
       [2],
       [2],
       [2],
       [2],
       [1],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [0],
       [2],
       [1],
       [0],
       [0],
       [2],
       [0],
       [0],
       [2],
       [0],
       [1],
       [2],
       [1],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [0],
       [2],
       [0],
       [2],
       [2],
       [0],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [0],
       [2],
       [0],
       [1],
       [0],
       [2],
       [2],
    

In [25]:
ohe = OneHotEncoder(sparse=False, categories='auto')

In [None]:
embarked_ohe = ohe.fit_transform(embarked_array)
embarked_ohe

In [None]:
test.embarked = int_encoder.transform(test.embarked)

In [None]:
embarked_array = np.array(test.embarked).reshape(len(test.embarked), 1)

In [None]:
embarked_test_ohe = ohe.transform(embarked_array)

In [None]:
embarked_test_ohe[0:5]