<img src="https://images3.memedroid.com/images/UPLOADED517/5e6ea1d669d72.jpeg" width="300"/>

In [1]:
import pandas as pd

df = pd.read_csv("data/titanic/train.csv")
print(df.shape)
df.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# 1st step - Select features and groundtruth from raw data

In [7]:
print(df["Survived"].value_counts())
print(df["Embarked"].value_counts())

# Ground Truth
y = df["Survived"]

# Features
X = df[["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked","Cabin"]]

0    549
1    342
Name: Survived, dtype: int64
S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [8]:
X["Cabin"].value_counts()

G6             4
B96 B98        4
C23 C25 C27    4
F33            3
E101           3
              ..
D50            1
E77            1
A23            1
C106           1
B50            1
Name: Cabin, Length: 147, dtype: int64

# 2nd step - Clean data to numeric values

In [9]:
import numpy as np

# Clean the cabin feature
X["Cabin"] = np.where(X["Cabin"].isnull(),0,1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Cabin"] = np.where(X["Cabin"].isnull(),0,1)


In [10]:
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin
0,3,male,22.0,1,0,7.2500,S,0
1,1,female,38.0,1,0,71.2833,C,1
2,3,female,26.0,0,0,7.9250,S,0
3,1,female,35.0,1,0,53.1000,S,1
4,3,male,35.0,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S,0
887,1,female,19.0,0,0,30.0000,S,1
888,3,female,,1,2,23.4500,S,0
889,1,male,26.0,0,0,30.0000,C,1


In [11]:
# Convert categoric features to numeric data
X = pd.get_dummies(X,drop_first=True)
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.2500,0,1,0,1
1,1,38.0,1,0,71.2833,1,0,0,0
2,3,26.0,0,0,7.9250,0,0,0,1
3,1,35.0,1,0,53.1000,1,0,0,1
4,3,35.0,0,0,8.0500,0,1,0,1
...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,0,1,0,1
887,1,19.0,0,0,30.0000,1,0,0,1
888,3,,1,2,23.4500,0,0,0,1
889,1,26.0,0,0,30.0000,1,1,0,0


In [12]:
X.isnull().sum(axis=0)

Pclass          0
Age           177
SibSp           0
Parch           0
Fare            0
Cabin           0
Sex_male        0
Embarked_Q      0
Embarked_S      0
dtype: int64

In [13]:
# Fill age gaps with age mean
X["Age"].fillna(X["Age"].mean(),inplace=True)

In [14]:
# Check for null values
X.isnull().sum(axis=0)

Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
Cabin         0
Sex_male      0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [15]:
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
0,3,22.000000,1,0,7.2500,0,1,0,1
1,1,38.000000,1,0,71.2833,1,0,0,0
2,3,26.000000,0,0,7.9250,0,0,0,1
3,1,35.000000,1,0,53.1000,1,0,0,1
4,3,35.000000,0,0,8.0500,0,1,0,1
...,...,...,...,...,...,...,...,...,...
886,2,27.000000,0,0,13.0000,0,1,0,1
887,1,19.000000,0,0,30.0000,1,0,0,1
888,3,29.699118,1,2,23.4500,0,0,0,1
889,1,26.000000,0,0,30.0000,1,1,0,0


In [16]:
X["Age"].mean()

29.69911764705882

In [17]:
X["Pclass"].mean()

2.308641975308642

In [18]:
from sklearn.preprocessing import StandardScaler, Normalizer

scaler = StandardScaler()
result = pd.DataFrame(scaler.fit_transform(X),columns=X.columns)
display(result)
display(result.mean())
display(result.std())
np.linalg.norm(result.iloc[0])

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
0,0.827377,-0.592481,0.432793,-0.473674,-0.502445,-0.544925,0.737695,-0.307562,0.619306
1,-1.566107,0.638789,0.432793,-0.473674,0.786845,1.835115,-1.355574,-0.307562,-1.614710
2,0.827377,-0.284663,-0.474545,-0.473674,-0.488854,-0.544925,-1.355574,-0.307562,0.619306
3,-1.566107,0.407926,0.432793,-0.473674,0.420730,1.835115,-1.355574,-0.307562,0.619306
4,0.827377,0.407926,-0.474545,-0.473674,-0.486337,-0.544925,0.737695,-0.307562,0.619306
...,...,...,...,...,...,...,...,...,...
886,-0.369365,-0.207709,-0.474545,-0.473674,-0.386671,-0.544925,0.737695,-0.307562,0.619306
887,-1.566107,-0.823344,-0.474545,-0.473674,-0.044381,1.835115,-1.355574,-0.307562,0.619306
888,0.827377,0.000000,0.432793,2.008933,-0.176263,-0.544925,-1.355574,-0.307562,0.619306
889,-1.566107,-0.284663,-0.474545,-0.473674,-0.044381,1.835115,0.737695,-0.307562,-1.614710


Pclass       -2.031048e-16
Age           2.562796e-16
SibSp         3.456519e-16
Parch         6.716164e-17
Fare         -4.373606e-17
Cabin        -2.583044e-16
Sex_male     -4.059603e-16
Embarked_Q   -4.017238e-16
Embarked_S    5.632108e-17
dtype: float64

Pclass        1.000562
Age           1.000562
SibSp         1.000562
Parch         1.000562
Fare          1.000562
Cabin         1.000562
Sex_male      1.000562
Embarked_Q    1.000562
Embarked_S    1.000562
dtype: float64

1.737523124102104

In [19]:
from sklearn.pipeline import make_pipeline

pipeline = [
    StandardScaler(),
    Normalizer()
]

transformer = make_pipeline(*pipeline)

X_data = transformer.fit_transform(X)

X_data = pd.DataFrame(X_data, columns=X.columns)
X_data 

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
0,0.476182,-0.340991,0.249086,-0.272614,-0.289173,-0.313622,0.424567,-0.177012,0.356431
1,-0.455937,0.185969,0.125998,-0.137900,0.229073,0.534253,-0.394645,-0.089540,-0.470087
2,0.410278,-0.141158,-0.235316,-0.234884,-0.242412,-0.270216,-0.672199,-0.152513,0.307100
3,-0.525222,0.136805,0.145145,-0.158855,0.141099,0.615438,-0.454615,-0.103146,0.207695
4,0.489554,0.241367,-0.280786,-0.280270,-0.287763,-0.322429,0.436490,-0.181983,0.366440
...,...,...,...,...,...,...,...,...,...
886,-0.255012,-0.143404,-0.327630,-0.327028,-0.266960,-0.376220,0.509310,-0.212343,0.427574
887,-0.514503,-0.270488,-0.155899,-0.155613,-0.014580,0.602878,-0.445337,-0.101041,0.203457
888,0.301085,0.000000,0.157495,0.731058,-0.064143,-0.198300,-0.493298,-0.111923,0.225368
889,-0.505486,-0.091880,-0.153167,-0.152886,-0.014325,0.592312,0.238103,-0.099271,-0.521173


In [20]:
np.linalg.norm(X_data.iloc[0])

0.9999999999999999

In [21]:

from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# This class receives original dataframe and cleans the data
class CleanTitanic(TransformerMixin):
    def __init__(self, strategy=1):
        """
         strategy 1: use all possible features: ["Pclass","Sex","Age","SibSp","Parch","Fare","Cabin","Embarked"]
         strategy 2: only use basic features: ["Age","Cabin","Sex"]
        """
        if strategy == 1:
            self.features = ["Pclass","Sex","Age","SibSp","Parch","Fare","Cabin","Embarked"]
        elif strategy == 2:
            self.features = ["Age","Cabin","Sex"]
        else:
            raise ValueError("Invalid clean strategy")
    
    def fit(self, X):
        # Get only intereesting data 
        X = X[self.features]
        # Clean age feature and put the mean on missing values
        X["Age"].fillna(X["Age"].mean(), inplace=True)
        # Clean cabin data
        X["Cabin"] = np.where(X["Cabin"].isnull(),0,1)
        self.X = pd.get_dummies(X, drop_first=True)
        return self
    
    def transform(self, df):
        return self.X

In [74]:
from sklearn.decomposition import PCA

pipe = make_pipeline(*[
    CleanTitanic(strategy=2),
    StandardScaler(),
    Normalizer(),
    PCA(n_components=3)
])

X_ready = pipe.fit_transform(pd.read_csv("data/titanic/train.csv"))

X_ready.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Cabin"] = np.where(X["Cabin"].isnull(),0,1)


(891, 3)

In [75]:
X_ready

array([[-0.6680226 , -0.3820376 ,  0.26995142],
       [ 1.06843738,  0.52458119,  0.22975609],
       [ 0.83015564, -0.59351311, -0.33698824],
       ...,
       [ 0.85513513, -0.46223996, -0.47504148],
       [ 0.26531705,  0.62885616,  0.87838737],
       [-0.75860219,  0.14903779, -0.24579489]])

In [76]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_ready, y, test_size=0.2)

In [77]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(712, 3)
(179, 3)
(712,)
(179,)


In [78]:
print(X_train[:10])
print(y_train[:10])

[[ 0.27566704  0.74081678  0.79031831]
 [-0.41230558  0.57979519 -0.56472269]
 [-0.50167114 -0.52620092  0.44352156]
 [ 0.85836951 -0.3695271  -0.55199435]
 [ 0.29922396  1.06579381  0.12655666]
 [ 0.72179879 -0.76974045 -0.04494436]
 [-0.78908876  0.00789883 -0.12164879]
 [ 0.50303221 -0.83737678  0.31983958]
 [-0.76470719 -0.20794124  0.08501748]
 [-0.73559204  0.20735062 -0.29462095]]
55     1
818    0
220    1
657    0
124    0
807    0
760    0
381    1
207    1
130    0
Name: Survived, dtype: int64


In [79]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver="lbfgs")

model.fit(X_train, y_train)
print(model.coef_)

[[ 2.08579465 -0.0734751   0.54871502]]


In [80]:
y_pred = model.predict(X_test)
print(y_pred)

[1 0 0 1 1 1 0 1 0 0 0 1 1 0 0 0 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 0 1 1 0 0 0
 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 1 1 0 1 0 1 0 0 1 1 0 1 1 0
 1 0 0 1 1 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0
 1 1 1 1 1 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 1 1 0 1 0 0 0 1 1 0 1 1 1 0 0 0
 1 1 0 1 1 0 1 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 1 0]


In [83]:
result = pd.DataFrame({
    "y_pred":y_pred,
    "gt":y_test
})
result

Unnamed: 0,y_pred,gt
370,1.0,1
524,1.0,0
42,1.0,0
323,1.0,1
679,1.0,1
...,...,...
148,1.0,0
163,1.0,0
338,1.0,1
357,1.0,0


In [84]:
print(len(y_pred))
1-np.abs(y_pred-y_test).sum()/len(y_pred)

179


0.7541899441340782