In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
# Data Dictionary
# Variable	Definition	Key
# survival	Survival	0 = No, 1 = Yes
# pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
# sex	Sex	
# Age	Age in years	
# sibsp	# of siblings / spouses aboard the Titanic	
# parch	# of parents / children aboard the Titanic	
# ticket	Ticket number	
# fare	Passenger fare	
# cabin	Cabin number	
# embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton
# Variable Notes
# pclass: A proxy for socio-economic status (SES)
# 1st = Upper
# 2nd = Middle
# 3rd = Lower

# age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

# sibsp: The dataset defines family relations in this way...
# Sibling = brother, sister, stepbrother, stepsister
# Spouse = husband, wife (mistresses and fiancés were ignored)

# parch: The dataset defines family relations in this way...
# Parent = mother, father
# Child = daughter, son, stepdaughter, stepson
# Some children travelled only with a nanny, therefore parch=0 for them.

In [4]:
# laod the data

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [5]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
train.Ticket.value_counts()

Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 681, dtype: int64

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [10]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [11]:
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [12]:
train["Embarked"].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [13]:
#  keep Cabin column but mark which rows don't have cabin values
train["HasCabin"] = train["Cabin"].notna().astype(int)
test["HasCabin"] = test["Cabin"].notna().astype(int)

train["Embarked"] = train["Embarked"].fillna("S")
test["Embarked"] = test["Embarked"].fillna("S")


In [14]:
print(train["Age"].median())
print(test["Age"].median())


28.0
27.0


In [15]:
train_age_median = train["Age"].median()

train['Age'] = train['Age'].fillna(train_age_median)
test['Age'] = test['Age'].fillna(train_age_median)



In [16]:
train['Age'].isna().sum(), test['Age'].isna().sum(), 

(np.int64(0), np.int64(0))

In [17]:
# Lets ensure there are no more nan values
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
HasCabin         0
dtype: int64

In [18]:
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
HasCabin         0
dtype: int64

In [19]:
train_median_fare = train["Fare"].median()
test['Fare'] = test['Fare'].fillna(train_median_fare)

In [20]:
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
HasCabin         0
dtype: int64

In [21]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'HasCabin'],
      dtype='object')

In [22]:
train = train.drop(columns=["Cabin", "Name", "Ticket"])
test = test.drop(columns=["Cabin", "Name", "Ticket"])


In [23]:
train.dtypes


PassengerId      int64
Survived         int64
Pclass           int64
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked        object
HasCabin         int64
dtype: object

In [24]:
train["Sex"].value_counts(), train["Embarked"].value_counts()

(Sex
 male      577
 female    314
 Name: count, dtype: int64,
 Embarked
 S    646
 C    168
 Q     77
 Name: count, dtype: int64)

In [25]:
y = train['Survived']
X = train.drop(columns=['Survived'])


In [26]:
X = pd.get_dummies(X, columns=['Sex', 'Embarked'], drop_first=True)
X_test = pd.get_dummies(test, columns=['Sex', 'Embarked'], drop_first=True)

In [27]:
X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)

In [28]:
X.shape, X_test.shape


((891, 10), (418, 10))

In [29]:
X

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,HasCabin,Sex_male,Embarked_Q,Embarked_S
0,1,3,22.0,1,0,7.2500,0,True,False,True
1,2,1,38.0,1,0,71.2833,1,False,False,False
2,3,3,26.0,0,0,7.9250,0,False,False,True
3,4,1,35.0,1,0,53.1000,1,False,False,True
4,5,3,35.0,0,0,8.0500,0,True,False,True
...,...,...,...,...,...,...,...,...,...,...
886,887,2,27.0,0,0,13.0000,0,True,False,True
887,888,1,19.0,0,0,30.0000,1,False,False,True
888,889,3,28.0,1,2,23.4500,0,False,False,True
889,890,1,26.0,0,0,30.0000,1,True,False,False


In [30]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [31]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=30,
    random_state=42,
    n_jobs=1
)

rf.fit(X_train, y_train)

0,1,2
,n_estimators,30
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [32]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support

val_pred = rf.predict(X_val)

accuracy_score(y_true=y_val, y_pred=val_pred), confusion_matrix(y_true=y_val, y_pred=val_pred), precision_recall_fscore_support(y_true=y_val, y_pred=val_pred)


(0.7653631284916201,
 array([[92, 18],
        [24, 45]]),
 (array([0.79310345, 0.71428571]),
  array([0.83636364, 0.65217391]),
  array([0.81415929, 0.68181818]),
  array([110,  69])))

In [33]:
import pandas as pd

fi = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
fi

Sex_male       0.244897
PassengerId    0.178675
Fare           0.178609
Age            0.171663
Pclass         0.080155
HasCabin       0.050916
SibSp          0.036394
Parch          0.027858
Embarked_S     0.022160
Embarked_Q     0.008673
dtype: float64

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

rf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

val_pred = rf.predict(X_val)
accuracy_score(y_val, val_pred), confusion_matrix(y_val, val_pred)


(0.7821229050279329,
 array([[92, 18],
        [21, 48]]))

In [35]:
X["FamilySize"] = X["SibSp"] + X["Parch"] + 1
X["IsAlone"] = (X["FamilySize"] == 1).astype(int)

X_test["FamilySize"] = X_test["SibSp"] + X_test["Parch"] + 1
X_test["IsAlone"] = (X_test["FamilySize"] == 1).astype(int)


In [36]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

rf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

val_pred = rf.predict(X_val)
accuracy_score(y_val, val_pred), confusion_matrix(y_val, val_pred)


(0.7988826815642458,
 array([[95, 15],
        [21, 48]]))

In [37]:
candidates = [
    {"max_depth": None, "min_samples_leaf": 1},
    {"max_depth": 6, "min_samples_leaf": 1},
    {"max_depth": 6, "min_samples_leaf": 2},
    {"max_depth": 8, "min_samples_leaf": 1},
    {"max_depth": 8, "min_samples_leaf": 2},
]

results = []
for params in candidates:
    rf = RandomForestClassifier(
        n_estimators=500,
        random_state=42,
        n_jobs=-1,
        **params
    )
    rf.fit(X_train, y_train)
    pred = rf.predict(X_val)
    results.append((params, accuracy_score(y_val, pred)))

sorted(results, key=lambda x: -x[1])

[({'max_depth': 6, 'min_samples_leaf': 2}, 0.8212290502793296),
 ({'max_depth': 6, 'min_samples_leaf': 1}, 0.8156424581005587),
 ({'max_depth': 8, 'min_samples_leaf': 1}, 0.7932960893854749),
 ({'max_depth': None, 'min_samples_leaf': 1}, 0.7877094972067039),
 ({'max_depth': 8, 'min_samples_leaf': 2}, 0.7877094972067039)]

In [38]:
best_rf = RandomForestClassifier(
    n_estimators=500,
    random_state=42,
    n_jobs=-1,
    max_depth=6,
    min_samples_leaf=2
)

best_rf.fit(X, y)
test_pred = best_rf.predict(X_test)


In [39]:
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": test_pred
})
submission.to_csv("submission.csv", index=False)

submission.head()


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [40]:
import xgboost as xgb
xgb.__version__


'3.1.2'

In [41]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold

In [42]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_scores = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), start=1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    model = XGBClassifier(
        n_estimators=800,
        learning_rate=0.05,
        max_depth=3,
        random_state=42,
        n_jobs=-1,
        eval_metric="logloss",
    )

    model.fit(X_tr, y_tr)
    pred = model.predict(X_va)
    acc = accuracy_score(y_va, pred)
    fold_scores.append(acc)
    print(f"fold {fold}: {acc:.4f}")

print("mean:", np.mean(fold_scores), "std:", np.std(fold_scores))


fold 1: 0.8212
fold 2: 0.8202
fold 3: 0.7865
fold 4: 0.8146
fold 5: 0.8427
mean: 0.8170547988199107 std: 0.018019629840780912


In [43]:
# # compare to auto gluon

# from autogluon.tabular import TabularPredictor
# import pandas as pd

# # laod the data

# train_df = pd.read_csv("data/train.csv")
# test_df = pd.read_csv("data/test.csv")

# predictor = TabularPredictor(label="Survived", eval_metric="accuracy").fit(train_df, presets="best_quality", time_limit=300)
# pred = predictor.predict(test_df)

# submission = pd.DataFrame({"PassengerId": test_df["PassengerId"], "Survived": pred.astype(int)})
# submission.to_csv("submission.csv", index=False)
# submission.head()
