In [1]:
import pandas as pd

In [2]:
train_df = pd.read_csv("datas/train.csv")
test_df = pd.read_csv("datas/test.csv")
df = pd.concat((train_df.loc[:, 'Pclass':'Embarked'], test_df.loc[:, 'Pclass':'Embarked']), ignore_index=True)
df.isnull().sum()

Pclass         0
Name           0
Sex            0
Age          263
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin       1014
Embarked       2
dtype: int64

# Fill missing values

In [3]:
df['Age'].fillna(df['Age'].median(), inplace=True)

In [4]:
df['Embarked'].value_counts(dropna=False)

S      914
C      270
Q      123
NaN      2
Name: Embarked, dtype: int64

In [5]:
df['Embarked'].fillna('S', inplace=True)

In [6]:
df['Fare'].fillna(df['Fare'].median(), inplace=True)

# Add features

In [7]:
def fare_categorized(x):
    if x <= 7.9:
        return 0
    elif 7.9 < x <= 14.45:
        return 1
    elif 14.45 < x <= 31.28:
        return 2
    else:
        return 3
df['Fare_cat'] = df['Fare']
df['Fare_cat'] = df['Fare_cat'].map(fare_categorized)

In [8]:
import re
df['Cabin'].fillna('U', inplace=True)
df['Cabin'] = df['Cabin'].map(lambda x: re.compile("[a-zA-Z]+").search(x).group(0))

In [9]:
def age_five(x):
    t = (x // 10) 
    o = 1 if (x % 10) >=5 else 0
    return t * 2 + o
def age_decade(x):
    return x // 10
df['Age_decade'] = df['Age']
df['Age_decade'] = df['Age_decade'].map(age_decade)
df['Age_five'] = df['Age']
df['Age_five'] = df['Age_decade'].map(age_five)

In [10]:
df.drop('Ticket', axis=1, inplace=True)
df.drop('Name', axis=1, inplace=True)

In [11]:
df['Sex'] = df['Sex'].map({'female': 0, 'male': 1})

In [12]:
df['Cabin'].value_counts(dropna=False)

U    1014
C      94
B      65
D      46
E      41
A      22
F      21
G       5
T       1
Name: Cabin, dtype: int64

In [13]:
df['Cabin'] = df['Cabin'].map({'U': 0, 'A':1, 'B': 2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'T':8})

In [14]:
df['Embarked'].value_counts()

S    916
C    270
Q    123
Name: Embarked, dtype: int64

In [15]:
df['Embarked'] = df['Embarked'].map({'C': 0, 'S': 1, 'Q': 2})

# Training

In [16]:
X_train = df[:train_df.shape[0]]
y_train = train_df['Survived']
X_test = df[train_df.shape[0]:]

In [17]:
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, stratify=y_train, random_state=1234)

In [18]:
from sklearn import metrics
def print_metrics(pred, gt):
    print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(gt, pred))) 
    print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(gt, pred)))
    print('Precision : {0:0.5f}'.format(metrics.precision_score(gt, pred)))
    print('Recall : {0:0.5f}'.format(metrics.recall_score(gt, pred)))
    print('F1 : {0:0.5f}'.format(metrics.f1_score(gt, pred)))
    
def save_submission(pred, name):
    save = pd.DataFrame({"PassengerId": test_df['PassengerId'],
                   "Survived": pred.astype(int)})
    save.to_csv(f"outputs/submission_{name}.csv", index=False)

In [19]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=12, random_state=1234)
dt.fit(x_train,y_train)
dt_valid_pred = dt.predict(x_valid)
print_metrics(dt_valid_pred, y_valid) 

Accuracy :0.84444
AUC : 0.83117
Precision : 0.81818
Recall : 0.77143
F1 : 0.79412


In [20]:
dt_pred = dt.predict(X_test)
save_submission(dt_pred, 'dt')

In [21]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50,
                            max_depth=5,
                            max_samples=0.9,
                            random_state=1234)
rf.fit(x_train, y_train)
rf_val_pred = rf.predict(x_valid)
print_metrics(rf_val_pred, y_valid)

Accuracy :0.86667
AUC : 0.84935
Precision : 0.87097
Recall : 0.77143
F1 : 0.81818


In [22]:
rf_pred = rf.predict(X_test)
save_submission(rf_pred, 'rf')

In [23]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=5000,
                                 subsample=0.67,
                                 learning_rate=0.02,
                                 max_depth=4,
                                 validation_fraction=0.05,
                                 n_iter_no_change=10,
                                 verbose=0,
                                 random_state=1234)
gbc.fit(x_train, y_train)
gbc_val_pred = gbc.predict(x_valid)
print_metrics(gbc_val_pred, y_valid)

Accuracy :0.86667
AUC : 0.84416
Precision : 0.89655
Recall : 0.74286
F1 : 0.81250


In [24]:
gbc_pred = gbc.predict(X_test)
save_submission(gbc_pred, 'gbc')

In [26]:
from sklearn.svm import LinearSVC
svc = LinearSVC(loss='hinge',
                tol=1e-4, 
                C=0.5,
                max_iter=500000,
                random_state=1234)
svc.fit(x_train, y_train)
svc_val_pred = svc.predict(x_valid)
print_metrics(svc_val_pred, y_valid)

Accuracy :0.85556
AUC : 0.84026
Precision : 0.84375
Recall : 0.77143
F1 : 0.80597




In [27]:
svc_pred = svc.predict(X_test)
save_submission(svc_pred, 'svc')