In [80]:
from sklearn_pandas import DataFrameMapper, cross_val_score
import pandas as pd
import sklearn as sk
import numpy as np

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression, chi2, f_classif

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the training set, discard the free-text columns that are not used as
# features, and impute the gaps: Age with the column mean, Embarked with "S".
train = (
    pd.read_csv("train.csv")
    .drop(["Cabin", "Name", "Ticket"], axis=1)
    .assign(
        Age=lambda frame: frame["Age"].fillna(frame["Age"].mean()),
        Embarked=lambda frame: frame["Embarked"].fillna("S"),
    )
)

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [3]:
# Load the test set with the same columns dropped as for the training set,
# then fill any missing Age / Fare values with that column's own mean.
test = (
    pd.read_csv("test.csv")
    .drop(["Cabin", "Name", "Ticket"], axis=1)
    .assign(
        Age=lambda frame: frame["Age"].fillna(frame["Age"].mean()),
        Fare=lambda frame: frame["Fare"].fillna(frame["Fare"].mean()),
    )
)

test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [5]:
# Encoder for the categorical columns. PassengerId is passed through
# untouched (transformer None) so the encoded frame can be merged back onto
# the source frame; every other listed column gets its own LabelBinarizer.
binarized_columns = ["Sex", "Embarked"]

transformer = DataFrameMapper(
    [("PassengerId", None)]
    + [(column, sk.preprocessing.LabelBinarizer()) for column in binarized_columns],
    df_out=True,
)

In [8]:
# Fit the encoders on the training rows and get the encoded columns back as
# a DataFrame keyed by PassengerId.
train_encoded = transformer.fit_transform(train.copy())

# Join the encoded columns onto the original frame. "Sex" exists on both
# sides, so the merge suffixes it: Sex_x is the raw string column and Sex_y
# the binarized one. The raw categoricals and the join key are then dropped.
df_train = (
    train
    .merge(train_encoded, on="PassengerId")
    .drop(["Embarked", "Sex_x", "PassengerId"], axis=1)
)

df_train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_y,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,1,0,0,1
1,1,1,38.0,1,0,71.2833,0,1,0,0
2,1,3,26.0,0,0,7.925,0,0,0,1
3,1,1,35.0,1,0,53.1,0,0,0,1
4,0,3,35.0,0,0,8.05,1,0,0,1


In [9]:
# Encode the test rows with the encoders that were already fitted on the
# training data. Using transform() instead of fit_transform() keeps the
# category -> column mapping identical to df_train; re-fitting here would
# derive the mapping from whatever categories happen to occur in the test
# file, and the column layout could silently diverge from the training frame.
test_encoded = transformer.transform(test.copy())

# "Sex" exists on both sides of the merge, so it is suffixed: Sex_x is the
# raw string column and Sex_y the binarized one. The raw categoricals and the
# join key are dropped once the encoded columns are attached.
df_test = (
    test
    .merge(test_encoded, on="PassengerId")
    .drop(["Embarked", "Sex_x", "PassengerId"], axis=1)
)

df_test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_y,Embarked_C,Embarked_Q,Embarked_S
0,3,34.5,0,0,7.8292,1,0,1,0
1,3,47.0,1,0,7.0,0,0,0,1
2,2,62.0,0,0,9.6875,1,0,1,0
3,3,27.0,0,0,8.6625,1,0,0,1
4,3,22.0,1,1,12.2875,0,0,0,1


In [85]:
# Feature selection step: all candidate columns are handed to one
# SelectKBest as a single block, which keeps the 5 columns with the highest
# chi-squared score against the target supplied at fit time.
candidate_features = [
    "Pclass", "Age", "SibSp", "Parch", "Fare",
    "Sex_y", "Embarked_C", "Embarked_Q", "Embarked_S",
]
feature_selector = SelectKBest(chi2, k=5)

results = DataFrameMapper([(candidate_features, feature_selector)])

In [86]:
# Stacked model: gradient-boosted trees -> one-hot encoded leaf indices ->
# logistic regression.
n_estimators = 35
random_seed = 0  # pinned so re-running the cell rebuilds the same trees

# Select the features (the selector is fitted against the target here).
X_train = results.fit_transform(df_train, df_train["Survived"])
y_train = df_train["Survived"]

grd = GradientBoostingClassifier(n_estimators=n_estimators, random_state=random_seed)
grd_enc = OneHotEncoder()
grd_lm = LogisticRegression(max_iter=500)

grd.fit(X_train, y_train)

# apply() returns the leaf index each sample lands in, per tree; the trailing
# axis is sliced away with [:, :, 0] to leave a 2-D (samples x trees) array.
# Computed once and reused for both fitting the encoder and fitting the
# linear model.
train_leaves = grd.apply(X_train)[:, :, 0]

# NOTE(review): the trees and the linear model are fitted on the same rows;
# consider fitting them on disjoint splits to limit overfitting.
grd_enc.fit(train_leaves)
grd_lm.fit(grd_enc.transform(train_leaves), y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [87]:
# Push the test rows through the same fitted pipeline used for training:
# feature selection -> tree leaf indices -> one-hot encoding -> linear model.
test_features = results.transform(df_test)
test_leaves = grd.apply(test_features)[:, :, 0]

# Take the ids from the loaded test frame rather than a hard-coded
# range(892, 892 + 418), so the ids always match the rows actually scored.
# NOTE(review): relies on df_test keeping test's row order and row count
# (one merged row per PassengerId) - confirm ids are unique.
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"].values,
    "Survived": grd_lm.predict(grd_enc.transform(test_leaves)),
})

In [88]:
# Write the predictions without the index column. pandas writes straight to
# the path, so the CSV is not first rendered into one in-memory string and
# then re-written through a separate text-mode file handle.
submission.to_csv("submission_18.csv", index=False)