<a href="https://www.kaggle.com/code/amirmotefaker/titanic-machine-learning-from-disaster?scriptVersionId=123442509" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from IPython.display import display

def display_tab(df, n=7):
    """Show a short preview of a DataFrame followed by its shape.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to preview.
    n : int, default 7
        Number of leading rows passed to ``df.head``. The default keeps the
        original seven-row preview.

    Returns
    -------
    None
        The preview is rendered with ``display`` and the shape is printed.
    """
    display(df.head(n))
    print("DataFrame shape: {}".format(df.shape))

In [None]:
# Read the Titanic training split into `df` and preview the first rows.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/train.csv")
display_tab(df=df)

In [None]:
# Number of missing entries per column, most incomplete columns first.
missing_counts = df.isna().sum()
display(missing_counts.sort_values(ascending=False))

# Summary statistics over every column (numeric and non-numeric alike);
# as the last expression of the cell it is rendered as the cell output.
df.describe(include='all')

In [None]:
# Work on an independent copy so the raw `df` stays untouched for later cells.
titanic = df.copy(deep=True)

In [None]:
# Dropping and removing NaN values
# The three columns are removed before dropna() so that their own missing
# values cannot cause rows to be discarded.
# errors='ignore' keeps the cell re-runnable: on a second execution the columns
# are already gone and a plain drop() would raise KeyError.
titanic = titanic.drop(columns=['PassengerId', 'Ticket', 'Cabin'], errors='ignore')
titanic = titanic.dropna()

In [None]:
# Transforming categorical values
# replace() (rather than map()) leaves values that are not keys of the mapping
# untouched, so re-running this cell on already-labelled columns is a no-op
# instead of turning every entry into NaN.
titanic['Survived'] = titanic['Survived'].replace({1: 'Survived', 0: 'Died'})
titanic['Pclass'] = titanic['Pclass'].replace({1: 'First', 2: 'Second', 3: 'Third'})

In [None]:
# Preview the cleaned, relabelled working frame.
display_tab(df=titanic)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Named xkcd colours installed as the default seaborn palette.
colors = ["vermillion", "windows blue", "leaf green", "iris", "amber", "greyish"]
xkcd_colors = sns.xkcd_palette(colors)
sns.set_palette(xkcd_colors)

# Pair plot of the working frame, coloured by the survival label.
pair_grid = sns.pairplot(data=titanic, hue='Survived')
display(pair_grid)

In [None]:
# Categorical columns to plot, each with the fixed order of its levels.
category_levels = {
    'Survived': ['Died', 'Survived'],
    'Sex': ['male', 'female'],
    'Pclass': ['First', 'Second', 'Third'],
    'Embarked': ['C', 'Q', 'S'],
}
# Parallel lists are kept because the following cell iterates over them too.
category_label = list(category_levels.keys())
category_order = list(category_levels.values())

# One panel per categorical column; bars are split by survival outcome.
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(11, 3))

for panel, column, levels in zip(axes, category_label, category_order):
    sns.countplot(data=titanic, x=column, order=levels, hue='Survived', ax=panel)

fig.tight_layout()

In [None]:
# Same four categorical columns as before, with the roles swapped: the x axis
# is the survival outcome and the bars are split by the category's levels.
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(11, 3))

for panel, (feature, levels) in zip(axes, zip(category_label, category_order)):
    sns.countplot(data=titanic, x='Survived', hue=feature, hue_order=levels, ax=panel)

fig.tight_layout()

### Supervised Machine Learning

In [None]:
# module imports
from patsy import dmatrices
import statsmodels.discrete.discrete_model as sm

# Candidate logistic-regression specifications: the full model first, then four
# variants that each leave out one predictor (Embarked, Fare, Parch, SibSp).
formulas = ['Survived ~ C(Pclass) + C(Sex) + Age + SibSp + Parch + Fare + C(Embarked)',
            'Survived ~ C(Pclass) + C(Sex) + Age + SibSp + Parch + Fare',
            'Survived ~ C(Pclass) + C(Sex) + Age + SibSp + Parch + C(Embarked)',
            'Survived ~ C(Pclass) + C(Sex) + Age + SibSp + Fare + C(Embarked)',
            'Survived ~ C(Pclass) + C(Sex) + Age + Parch + Fare + C(Embarked)']

# Fitted statsmodels results, in the same order as `formulas`.
models = []

for formula in formulas:
    # Expand the formula against the raw training frame into the response (y)
    # and design (X) matrices.
    y, X = dmatrices(formula, df, return_type = 'dataframe')
    model = sm.Logit(y, X).fit()
    models.append(model)
    print('Formula: {}\n{}\n'.format(formula, model.params))

In [None]:
# Load the Kaggle test split.
tdf = pd.read_csv("/kaggle/input/titanic/test.csv")

# Fill gaps in the numeric columns with the column mean.
# numeric_only=True is required: the formulas treat columns such as Sex and
# Embarked as categorical text, and since pandas 2.0 DataFrame.mean() raises
# TypeError on non-numeric columns instead of silently skipping them.
# Assigning the result (rather than inplace=True) keeps the step explicit.
tdf = tdf.fillna(tdf.mean(numeric_only=True))

# Placeholder response column: the 'Survived ~ ...' formulas are evaluated on
# this frame later, so the left-hand-side column has to exist. It is
# overwritten with real predictions before any submission is written.
tdf['Survived'] = 0.5
display_tab(tdf)

In [None]:
def submission(df, filename="submission.csv", path = "/kaggle/working/"):
    """Write a two-column Kaggle submission file from a prediction frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PassengerId`` and ``Survived``. ``Survived`` may hold
        probabilities or float labels; it is rounded to the nearest integer
        and stored as ``int``. The input frame is not modified.
    filename : str, default "submission.csv"
        Name of the CSV file to create.
    path : str, default "/kaggle/working/"
        Target directory, with or without a trailing separator.

    Raises
    ------
    KeyError
        If either required column is missing from ``df``.
    ValueError
        If ``Survived`` contains missing values, which cannot be cast to int.
    """
    import os  # local import so the helper does not depend on another cell

    res = df[['PassengerId', 'Survived']].copy()
    if res['Survived'].isna().any():
        raise ValueError("'Survived' contains missing values; cannot build a submission")

    res = res.round({'Survived': 0}).astype({'Survived': int})
    # os.path.join instead of string concatenation so a directory given without
    # a trailing slash still produces a valid file path.
    res.to_csv(os.path.join(path, filename), index=False)

In [None]:
# Score the test frame with every fitted statsmodels model and write one
# numbered submission file per model (numbering starts at 1).
for rank, (spec, fitted) in enumerate(zip(formulas, models), start=1):
    # Only the design matrix (second element) is needed for prediction.
    Xt = dmatrices(spec, tdf, return_type='dataframe')[1]
    tdf['Survived'] = fitted.predict(Xt)
    submission(tdf, "{}_logit_regression_submission.csv".format(rank))

In [None]:
from sklearn.linear_model import LogisticRegression
# sklearn output
# Build the train/test design matrices explicitly instead of relying on X, y
# and Xt leaking out of the statsmodels loops in earlier cells. formulas[-1]
# is the specification those leaked variables held after the loops finished,
# so the fitted model is the same, but the cell no longer depends on which
# loop ran last.
sk_formula = formulas[-1]
y, X = dmatrices(sk_formula, df, return_type='dataframe')
Xt = dmatrices(sk_formula, tdf, return_type='dataframe')[1]

# fit_intercept=False: the patsy design matrix is expected to carry its own
# intercept column already (NOTE(review): confirm via X.columns).
# C=1e9 makes the regularisation penalty negligible, presumably to mirror the
# unpenalised statsmodels fit.
model = LogisticRegression(solver='liblinear', fit_intercept = False, C = 1e9)
mdl = model.fit(X, y.values.ravel())

y_pred = mdl.predict(Xt)

tdf['Survived'] = y_pred
display_tab(tdf)

submission(tdf, "sk_logit_regression_submission.csv")

## scikit-learn classifiers

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import numpy as np

In [None]:
# Fixed seed so the estimators that use randomness give the same fit,
# accuracy and submission file on every Restart & Run All.
SEED = 42

# Display names, parallel to `classifiers` below (same order, same length).
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear"),
    SVC(kernel="rbf"),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(random_state=SEED),
    RandomForestClassifier(n_estimators=100, random_state=SEED),
    MLPClassifier(alpha=1, random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    GaussianNB()]

# Later cells pair the two lists with zip(), which silently truncates to the
# shorter one; fail loudly here if they ever get out of sync.
assert len(names) == len(classifiers), "names and classifiers must have the same length"

In [None]:
# Full specification shared by every scikit-learn classifier.
formula = 'Survived ~ C(Pclass) + C(Sex) + Age + SibSp + Parch + Fare + C(Embarked)'
response, X = dmatrices(formula, df, return_type = 'dataframe')
# Flatten the single-column response frame into the 1-D label array.
y = response.values.ravel()

# In-sample accuracy (percent) per classifier, in the order of `names`.
accuracies = []

for name, clf in zip(names, classifiers):
    # Fit on the full training design matrix ...
    clf.fit(X, y)
    # ... and score on the very same rows: this is training accuracy, not a
    # held-out estimate, so it is optimistic for flexible models.
    accuracy = accuracy_score(y, clf.predict(X)) * 100
    accuracies.append(accuracy)
    # [example] Accuracy for [model]: [Accuracy]
    print('Accuracy for {}: {}'.format(name, accuracy))

In [None]:
# The design matrix depends only on the predictor columns of tdf. The loop
# below only overwrites tdf['Survived'], so build the matrix once instead of
# once per classifier.
Xt = dmatrices(formula, tdf, return_type='dataframe')[1]

# Predict with every fitted classifier, preview the result and write one
# numbered submission file per classifier (numbering starts at 1).
for i, (name, clf) in enumerate(zip(names, classifiers), start=1):
    pred = clf.predict(Xt)
    print(name)
    tdf['Survived'] = pred
    display_tab(tdf)
    submission(tdf, "{}_{}_submission.csv".format(i, name))