In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics._plot.confusion_matrix import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
path = '/kaggle/input/Titanic'

# Read both splits, using the PassengerId column as the row index straight from the parser.
train_data = pd.read_csv(f'{path}/train.csv', index_col='PassengerId')
test_data = pd.read_csv(f'{path}/test.csv', index_col='PassengerId')

In [None]:
# Preview the first rows of the raw training frame.
train_data.head()

In [None]:
# Preview the first rows of the raw test frame.
test_data.head()

In [None]:
# The Ticket column is never used later in this notebook; remove it from both frames in place.
for frame in (train_data, test_data):
    frame.drop(columns='Ticket', inplace=True)

In [None]:
# Column dtypes and non-null counts: shows which columns contain missing values.
train_data.info()

In [None]:
# Summary statistics for the training frame.
train_data.describe()

In [None]:
# Per-outcome mean of every numeric column.
# The frame still contains string columns here (Name, Sex, ...). pandas >= 2.0 no longer
# drops them silently and raises TypeError when asked for their mean, so restrict the
# pivot to numeric columns explicitly (assumes Survived itself is numeric).
pd.pivot_table(train_data.select_dtypes(include='number'), index='Survived')

In [None]:
#Sex values to numeric
# 'male' becomes 1; every other value (including anything unexpected) becomes 0.
train_data['Sex'] = train_data['Sex'].eq('male').astype('int64')
test_data['Sex'] = test_data['Sex'].eq('male').astype('int64')

In [None]:
#Age values to categorical
def replace_age(age):
    """Map an age in years to an ordinal bucket code.

    Buckets (upper bounds inclusive): <=16 -> 0, <=32 -> 1, <=48 -> 2,
    <=64 -> 3, anything older -> 4.

    A missing age (NaN/None) returns -1. Without this guard NaN fails every
    ``<=`` comparison and would be filed under bucket 4, mixing passengers of
    unknown age with the oldest ones.
    """
    if pd.isna(age):
        return -1
    for code, upper_bound in enumerate((16, 32, 48, 64)):
        if age <= upper_bound:
            return code
    return 4

# Replace the raw age with its bucket code in both frames.
for frame in (train_data, test_data):
    frame['Age'] = frame['Age'].map(replace_age)


In [None]:
#Fare values to categorical
def replace_fare(fare):
    """Map a ticket fare to an ordinal bucket code.

    Buckets (upper bounds inclusive): <=7.91 -> 0, <=14.454 -> 1, <=31 -> 2,
    anything higher -> 3.

    A missing fare (NaN/None) returns -1. Without this guard NaN fails every
    ``<=`` comparison and would be filed under the most expensive bucket.
    """
    if pd.isna(fare):
        return -1
    for code, upper_bound in enumerate((7.91, 14.454, 31)):
        if fare <= upper_bound:
            return code
    return 3
    
# Replace the raw fare with its bucket code in both frames.
for frame in (train_data, test_data):
    frame['Fare'] = frame['Fare'].map(replace_fare)

In [None]:
#Embarked values to numeric
# 'S' -> 0, 'C' -> 1; every other value, including a missing one, falls through to 2.
embarked_codes = {'S': 0, 'C': 1}
train_data['Embarked'] = train_data['Embarked'].map(embarked_codes).fillna(2).astype('int64')
test_data['Embarked'] = test_data['Embarked'].map(embarked_codes).fillna(2).astype('int64')

In [None]:
#Cabin values to categorical
# Reduce Cabin to an indicator: 1 when a value is present, 0 when it is missing.
train_data['Cabin'] = train_data['Cabin'].notna().astype('int64')
test_data['Cabin'] = test_data['Cabin'].notna().astype('int64')

In [None]:
#SibSp to FamilySize
# Family size = siblings/spouses aboard + parents/children aboard + the passenger.
# The previous version added only 1 to SibSp, so anyone travelling solely with parents
# or children got FamilySize == 1 and was later flagged as alone.
# NOTE(review): assumes the frames carry the dataset's 'Parch' column — confirm.
train_data['SibSp'] = train_data['SibSp'] + train_data['Parch'] + 1
test_data['SibSp'] = test_data['SibSp'] + test_data['Parch'] + 1

# Rename in place of SibSp so the column order stays the same in both frames.
train_data = train_data.rename(columns={'SibSp': 'FamilySize'})
test_data = test_data.rename(columns={'SibSp': 'FamilySize'})

In [None]:
#Adding isAlone feature
# 1 when FamilySize is exactly 1, otherwise 0.
train_data['isAlone'] = train_data['FamilySize'].eq(1).astype('int64')
test_data['isAlone'] = test_data['FamilySize'].eq(1).astype('int64')

In [None]:
#Adding Title feature
def replace_name(name):
    """Encode the honorific found in a passenger name as an integer.

    Codes: 'Mr' -> 1, 'Master' -> 2, 'Mrs' -> 3, 'Miss' -> 4, anything else -> 5.

    When the name has the "Surname, Title. Given names" layout, only the token
    between the first comma and the following period is examined. Plain
    substring search misfires on surnames: 'Master' is found in
    "Masters, Mrs. Anna" and 'Miss' in "Missouri, Dr. John".
    Names without that layout fall back to the original substring checks.
    """
    title_codes = {'Mr': 1, 'Master': 2, 'Mrs': 3, 'Miss': 4}

    _, comma, after_surname = name.partition(',')
    title, period, _ = after_surname.partition('.')
    if comma and period:
        return title_codes.get(title.strip(), 5)

    # Legacy behaviour for names that do not follow the expected layout.
    if 'Mr.' in name:
        return 1
    if 'Master' in name:
        return 2
    if 'Mrs.' in name:
        return 3
    if 'Miss' in name:
        return 4
    return 5

for frame in (train_data, test_data):
    # Encode the honorific first, then discard the raw name it was read from.
    frame['Title'] = frame['Name'].map(replace_name)
    frame.drop(columns='Name', inplace=True)

In [None]:
# Preview the training frame after feature encoding.
train_data.head()

In [None]:
# One histogram per feature, split by survival outcome. The first column is skipped
# (presumably the Survived target itself — confirm against the CSV column order).
for feature in train_data.columns[1:]:
    fig, ax = plt.subplots()
    sns.histplot(data=train_data, x=feature, hue='Survived', discrete=True, multiple="dodge", ax=ax)
    # Put a tick on each value that actually occurs in the column.
    ax.set_xticks(train_data[feature].unique())
    plt.show()

In [None]:
# Pairwise correlations between all columns of the encoded training frame.
colormap = plt.cm.viridis
correlations = train_data.corr()

fig, ax = plt.subplots(figsize=(12, 12))
ax.set_title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(
    correlations,
    linewidths=0.1,
    vmax=1.0,
    square=True,
    cmap=colormap,
    linecolor='white',
    annot=True,
    ax=ax,
)

In [None]:
# Mean of Survived for each Title code.
train_data.groupby('Title')[['Survived']].mean()

In [None]:
# Number of non-null Survived values for each Sex code.
train_data.groupby('Sex')[['Survived']].count()

In [None]:
# Separate the target from the features; pop also removes the column from train_data in place.
labels = train_data.pop('Survived')

# Hold out a quarter of the rows for validation, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    labels,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Baseline: fit each model with default hyper-parameters and score it on the hold-out split.
# The tree-based models are seeded (same seed as the train/test split) so that the
# reported accuracies and confusion matrices are reproducible between runs.
classifiers = [LogisticRegression(),
               KNeighborsClassifier(),
               DecisionTreeClassifier(random_state=42),
               RandomForestClassifier(random_state=42),
               SVC()]

for clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Accuracy for {clf.__class__.__name__}: {accuracy}")

    # Use the label order learned by the model so matrix rows/columns match the display labels.
    cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
    disp.plot()
    plt.show()

In [None]:
# Hyper-parameter search: 10-fold cross-validated grid search (accuracy) for each model family.
# Estimators that use randomness are seeded so the winning parameters are reproducible.
classifiers = [LogisticRegression(random_state=42),
               KNeighborsClassifier(),
               DecisionTreeClassifier(random_state=42),
               RandomForestClassifier(random_state=42),
               SVC()]

param_grid_lr = {'max_iter': [100, 250, 500, 1000, 2000],
                 'penalty': ['l1', 'l2'],
                 'solver': ['liblinear']}

# 'p' is only consulted by the minkowski metric; with metric fixed to euclidean/manhattan
# it is ignored, so varying it merely evaluated every candidate twice.
param_grid_knn = {'n_neighbors': [3, 5, 7, 9],
                  'weights': ['uniform', 'distance'],
                  'algorithm': ['auto', 'ball_tree', 'kd_tree'],
                  'metric': ['euclidean', 'manhattan']}

param_grid_dc = {'criterion': ['gini', 'entropy'],
                 'max_depth': range(1, 11)}

param_grid_rf = {'criterion': ['gini', 'entropy'],
                 'max_depth': range(2, 6)}

# A list of grids: each kernel is searched only with the parameters that apply to it.
param_grid_svc = [{'kernel': ['rbf'],
                   'gamma': [.1, .5, 1, 2, 5, 10],
                   'C': [.1, 1, 10, 100, 1000]},
                  {'kernel': ['linear'],
                   'C': [.1, 1, 10, 100, 1000]},
                  {'kernel': ['poly'],
                   'degree': [2, 3, 4, 5],
                   'C': [.1, 1, 10, 100, 1000]}]

# Must stay in the same order as `classifiers`.
grid_params = [param_grid_lr, param_grid_knn, param_grid_dc, param_grid_rf, param_grid_svc]

for classifier, param_grid in zip(classifiers, grid_params):
    clf_gs = GridSearchCV(classifier, param_grid=param_grid, cv=10, verbose=True, n_jobs=-1, scoring='accuracy')
    clf_gs.fit(X_train, y_train)

    print(f'Classifier: {classifier.__class__.__name__}')
    print(f'Best score: {clf_gs.best_score_}')
    print(f'Best parameters: {clf_gs.best_params_}')
    print('-'*60)


In [None]:
#Best clf
# Refit the chosen configuration on the full training frame. The forest is seeded so the
# submission can be regenerated exactly; unseeded, every run grows different trees.
clf = RandomForestClassifier(criterion='gini', max_depth=5, random_state=42)
clf.fit(train_data, labels)

In [None]:
# Predict labels for the test frame with the model fitted on the full training frame.
submission_pred = clf.predict(test_data)

In [None]:
# Pair each test passenger's id (the frame index) with its predicted label.
submission_columns = {
    'PassengerId': test_data.index.to_numpy(),
    'Survived': submission_pred,
}
df = pd.DataFrame(submission_columns)

In [None]:
# Write the predictions to disk; index=False keeps the row index out of the file.
df.to_csv('submission.csv', index=False)