In [1]:
import pandas as pd

# Load the training data from the working directory, then display summary
# statistics (count, mean, std, quartiles, min/max) for its numeric columns.
dataset = pd.read_csv('train.csv')
dataset.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [2]:
def dataset_info(dataset):
    """Summarize every column of a DataFrame.

    Returns a new DataFrame with one row per column of ``dataset`` and four
    fields: ``Name`` (the column label), ``Type`` (the column dtype),
    ``Unique`` (number of distinct values as counted by ``nunique``) and
    ``Missing`` (number of null entries).
    """
    distinct_counts = dataset.nunique()
    null_counts = dataset.isnull().sum()

    return pd.DataFrame({
        'Name': dataset.columns,
        'Type': dataset.dtypes.values,
        'Unique': distinct_counts.values,
        'Missing': null_counts.values,
    })

# Per-column overview of the raw training data: dtype, distinct values, nulls.
dataset_info(dataset)

Unnamed: 0,Name,Type,Unique,Missing
0,PassengerId,int64,891,0
1,Survived,int64,2,0
2,Pclass,int64,3,0
3,Name,object,891,0
4,Sex,object,2,0
5,Age,float64,88,177
6,SibSp,int64,7,0
7,Parch,int64,7,0
8,Ticket,object,681,0
9,Fare,float64,248,0


In [3]:
# Feature groups: `nominal` holds the categorical columns, `numerical` the
# numeric ones; each group gets its own preprocessing further down.
nominal = ['Sex', 'Ticket', 'Embarked']
numerical = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

# Target is the 'Survived' column; inputs are the selected feature columns,
# categorical ones first.
y = dataset['Survived']
X = dataset[[*nominal, *numerical]]

# Check dtypes and missing values of the selected features only.
dataset_info(X)

Unnamed: 0,Name,Type,Unique,Missing
0,Sex,object,2,0
1,Ticket,object,681,0
2,Embarked,object,3,2
3,Pclass,int64,3,0
4,Age,float64,88,177
5,SibSp,int64,7,0
6,Parch,int64,7,0
7,Fare,float64,248,0


In [4]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

In [5]:
# Dense one-hot encoder for the categorical columns. handle_unknown='ignore'
# makes categories that were not seen during fit encode as all zeros instead of
# raising at transform time.
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`; the
    # old `sparse` keyword was removed in 1.4 and raises TypeError there.
    nominal_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword.
    nominal_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode.
nominal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encode', nominal_encoder)
])

# Numeric features: fill missing values with the column mean, then standardize.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('feature_scale', StandardScaler())
])

# Route each column group through its own pipeline; columns not listed in
# `nominal` or `numerical` are dropped (ColumnTransformer's default remainder).
full_preprocessor = ColumnTransformer([
    ('nominal_transformer', nominal_pipeline, nominal),
    ('numerical_transformer', numerical_pipeline, numerical)
])

In [6]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [7]:
# Hold out 30% of the rows as a test set for comparing candidate models; the
# fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [8]:
# Candidate 1: logistic regression (default settings) behind the shared
# preprocessing step.
from sklearn.linear_model import LogisticRegression

logistic_steps = [
    ('preprocessor', full_preprocessor),
    ('model', LogisticRegression()),
]
classifier = Pipeline(steps=logistic_steps)

# Fit on the training split, then score the held-out split.
y_pred = classifier.fit(X_train, y_train).predict(X_test)

print(accuracy_score(y_true=y_test, y_pred=y_pred))
print(f1_score(y_true=y_test, y_pred=y_pred))
confusion_matrix(y_true=y_test, y_pred=y_pred)

0.8097014925373134
0.7384615384615385


array([[145,  23],
       [ 28,  72]])

In [9]:
# Candidate 2: support vector classifier (default settings) behind the shared
# preprocessing step.
from sklearn.svm import SVC

svm_steps = [
    ('preprocessor', full_preprocessor),
    ('model', SVC()),
]
classifier = Pipeline(steps=svm_steps)

# Fit on the training split, then score the held-out split.
y_pred = classifier.fit(X_train, y_train).predict(X_test)

print(accuracy_score(y_true=y_test, y_pred=y_pred))
print(f1_score(y_true=y_test, y_pred=y_pred))
confusion_matrix(y_true=y_test, y_pred=y_pred)

0.8134328358208955
0.7422680412371133


array([[146,  22],
       [ 28,  72]])

In [10]:
# Candidate 3: random forest behind the shared preprocessing step; the fixed
# random_state keeps the fitted forest reproducible.
from sklearn.ensemble import RandomForestClassifier

forest_steps = [
    ('preprocessor', full_preprocessor),
    ('model', RandomForestClassifier(random_state=42)),
]
classifier = Pipeline(steps=forest_steps)

# Fit on the training split, then score the held-out split.
y_pred = classifier.fit(X_train, y_train).predict(X_test)

print(accuracy_score(y_true=y_test, y_pred=y_pred))
print(f1_score(y_true=y_test, y_pred=y_pred))
confusion_matrix(y_true=y_test, y_pred=y_pred)

0.8544776119402985
0.7979274611398963


array([[152,  16],
       [ 23,  77]])

In [11]:
# Candidate 4: Gaussian naive Bayes behind the shared preprocessing step.
from sklearn.naive_bayes import GaussianNB

bayes_steps = [
    ('preprocessor', full_preprocessor),
    ('model', GaussianNB()),
]
classifier = Pipeline(steps=bayes_steps)

# Fit on the training split, then score the held-out split.
y_pred = classifier.fit(X_train, y_train).predict(X_test)

print(accuracy_score(y_true=y_test, y_pred=y_pred))
print(f1_score(y_true=y_test, y_pred=y_pred))
confusion_matrix(y_true=y_test, y_pred=y_pred)

0.44029850746268656
0.5454545454545454


array([[ 28, 140],
       [ 10,  90]])

In [12]:
# Random Forest scored best on the hold-out split, so check it with 5-fold
# cross-validation over all labelled rows and report the mean fold score.
forest_steps = [
    ('preprocessor', full_preprocessor),
    ('model', RandomForestClassifier(random_state=42)),
]
classifier = Pipeline(steps=forest_steps)

scores = cross_val_score(estimator=classifier, X=X, y=y, cv=5)
print(scores.mean())

0.8394890465130878


In [22]:
# Hyperparameter search for the forest: both split criteria crossed with
# 50, 100, ..., 1000 trees, each combination scored by 5-fold cross-validation.
from sklearn.model_selection import GridSearchCV

# Keys use the '<step name>__<parameter>' form so they reach the 'model' step.
grid = {
    'model__criterion': ['gini', 'entropy'],
    'model__n_estimators': list(range(50, 1001, 50)),
}

full_pipeline = Pipeline(steps=[
    ('preprocessor', full_preprocessor),
    ('model', RandomForestClassifier(random_state=42)),
])
classifier = full_pipeline

# n_jobs=-1 runs the candidate fits on all available cores.
searcher = GridSearchCV(estimator=classifier, param_grid=grid, cv=5, n_jobs=-1)
searcher.fit(X, y)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('nominal_transformer',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('encode',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         ['Sex',
                                                                          'Ticket',
                                                                          'Embar

In [14]:
# Mean cross-validated score of the best parameter combination found.
searcher.best_score_

0.8394890465130878

In [15]:
# Parameter combination that achieved the best mean cross-validated score.
searcher.best_params_

{'model__criterion': 'gini', 'model__n_estimators': 100}

In [25]:
# Final model: rebuild the pipeline and apply the hyperparameters selected by
# the grid search rather than hard-coding them, so this cell cannot drift out
# of sync with `searcher` when the grid (or its result) changes.
classifier = Pipeline([
    ('preprocessor', full_preprocessor),
    ('model', RandomForestClassifier(random_state=42))
])
# best_params_ uses the same 'model__<parameter>' keys that
# Pipeline.set_params accepts, so it can be passed through unchanged.
classifier.set_params(**searcher.best_params_)

# Train on every labelled row.
classifier.fit(X, y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('nominal_transformer',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encode',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['Sex', 'Ticket',
                                                   'Embarked']),
                                                 ('numerical_transformer',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                      

In [26]:
# Build the Kaggle submission file.
# Keep the raw test frame so each prediction is paired with the PassengerId of
# the row that was actually scored, instead of borrowing ids by row position
# from the sample file 'gender_submission.csv'.
test_raw = pd.read_csv('test.csv')  # assumes test.csv has a PassengerId column like train.csv — TODO confirm
test_set = test_raw[nominal + numerical]

submission = pd.DataFrame({
    'PassengerId': test_raw['PassengerId'],
    'Survived': classifier.predict(test_set),
})
# index=False keeps the output to the two columns above.
submission.to_csv('results.csv', index=False)