In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv


**Imports**

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Load data files

In [3]:
df_train = pd.read_csv('/kaggle/input/titanic/train.csv')
df_train.shape

(891, 12)

Descriptive Analytics (Stats + visuals)

In [4]:
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

Data Preparation for Modeling

In [6]:
# Missing Value Treatment
df_train.Age = df_train.Age.fillna(df_train.Age.mean())
df_train.Embarked = df_train.Embarked.fillna(df_train.Embarked.mode())

In [7]:
df_train = df_train.astype({'Pclass': str})
df_train.Sex = np.where(df_train.Sex == 'male', 1, 0)
df_train = pd.get_dummies(df_train, columns=['Pclass','Embarked'])

In [8]:
X = df_train[list(set(df_train.columns) - set(['Cabin','Survived','PassengerId','Ticket','Name']))]
y = df_train[['Survived']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) 

Model Development & Selection

In [9]:
logit = LogisticRegression()
logit.fit(X_train, y_train)
y_pred = logit.predict(X_test)

  y = column_or_1d(y, warn=True)


In [10]:
# New Model Evaluation metrics 
print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred)))
print('Precision Score : ' + str(precision_score(y_test,y_pred)))
print('Recall Score : ' + str(recall_score(y_test,y_pred)))
print('F1 Score : ' + str(f1_score(y_test,y_pred)))

#Logistic Regression (Grid Search) Confusion matrix
metrics.confusion_matrix(y_test,y_pred)

Accuracy Score : 0.7798507462686567
Precision Score : 0.7916666666666666
Recall Score : 0.6608695652173913
F1 Score : 0.7203791469194312


array([[133,  20],
       [ 39,  76]])

In [11]:
df_test_raw = pd.read_csv('/kaggle/input/titanic/test.csv')
df_test_raw.shape

(418, 11)

In [12]:
df_test = df_test_raw.copy()

In [13]:
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [14]:
# Missing Value Treatment
df_test.Age = df_test.Age.fillna(df_test.Age.mean())
df_test.Fare = df_test.Fare.fillna(df_test.Fare.mean())
df_test.Embarked = df_test.Embarked.fillna(df_test.Embarked.mode())

# Preparation for model application
df_test = df_test.astype({'Pclass': str})
df_test.Sex = np.where(df_test.Sex == 'male', 1, 0)
df_test = pd.get_dummies(df_test, columns=['Pclass','Embarked'])

df_test = df_test[list(set(df_test.columns).intersection(set(X_train.columns)))]

In [15]:
X_train.columns, df_test.columns

(Index(['Embarked_Q', 'Sex', 'SibSp', 'Embarked_S', 'Pclass_1', 'Pclass_3',
        'Fare', 'Embarked_C', 'Pclass_2', 'Age', 'Parch'],
       dtype='object'),
 Index(['Embarked_Q', 'Sex', 'SibSp', 'Embarked_S', 'Pclass_1', 'Pclass_3',
        'Fare', 'Embarked_C', 'Pclass_2', 'Age', 'Parch'],
       dtype='object'))

In [16]:
df_test_raw['Survived'] = logit.predict(df_test)

In [17]:
df_test_raw[['PassengerId','Survived']].to_csv('gender_submission.csv')