In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression

## IMPORT CSV FILE

In [2]:
ceci_data = pd.read_csv("titanic.csv")
ceci_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Exploratory Data Analysis

In [3]:
ceci_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
ceci_data.drop('Cabin', inplace=True, axis='columns') # it has most null values so has to be dropped

## Categorical data


In [5]:
for col in ceci_data:
    print(col, ceci_data[col].dtypes)


PassengerId int64
Survived int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Embarked object


## Specifying Categrical and Numerical Data

In [6]:
categorical_data = ['Name', 'Sex', 'Ticket', 'Embarked']

numerical_data = [col for col in ceci_data if col not in categorical_data]

## Instead of dropping columns with missing data , lets fill them


In [7]:
#for numerical dat -mean

ceci_data[numerical_data] = ceci_data[numerical_data].fillna(ceci_data[numerical_data].mean())

In [8]:
#for categorical data - mode

for col in categorical_data:
    if(ceci_data[col].isna().sum() > 0):
        ceci_data[col] = ceci_data[col].fillna(ceci_data[col].mode()[0])

## Verifying if nulls still exist

In [9]:
ceci_data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

## Encoding

In [10]:
ceci_data = pd.concat([ceci_data, pd.get_dummies(ceci_data[['Name', 'Ticket']])], axis=1)
ceci_data.drop(['Name', 'Ticket'], inplace=True, axis='columns')

In [11]:
#using ordinal encounter for categorical data like 'Sex' & ' Embarked'
cols = ['Sex', 'Embarked']

ceci_data[cols] = OrdinalEncoder().fit_transform(ceci_data[cols])
ceci_data[cols].head()

Unnamed: 0,Sex,Embarked
0,1.0,2.0
1,0.0,0.0
2,0.0,2.0
3,0.0,2.0
4,1.0,2.0


In [12]:
#check the data types now minus obvoiusly the One Hot Encoded data

available_Cols = ['Sex', 'Embarked', 'PassengerId',
                          'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
                          'Embarked']

for col in available_Cols:
    print(col, ceci_data[col].dtypes)

Sex float64
Embarked float64
PassengerId int64
Survived int64
Pclass int64
Age float64
SibSp int64
Parch int64
Fare float64
Embarked float64


# Model Training

In [13]:
X = ceci_data.drop('Survived', axis = 1)
y = ceci_data['Survived']

In [14]:
X[:3]

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,"Name_Abbing, Mr. Anthony","Name_Abbott, Mr. Rossmore Edward",...,Ticket_STON/O2. 3101290,Ticket_SW/PP 751,Ticket_W./C. 14258,Ticket_W./C. 14263,Ticket_W./C. 6607,Ticket_W./C. 6608,Ticket_W./C. 6609,Ticket_W.E.P. 5734,Ticket_W/C 14208,Ticket_WE/P 5735
0,1,3,1.0,22.0,1,0,7.25,2.0,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,1,0.0,38.0,1,0,71.2833,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,3,0.0,26.0,0,0,7.925,2.0,False,False,...,False,False,False,False,False,False,False,False,False,False


In [15]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=60)

In [16]:
Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape

((668, 1580), (223, 1580), (668,), (223,))

In [17]:
lr = LogisticRegression().fit(Xtrain, ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
lr.score(Xtrain, ytrain)

0.8802395209580839

In [19]:
lr.score(Xtest, ytest)

0.7668161434977578