# Predicting Survival of a Titanic Passenger
# Using Logistic Regression

In [86]:
# pandas provides the CSV reader and the DataFrame used throughout
import pandas as pd
# Load the dataset from a CSV file given by a relative path
df = pd.read_csv(filepath_or_buffer="Titanic.csv")

In [87]:
# Peek at a single record to see which columns the dataset carries
print('First row of the dataset: ')
print(df.iloc[:1])

First row of the dataset: 
   PassengerId  Survived  Pclass                     Name   Sex   Age  SibSp  \
0            1         0       3  Braund, Mr. Owen Harris  male  22.0      1   

   Parch     Ticket  Fare Cabin Embarked  
0      0  A/5 21171  7.25   NaN        S  


In [88]:
# Target is the 'Survived' column; the features are every other column
y = df['Survived']
X = df.drop(columns=['Survived'])

In [89]:
# Dropping the non-relevant features (Name, Ticket, Cabin).
# X is rebuilt from df instead of being mutated with inplace=True, so this
# cell can be re-run without raising KeyError on already-removed columns.
# NOTE(review): PassengerId is still kept as a feature here; it looks like a
# plain row identifier -- confirm whether it should be dropped as well.
X = df.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin'])

In [90]:
# Report the (rows, columns) dimensions of the raw frame
n_rows, n_cols = df.shape
print('Shape of the Data frame: ', (n_rows, n_cols))

Shape of the Data frame:  (891, 12)


In [91]:
# Column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
print("The summary of dataset: ")
df.info()

The summary of dataset: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [92]:
# Count the missing entries in every column of the raw frame
print("Check for missing values: ")
missing_per_column = df.isna().sum()
print(missing_per_column)

Check for missing values: 
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [93]:
# Age and Embarked still contain missing values in X.
# NOTE(review): both fill values are computed on the full X, before the
# train/test split further down, so they also reflect the test rows.
fill_values = {
    # numeric column -> mean of the observed values
    'Age': X['Age'].mean(),
    # categorical column -> most frequent value
    'Embarked': X['Embarked'].mode().iloc[0],
}
X = X.fillna(value=fill_values)

In [94]:
# One-hot encode the two categorical columns; drop_first keeps k-1 dummy
# columns for a feature with k levels
categorical_cols = ['Sex', 'Embarked']
X = pd.get_dummies(
    X,
    columns=categorical_cols,
    prefix=categorical_cols,
    drop_first=True,
)

In [95]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split repeatable across runs
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=9
)

In [96]:
# Print the dimensions of each of the four pieces produced by the split
for label, part in (
    ('Shape of Training set of features: ', X_train),
    ('Shape of Testing set of features: ', X_test),
    ('Shape of Training set of target: ', y_train),
    ('Shape of Testing set of target: ', y_test),
):
    print(label, part.shape)

Shape of Training set of features:  (623, 9)
Shape of Testing set of features:  (268, 9)
Shape of Training set of target:  (623,)
Shape of Testing set of target:  (268,)


In [97]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Learn each feature's mean and standard deviation from the training rows
# only. Fitting on the full X would let test-set statistics leak into the
# preprocessing and make the evaluation on X_test optimistic.
scaler.fit(X_train)

StandardScaler()

In [98]:
# transform() applies the scaling learned by fit(); it does not re-fit.
# Both splits go through the same fitted scaler. X_train / X_test are
# overwritten, so re-running this cell would scale them a second time.
X_train, X_test = (scaler.transform(split) for split in (X_train, X_test))

In [99]:
# Importing the Logistic Regression classifier
from sklearn.linear_model import LogisticRegression
# Build the model with its default settings
lr = LogisticRegression()
# Train on the scaled training features and their labels
lr.fit(X=X_train, y=y_train)

LogisticRegression()

In [100]:
# Predict a class label for every row of the held-out test set
y_pred = lr.predict(X=X_test)

In [101]:
# Accuracy: fraction of test rows whose predicted label matches the true one
from sklearn import metrics
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("The accuracy score is: ", accuracy)

The accuracy score is:  0.8059701492537313


In [102]:
# Per-class precision, recall, F1-score and support on the test set
report = metrics.classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.82      0.87      0.84       162
           1       0.78      0.71      0.74       106

    accuracy                           0.81       268
   macro avg       0.80      0.79      0.79       268
weighted avg       0.80      0.81      0.80       268



In [103]:
# Confusion matrix: rows are the true classes, columns the predicted classes
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))

[[141  21]
 [ 31  75]]
