In [56]:
# DATA LOADING

# pandas provides the DataFrame type used throughout the notebook
import pandas as pd

# Raw GitHub locations of the two CSV files
TRAIN_CSV_URL = "https://raw.githubusercontent.com/Arsath-R/Titanic-Survival-Prediction/refs/heads/main/train.csv"
TEST_CSV_URL = "https://raw.githubusercontent.com/Arsath-R/Titanic-Survival-Prediction/refs/heads/main/test.csv"

# Download each file into its own dataframe (training first, then testing)
train_data, test_data = (pd.read_csv(csv_url) for csv_url in (TRAIN_CSV_URL, TEST_CSV_URL))

# Preview the first rows of the training data
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [57]:
# DATA EXPLORATION

# Column names, non-null counts and dtypes (printed directly by pandas)
train_data.info()

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
numeric_summary = train_data.describe()
numeric_summary

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [58]:
# Count the missing (NaN) entries in each column
train_data.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [59]:
# Number of passengers per outcome (0 = did not survive, 1 = survived)
survival_counts = train_data["Survived"].value_counts()
survival_counts

Unnamed: 0_level_0,count
Survived,Unnamed: 1_level_1
0,549
1,342


In [60]:
# DATA CLEANING AND PREPROCESSING

# Removing the 'Cabin' column since it has too many NULL values
train_data = train_data.drop(columns = "Cabin")

# Filling ONLY the Age column's missing values with its median.
# A frame-wide train_data.fillna(<age median>) would also write that number into
# the missing 'Embarked' entries, turning them into a bogus extra category.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())

# Filling the missing entries in the Embarked column with its most frequent value
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

# Dropping the features that are not used by this model
train_data = train_data.drop(columns = ["Name", "Ticket", "PassengerId"])

# Converting the Sex and Embarked columns to numeric indicator columns.
# Sex is binary, so a single indicator (Sex_male) is enough. Embarked keeps all
# three indicators (C, Q, S) so the feature columns are the same ones produced
# by the test-data preprocessing and expected by the downstream cells.
train_data = pd.get_dummies(train_data, columns = ['Sex'], drop_first = True)
train_data = pd.get_dummies(train_data, columns = ['Embarked'])

# Sanity check: nothing should be missing after cleaning
assert not train_data.isna().any().any(), "unexpected missing values after cleaning"

# Displaying the head of the cleaned dataset
train_data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,True,False,False,True
1,1,1,38.0,1,0,71.2833,False,True,False,False
2,1,3,26.0,0,0,7.925,False,False,False,True
3,1,1,35.0,1,0,53.1,False,False,False,True
4,0,3,35.0,0,0,8.05,True,False,False,True


In [61]:
# FEATURE AND TARGET PREPARATION

# train_test_split performs the hold-out split below
from sklearn.model_selection import train_test_split

# Combine siblings/spouses and parents/children into one family-size feature,
# then discard the two source columns
family_size = train_data['SibSp'] + train_data['Parch']
train_data = train_data.assign(FamilyMembers = family_size).drop(columns = ['SibSp', 'Parch'])

# Target is the 'Survived' column; every remaining column is a feature
y = train_data['Survived']
X = train_data.drop(columns = "Survived")

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [62]:
# MODEL CREATION AND TRAINING

# Logistic regression classifier from scikit-learn
from sklearn.linear_model import LogisticRegression

# Build the model (up to 500 solver iterations) and train it in one step;
# fit() returns the fitted estimator itself
LRmodel = LogisticRegression(max_iter = 500).fit(X_train, y_train)

# Predictions for the held-out validation rows
y_pred = LRmodel.predict(X_test)

In [63]:
# MODEL EVALUATION

# Importing evaluation metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Accuracy: fraction of held-out rows predicted correctly
print("Model's Accuracy : ", accuracy_score(y_test, y_pred))

# Confusion Matrix (rows = true class, columns = predicted class).
# sep="\n" starts the matrix on its own line so both rows line up.
print("\nConfusion Matrix : ", confusion_matrix(y_test, y_pred), sep = "\n")

# Classification Report (precision / recall / f1 per class).
# sep="\n" keeps the report's header row aligned with its columns instead of
# being pushed right by the label.
print("\nClassification Report : ", classification_report(y_test, y_pred), sep = "\n")

Model's Accuracy :  0.8044692737430168

Confusion Matrix :  [[90 15]
 [20 54]]

Classification Report :                precision    recall  f1-score   support

           0       0.82      0.86      0.84       105
           1       0.78      0.73      0.76        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.80      0.80      0.80       179



In [64]:
# TEST DATA CLEANING AND PREPROCESSING

# Removing 'Cabin' (too many NULL values) and the features not used by the model
test_data = test_data.drop(columns = ["Cabin", "Name", "Ticket", "PassengerId"])

# Filling missing numeric values column by column, each with its OWN median.
# A frame-wide test_data.fillna(<age median>) would put the Age median into
# every column that has a gap, whatever that column actually measures.
numeric_columns = test_data.select_dtypes(include = "number").columns
test_data[numeric_columns] = test_data[numeric_columns].fillna(test_data[numeric_columns].median())

# Filling the missing entries in the Embarked column with its most frequent value
test_data['Embarked'] = test_data['Embarked'].fillna(test_data['Embarked'].mode()[0])

# Converting the Sex and Embarked columns to numeric indicator columns
# (one indicator for Sex, all three for Embarked)
test_data = pd.get_dummies(test_data, columns = ['Sex'], drop_first = True)
test_data = pd.get_dummies(test_data, columns = ['Embarked'])

# Sanity check: the model cannot predict on rows that still contain NaN
assert not test_data.isna().any().any(), "unexpected missing values after cleaning"

# Displaying the head of the cleaned dataset
test_data.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,34.5,0,0,7.8292,True,False,True,False
1,3,47.0,1,0,7.0,False,False,False,True
2,2,62.0,0,0,9.6875,True,False,True,False
3,3,27.0,0,0,8.6625,True,False,False,True
4,3,22.0,1,1,12.2875,False,False,False,True


In [65]:
# FEATURE PREPARATION (TEST DATA)

# Combine siblings/spouses and parents/children into one family-size feature,
# then discard the two source columns
test_family_size = test_data['SibSp'] + test_data['Parch']
test_data = test_data.assign(FamilyMembers = test_family_size).drop(columns = ['SibSp', 'Parch'])

In [67]:
# MODEL PREDICTION ON UNSEEN DATA
# (the test frame has no 'Survived' labels, so this predicts rather than evaluates)

# Test data prediction.
# Select exactly the feature columns used for training, in the same order.
# This keeps the cell re-runnable: after the first run test_data also holds the
# 'Survived' column added below, which must not be passed to the model.
test_y_pred = LRmodel.predict(test_data[X_train.columns])

# Adding result column
test_data['Survived'] = test_y_pred
test_data.head(20)

Unnamed: 0,Pclass,Age,Fare,Sex_male,Embarked_C,Embarked_Q,Embarked_S,FamilyMembers,Survived
0,3,34.5,7.8292,True,False,True,False,0,0
1,3,47.0,7.0,False,False,False,True,1,0
2,2,62.0,9.6875,True,False,True,False,0,0
3,3,27.0,8.6625,True,False,False,True,0,0
4,3,22.0,12.2875,False,False,False,True,2,1
5,3,14.0,9.225,True,False,False,True,0,0
6,3,30.0,7.6292,False,False,True,False,0,1
7,2,26.0,29.0,True,False,False,True,2,0
8,3,18.0,7.2292,False,True,False,False,0,1
9,3,21.0,24.15,True,False,False,True,2,0
