In [1]:
# Import Data Manipulation libraries
import pandas as pd
import numpy as np

In [2]:
#import data visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
#import warnings
import warnings
warnings.filterwarnings('ignore')

In [4]:
#import logging
import logging
logging.basicConfig(level=logging.INFO,
                    filename='model.log',
                    filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s')

In [5]:
# Load the dataset

url = "https://raw.githubusercontent.com/Frisk516/EnE_TitanicModelBuilding/refs/heads/main/research/titanic_train.csv"

df=pd.read_csv(url)

df.sample(frac=1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
437,438,1,2,"Richards, Mrs. Sidney (Emily Hocking)",female,24.0,2,3,29106,18.7500,,S
832,833,0,3,"Saad, Mr. Amin",male,,0,0,2671,7.2292,,C
185,186,0,1,"Rood, Mr. Hugh Roscoe",male,,0,0,113767,50.0000,A32,S
616,617,0,3,"Danbom, Mr. Ernst Gilbert",male,34.0,1,1,347080,14.4000,,S
65,66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
104,105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37.0,2,0,3101276,7.9250,,S
654,655,0,3,"Hegarty, Miss. Hanora ""Nora""",female,18.0,0,0,365226,6.7500,,Q
54,55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1,113509,61.9792,B30,C
277,278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0.0000,,S


In [6]:
#checking missing values

df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# Dropping column cabin due to high percentage of missing values

df.drop(['Cabin'], axis=1, inplace=True)

In [8]:
#Splitting data into categorical and numerical

Numerical_Data=df.select_dtypes(exclude='object')

Categorical_Data=df.select_dtypes(include='object')

In [9]:
Categorical_Data

Unnamed: 0,Name,Sex,Ticket,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,S
4,"Allen, Mr. William Henry",male,373450,S
...,...,...,...,...
886,"Montvila, Rev. Juozas",male,211536,S
887,"Graham, Miss. Margaret Edith",female,112053,S
888,"Johnston, Miss. Catherine Helen ""Carrie""",female,W./C. 6607,S
889,"Behr, Mr. Karl Howell",male,111369,C


In [10]:
Numerical_Data

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1,0,3,22.0,1,0,7.2500
1,2,1,1,38.0,1,0,71.2833
2,3,1,3,26.0,0,0,7.9250
3,4,1,1,35.0,1,0,53.1000
4,5,0,3,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
886,887,0,2,27.0,0,0,13.0000
887,888,1,1,19.0,0,0,30.0000
888,889,0,3,,1,2,23.4500
889,890,1,1,26.0,0,0,30.0000


In [11]:
#Performing EDA

from collections import OrderedDict
stats=[]

for i in Numerical_Data:
    numerical_stats=OrderedDict({
    'Feature':i,
    'Missing Values': df[i].isnull().sum(),
    'Unique Values': df[i].nunique(),
    'Mean': df[i].mean(),
    'Median': df[i].median(),
    'Min': df[i].min(),
    'Max': df[i].max(),
    'Range': df[i].max() - df[i].min(),
    'Standard Deviation': df[i].std(),
    'Variance': df[i].var(),
    'Skewness': df[i].skew(),
    'Kurtosis': df[i].kurt(),
    'Count': df[i].count(),
    'Percentile 25': df[i].quantile(0.25),
    'Percentile 50': df[i].quantile(0.50),
    'Percentile 75': df[i].quantile(0.75),
    'IQR': df[i].quantile(0.75) - df[i].quantile(0.25)
        
})
    stats.append(numerical_stats)

df_stats =pd.DataFrame(stats)

df_stats

Unnamed: 0,Feature,Missing Values,Unique Values,Mean,Median,Min,Max,Range,Standard Deviation,Variance,Skewness,Kurtosis,Count,Percentile 25,Percentile 50,Percentile 75,IQR
0,PassengerId,0,891,446.0,446.0,1.0,891.0,890.0,257.353842,66231.0,0.0,-1.2,891,223.5,446.0,668.5,445.0
1,Survived,0,2,0.383838,0.0,0.0,1.0,1.0,0.486592,0.236772,0.478523,-1.775005,891,0.0,0.0,1.0,1.0
2,Pclass,0,3,2.308642,3.0,1.0,3.0,2.0,0.836071,0.699015,-0.630548,-1.280015,891,2.0,3.0,3.0,1.0
3,Age,177,88,29.699118,28.0,0.42,80.0,79.58,14.526497,211.019125,0.389108,0.178274,714,20.125,28.0,38.0,17.875
4,SibSp,0,7,0.523008,0.0,0.0,8.0,8.0,1.102743,1.216043,3.695352,17.88042,891,0.0,0.0,1.0,1.0
5,Parch,0,7,0.381594,0.0,0.0,6.0,6.0,0.806057,0.649728,2.749117,9.778125,891,0.0,0.0,0.0,0.0
6,Fare,0,248,32.204208,14.4542,0.0,512.3292,512.3292,49.693429,2469.436846,4.787317,33.398141,891,7.9104,14.4542,31.0,23.0896


In [12]:
# Checking Categorical_Data

for i in Categorical_Data:
    print(Categorical_Data[i].value_counts())
    print('*'*40)

Name
Braund, Mr. Owen Harris                     1
Boulos, Mr. Hanna                           1
Frolicher-Stehli, Mr. Maxmillian            1
Gilinski, Mr. Eliezer                       1
Murdlin, Mr. Joseph                         1
                                           ..
Kelly, Miss. Anna Katherine "Annie Kate"    1
McCoy, Mr. Bernard                          1
Johnson, Mr. William Cahoone Jr             1
Keane, Miss. Nora A                         1
Dooley, Mr. Patrick                         1
Name: count, Length: 891, dtype: int64
****************************************
Sex
male      577
female    314
Name: count, dtype: int64
****************************************
Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 681, dtype: int64
****************************************
Embarked
S    644
C    168
Q     77
Name: count, dtype: int64
*******

In [13]:
# checking target column

df['Survived'].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [14]:
#encoding sex column

df['Sex']=df['Sex'].replace({'female':0,'male':1})

In [15]:
Numerical_Data

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1,0,3,22.0,1,0,7.2500
1,2,1,1,38.0,1,0,71.2833
2,3,1,3,26.0,0,0,7.9250
3,4,1,1,35.0,1,0,53.1000
4,5,0,3,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
886,887,0,2,27.0,0,0,13.0000
887,888,1,1,19.0,0,0,30.0000
888,889,0,3,,1,2,23.4500
889,890,1,1,26.0,0,0,30.0000


In [16]:
#imputing missing values with mean

for i in Numerical_Data:
    df[i].fillna(df[i].mean(), inplace=True)

In [17]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

In [18]:
df.Embarked.value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [19]:
#Imputing missing values with mode

df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

In [20]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [21]:
# encoding ticket column with label encoding

from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()

df['Ticket']=le.fit_transform(df['Ticket'])

In [22]:
#encoding name column with label encoding

df['Name']=le.fit_transform(df['Name'])

In [23]:
# encoding embarked column with label encoding

df['Embarked']=le.fit_transform(df['Embarked'])

In [24]:
df.drop(columns=['PassengerId'], inplace=True)
df.drop(columns=['Ticket'], inplace=True)
df.drop(columns=['Name'], inplace=True)
df.drop(columns='Embarked',inplace=True)
df.drop(columns=['Parch','SibSp'],inplace=True)

In [25]:






X=df.drop(columns='Survived')

X             

y=df['Survived']

y             


#Split The Dataset Into Train and Test


from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=42)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.fit_transform(X_test)
X_train





array([[0.        , 1.        , 0.04498618, 0.15977676],
       [1.        , 1.        , 0.36792055, 0.01541158],
       [1.        , 0.        , 0.00728826, 0.02173075],
       ...,
       [1.        , 1.        , 0.50992712, 0.02753757],
       [0.        , 0.        , 0.17064589, 0.2342244 ],
       [0.        , 1.        , 0.25860769, 0.15085515]])

In [26]:
# using SMOTE to Balance Data

from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)



In [27]:
# checking the values in sex column after SMOTE

pd.Series(y_train).value_counts()



Survived
1    392
0    392
Name: count, dtype: int64

In [28]:
#Using Random Forest Classifier for prediction

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

RF=RandomForestClassifier()

RF.fit(X_train,y_train)

y_pred_RF=RF.predict(X_test)

accuracy_score_RF = accuracy_score(y_test,y_pred_RF)
print(f'The Accuracy Score is ',accuracy_score_RF)

The Accuracy Score is  0.8022388059701493


In [29]:
#Using Adaboost Classifier for prediction

from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import accuracy_score

ADA=AdaBoostClassifier()

ADA.fit(X_train,y_train)

y_pred_ADA=ADA.predict(X_test)

accuracy_score_ADA = accuracy_score(y_test,y_pred_ADA)
print(f'The Accuracy Score is ',accuracy_score_ADA)

The Accuracy Score is  0.7761194029850746


In [30]:
#using Logistic regression for prediction
from sklearn.linear_model import LogisticRegression



LR=LogisticRegression()



LR.fit(X_train,y_train)



y_pred=LR.predict(X_test)


from sklearn.metrics import accuracy_score,classification_report

accuracy_score_LR=accuracy_score(y_test,y_pred)
accuracy_score_LR

0.7985074626865671

In [31]:
#using decision tree classifier for prediction

from sklearn.tree import DecisionTreeClassifier

DT=DecisionTreeClassifier()

DT.fit(X_train,y_train)

y_pred_DT=DT.predict(X_test)

accuracy_score_DT=accuracy_score(y_test,y_pred_DT)

accuracy_score_DT

0.7611940298507462

In [36]:
#using gridsearchcv for improving model accuracy

# Define model and parameter grid
model=RandomForestClassifier(random_state=42)
param_grid = {'n_estimators': [25,100,200,300,400,500,600,650,700,800,900,1000,2000,4000],'max_depth': [None, 10,20,30,40,50]}


from sklearn.model_selection import GridSearchCV

#Grid Search Cv
grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best Model
best_model = grid_search.best_estimator_
y_pred=best_model.predict(X_test)

#print result
print('Best Parameters:', grid_search.best_params_)
print('accuracy_score:', accuracy_score(y_test, y_pred))

Best Parameters: {'max_depth': 10, 'n_estimators': 600}
accuracy_score: 0.8059701492537313
