In [62]:
# Data Manipulation Libraries
import pandas as pd
import numpy as np

# Data Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt

# warnings library
import warnings
warnings.filterwarnings('ignore')

# logging library
import logging
logging.basicConfig(level=logging.INFO,
                    format='[%(asctime)s]: %(message)s:',
                    filemode='w',
                    filename='app.log',
                    force=True)

In [63]:
# Load the dataset
url = "https://raw.githubusercontent.com/Digraskarpratik/EnE_TitanicModelBuilding/refs/heads/main/research/titanic_train.csv"

df = pd.read_csv(url)
df.sample(frac=1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
235,236,0,3,"Harknett, Miss. Alice Phoebe",female,,0,0,W./C. 6609,7.5500,,S
677,678,1,3,"Turja, Miss. Anna Sofia",female,18.0,0,0,4138,9.8417,,S
774,775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54.0,1,3,29105,23.0000,,S
726,727,1,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30.0,3,0,31027,21.0000,,S
658,659,0,2,"Eitemiller, Mr. George Floyd",male,23.0,0,0,29751,13.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
867,868,0,1,"Roebling, Mr. Washington Augustus II",male,31.0,0,0,PC 17590,50.4958,A24,S
530,531,1,2,"Quick, Miss. Phyllis May",female,2.0,1,1,26360,26.0000,,S
104,105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37.0,2,0,3101276,7.9250,,S
578,579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C


In [64]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [65]:
df.drop(["Cabin"], axis=1, inplace=True)
df.drop(["Name"], axis=1, inplace=True)
df.drop(["Ticket"], axis=1, inplace=True)
df.drop(["Age"], axis=1, inplace=True)
df.drop(["PassengerId"], axis=1, inplace=True)
df.drop(["SibSp"], axis=1, inplace=True)
df.drop(["Parch"], axis=1, inplace=True)
df.drop(["Embarked"], axis=1, inplace=True)

In [66]:
df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Fare        0
dtype: int64

In [68]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Fare
0,0,3,male,7.25
1,1,1,female,71.2833
2,1,3,female,7.925
3,1,1,female,53.1
4,0,3,male,8.05


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Fare      891 non-null    float64
dtypes: float64(1), int64(2), object(1)
memory usage: 28.0+ KB


In [70]:
Numerical = df.select_dtypes(exclude= "object")
Character = df.select_dtypes(include= "object")

In [71]:
Numerical

Unnamed: 0,Survived,Pclass,Fare
0,0,3,7.2500
1,1,1,71.2833
2,1,3,7.9250
3,1,1,53.1000
4,0,3,8.0500
...,...,...,...
886,0,2,13.0000
887,1,1,30.0000
888,0,3,23.4500
889,1,1,30.0000


In [72]:
Numerical.corr()

Unnamed: 0,Survived,Pclass,Fare
Survived,1.0,-0.338481,0.257307
Pclass,-0.338481,1.0,-0.5495
Fare,0.257307,-0.5495,1.0


In [73]:
Character

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male
...,...
886,male
887,female
888,female
889,male


In [74]:
Character["Sex"].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [75]:
Character

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male
...,...
886,male
887,female
888,female
889,male


In [76]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["Sex"] = le.fit_transform(df["Sex"])

In [77]:
Numerical.corr()

Unnamed: 0,Survived,Pclass,Fare
Survived,1.0,-0.338481,0.257307
Pclass,-0.338481,1.0,-0.5495
Fare,0.257307,-0.5495,1.0


In [78]:
from collections import OrderedDict

stats = []

# Define the order of features for correlation matrix
for i in Numerical.columns:
    Numerical_stats = OrderedDict([
    ("feature", i),
    ("Mean", Numerical[i].mean()),
    ("Median", Numerical[i].median()),
    ("Mode", Numerical[i].mode()[0]),
    ("Variance", Numerical[i].var()),
    ("Skewness", Numerical[i].skew()),
    ("Kurtosis", Numerical[i].kurt()),
    ("IOR", Numerical[i].quantile(0.75) - Numerical[i].quantile(0.25))
    ])
    
    stats.append(Numerical_stats)

df_stats = pd.DataFrame(stats)

df_stats.style.background_gradient(subset=["Mean", "Median", "Mode", "Variance", "Skewness", "Kurtosis", "IOR"], cmap="coolwarm")


Unnamed: 0,feature,Mean,Median,Mode,Variance,Skewness,Kurtosis,IOR
0,Survived,0.383838,0.0,0.0,0.236772,0.478523,-1.775005,1.0
1,Pclass,2.308642,3.0,3.0,0.699015,-0.630548,-1.280015,1.0
2,Fare,32.204208,14.4542,8.05,2469.436846,4.787317,33.398141,23.0896


In [79]:
X = df.drop("Survived", axis=1)
y = df["Survived"]

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
Scaler = MinMaxScaler()
X_train = Scaler.fit_transform(X_train)
X_test = Scaler.transform(X_test)


In [80]:
from imblearn.over_sampling import SMOTE
sm =  SMOTE(random_state=42)
X_train,y_train= sm.fit_resample(X_train, y_train)

In [81]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

DC = DecisionTreeClassifier()

DC.fit(X_train, y_train)

y_pred_DC = DC.predict(X_test)

accuracy_Score_DC = accuracy_score(y_test, y_pred_DC)

print(f"Accuracy Score for Decision Tree Classifier: {round(accuracy_Score_DC*100)}%")


Accuracy Score for Decision Tree Classifier: 83%


In [82]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

RC = RandomForestClassifier()

RC.fit(X_train, y_train)

y_pred_RC = RC.predict(X_test)

accuracy_score_RC = accuracy_score(y_test, y_pred_RC)

print(f"Accuracy Score for Random Forest Classifier: {round(accuracy_score_RC*100)}%")

Accuracy Score for Random Forest Classifier: 81%


In [86]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

AD = AdaBoostClassifier()

AD.fit(X_train, y_train)

y_pred_AD = AD.predict(X_test)

accuracy_score_AD = accuracy_score(y_test, y_pred_AD)

print(f"Accuracy Score for AdaBoostClassifier: {round(accuracy_score_AD*100)}%")

Accuracy Score for AdaBoostClassifier: 78%


In [84]:
# Grid Search CV for Improving Model Accuracy

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

#Define the parameter grid

GSC =GridSearchCV(estimator= RandomForestClassifier(random_state=42), 
             param_grid={"n_estimators": [100, 150, 200, 300, 400, 500, 600, 700],
             "max_depth": [None,100,1000,2000,3000,4000,5000,6000,7000,8000,9000,10000]}, cv= 5, verbose=2, n_jobs=-1)

# define Grid Search

GSC.fit(X_train, y_train)

gs = GSC.best_estimator_

y_pred_GS = gs.predict(X_test)

# print Results
print(f"Best Parameters: {GSC.best_params_}")
print(f"Accuracy Score for Grid SearchCV: {round(accuracy_score(y_test, y_pred_GS)*100)}%")


Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best Parameters: {'max_depth': None, 'n_estimators': 200}
Accuracy Score for Grid SearchCV: 80%


In [None]:
# Grid Search CV for Improving Model Accuracy

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
#Define the parameter grid

GSC =GridSearchCV(estimator= DecisionTreeClassifier(criterion = "gini", splitter= "best", max_depth=[])
    
)

# define Grid Search

GSC.fit(X_train, y_train)

gs = GSC.best_estimator_

y_pred_GS = gs.predict(X_test)

# print Results
print(f"Best Parameters: {GSC.best_params_}")
print(f"Accuracy Score for Grid SearchCV: {round(accuracy_score(y_test, y_pred_GS)*100)}%")


TypeError: GridSearchCV.__init__() got an unexpected keyword argument 'criterion'

In [85]:
df

Unnamed: 0,Survived,Pclass,Sex,Fare
0,0,3,1,7.2500
1,1,1,0,71.2833
2,1,3,0,7.9250
3,1,1,0,53.1000
4,0,3,1,8.0500
...,...,...,...,...
886,0,2,1,13.0000
887,1,1,0,30.0000
888,0,3,0,23.4500
889,1,1,1,30.0000
