In [1]:
# 1. Import required packages
import pandas as pd

# 2. Load the dataset
# 'titanic.csv' is a relative path, so it is resolved against the current
# working directory of the kernel.
data = pd.read_csv('titanic.csv')
print(data)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [3]:
 # 3.Handle Missing Values
import scipy.linalg 
from sklearn.impute import SimpleImputer
# Numerical columns: replace missing entries with the per-column median.
# NOTE(review): both imputers in this cell are fitted on the full frame, i.e.
# before the train/test split that happens later in the notebook.
numeric_cols = ['Age', 'Fare']
imputer_num = SimpleImputer(strategy='median')
data[numeric_cols] = imputer_num.fit_transform(data[numeric_cols])

# Categorical columns: replace missing entries with the most frequent value.
categorical_cols = ['Embarked']
imputer_cat = SimpleImputer(strategy='most_frequent')
data[categorical_cols] = imputer_cat.fit_transform(data[categorical_cols])
print(data[categorical_cols])

    Embarked
0          S
1          C
2          S
3          S
4          S
..       ...
886        S
887        S
888        S
889        C
890        Q

[891 rows x 1 columns]


In [5]:
# 4. Encode categorical variables
# One-hot encode 'Sex' and 'Embarked'; drop_first=True omits the first level
# of each column so the remaining indicators are not redundant.
data = pd.get_dummies(
    data,
    columns=['Sex', 'Embarked'],
    drop_first=True,
)

# 5. Feature scaling
from sklearn.preprocessing import StandardScaler

# Rescale the two continuous columns with a single fitted scaler.
scaled_cols = ['Age', 'Fare']
scaler = StandardScaler()
data[scaled_cols] = scaler.fit_transform(data[scaled_cols])


In [7]:
# 6. Step 1: Split the data
from sklearn.model_selection import train_test_split

# Columns removed from the feature matrix: the target itself plus four
# columns that are not used as predictors.
non_feature_cols = ['Survived', 'Name', 'Ticket', 'Cabin', 'PassengerId']
X = data.drop(columns=non_feature_cols)
y = data['Survived']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Step 2: Model implementation and evaluation
# Technique 1: Logistic Regression classifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Fit on the training split; fit() returns the estimator, so it can be chained.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Evaluate on the held-out split: hard labels feed the classification report,
# the second probability column feeds ROC-AUC.
y_pred = log_reg.predict(X_test)
positive_proba = log_reg.predict_proba(X_test)[:, 1]

print("Logistic Regression")
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, positive_proba))

Logistic Regression
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179

ROC-AUC: 0.8828828828828829


In [9]:
# Technique 2: Decision Tree classifier

from sklearn.tree import DecisionTreeClassifier

# Train the model.
# random_state is fixed (same seed as the train/test split) so that re-running
# the notebook rebuilds the same tree and reports the same metrics; without it
# the estimator's internal randomness can change the result between runs.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)

# Evaluate the model on the held-out split.
y_pred = tree.predict(X_test)
print("Decision Tree")
print(classification_report(y_test, y_pred))
# ROC-AUC is computed from the second probability column.
print("ROC-AUC:", roc_auc_score(y_test, tree.predict_proba(X_test)[:, 1]))


Decision Tree
              precision    recall  f1-score   support

           0       0.82      0.85      0.83       105
           1       0.77      0.73      0.75        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

ROC-AUC: 0.8160875160875162


In [11]:
# Technique 3: Random Forest classifier
from sklearn.ensemble import RandomForestClassifier

# Train the model.
# random_state is fixed (same seed as the train/test split) so that the
# forest's random sampling is repeatable and the reported metrics can be
# reproduced on a fresh run of the notebook.
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)

# Evaluate the model on the held-out split.
y_pred = forest.predict(X_test)
print("Random Forest")
print(classification_report(y_test, y_pred))
# ROC-AUC is computed from the second probability column.
print("ROC-AUC:", roc_auc_score(y_test, forest.predict_proba(X_test)[:, 1]))

Random Forest
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179

ROC-AUC: 0.8933075933075932


In [15]:
# Hyperparameter tuning for optimization
# Exhaustive grid search (GridSearchCV) over the regularization strength C and
# the solver, scored with 10-fold cross-validation on the training split.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
}

# max_iter=1000 matches the baseline logistic regression fitted earlier in the
# notebook, so candidates are not cut off at the much lower default iteration
# limit. random_state pins the data shuffling done by the 'liblinear' and
# 'saga' solvers, making the selected parameters repeatable.
grid = GridSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_grid,
    cv=10,
)
grid.fit(X_train, y_train)
print("Best parameters for Logistic Regression:", grid.best_params_)

Best parameters for Logistic Regression: {'C': 0.1, 'solver': 'saga'}
