In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [57]:
# Load the Titanic passenger records from a CSV file that is expected to sit
# next to the notebook (relative path).
url = "titanic.csv"
titanic_data = pd.read_csv(url)

# Explore the data: preview the first rows, then summarise columns and dtypes.
print(titanic_data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
titanic_data.info()


   Survived  Pclass                                               Name   
0         0       3                             Mr. Owen Harris Braund  \
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikkinen   
3         1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4         0       3                            Mr. William Henry Allen   

      Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0    male  22.0                        1                        0   7.2500  
1  female  38.0                        1                        0  71.2833  
2  female  26.0                        0                        0   7.9250  
3  female  35.0                        1                        0  53.1000  
4    male  35.0                        0                        0   8.0500  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 8

In [58]:
# Derive a 'Title' column from the first word in 'Name' that ends with a
# period (e.g. "Mr.", "Mrs.", "Miss."). The vectorized .str.extract returns
# NaN for a name without such a token (or a missing name) instead of raising
# AttributeError on `None.group()` as a per-row re.search would.
titanic_data['Title'] = titanic_data['Name'].str.extract(r'\b(\w+\.)', expand=False)


In [59]:
# Columns kept for modelling: the predictors plus the 'Survived' label.
selected_features = ['Pclass', 'Fare', 'Sex', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Title', 'Survived']
# Take an explicit copy of the subset: the frame is modified by later cells,
# and an independent copy avoids pandas' view-vs-copy ambiguity
# (SettingWithCopyWarning) when columns are assigned on it.
titanic_data = titanic_data[selected_features].copy()

In [60]:
# Impute missing ages with the median age. The result is assigned back
# explicitly: calling fillna(..., inplace=True) on a column selection is a
# chained in-place update, which warns in recent pandas and leaves the frame
# unchanged when Copy-on-Write is enabled.
titanic_data = titanic_data.assign(
    Age=titanic_data['Age'].fillna(titanic_data['Age'].median())
)

# Convert categorical variables to numerical indicator columns; drop_first
# removes one level per variable so the remaining indicators are not redundant.
titanic_data = pd.get_dummies(titanic_data, columns=['Sex', 'Title'], drop_first=True)

In [61]:
# Separate the label column from the predictor columns.
y = titanic_data['Survived']
X = titanic_data.drop(columns='Survived')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [62]:
# Build a 100-tree random forest with a fixed seed and train it on the
# training split (fit returns the estimator itself, so the calls chain).
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predict class labels for the held-out rows.
y_pred = model.predict(X_test)

In [63]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.76


In [64]:
# predict_proba returns one column per class (ordered as model.classes_);
# column index 1 is kept as the survival probability.
# NOTE(review): assumes labels are 0/1 so index 1 is "survived" — confirm.
class_probabilities = model.predict_proba(X_test)
survival_probabilities = class_probabilities[:, 1]

# Display survival probabilities along with actual survival status
results = pd.DataFrame(
    {
        'Survived': y_test,
        'Survival Probability': survival_probabilities,
    }
)
print(results.head())

     Survived  Survival Probability
296         1                  0.27
682         0                  0.03
535         0                  0.04
644         1                  0.46
623         0                  0.04
