In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read the training and test CSV files from the local Datasets folder
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('Datasets/train.csv', 'Datasets/test.csv')
)

# Preview the top of the training frame (head() returns 5 rows by default)
print(train_data.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [2]:
# List every column in the training data.
# Kept under its own name (not `features`) so that re-running this cell can
# never overwrite the list of model input columns chosen later in the notebook.
all_columns = train_data.columns.tolist()
print(all_columns)

# Class balance of the target (0 = did not survive, 1 = survived),
# followed by the number of passengers of each sex
print(train_data['Survived'].value_counts())
print(train_data['Sex'].value_counts())

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
0    549
1    342
Name: Survived, dtype: int64
male      577
female    314
Name: Sex, dtype: int64


In [3]:
# Replace missing values in 'Age' and 'Fare' with the TRAINING-set mean.
# The test set is filled with the training statistics too, so both frames go
# through exactly the same preprocessing and nothing is estimated from the
# test data.
for column in ('Age', 'Fare'):
    train_mean = train_data[column].mean()  # mean() skips NaN by default
    train_data[column] = train_data[column].fillna(train_mean)
    test_data[column] = test_data[column].fillna(train_mean)

# Remaining missing values per column (train first, then test)
print(train_data.isna().sum())
print(test_data.isna().sum())

# Encode categorical variables
sex_mapping = {'male': 0, 'female': 1}
for frame in (train_data, test_data):
    # Only map while the column still holds the raw labels: mapping an
    # already-encoded column would turn every value into NaN, which is what
    # would happen if this cell were simply run a second time.
    if frame['Sex'].isin(list(sex_mapping)).any():
        frame['Sex'] = frame['Sex'].map(sex_mapping)


# Select the model inputs and the target column
features = ['Pclass', 'Sex', 'Age', 'Fare']
target = 'Survived'
X_train = train_data[features]
y_train = train_data[target]


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64


In [4]:
#import train_test_split
from sklearn.model_selection import train_test_split

# Split data into training and validation sets (80% / 20%).
# The split is taken straight from train_data rather than from the
# X_train / y_train names this cell overwrites, so re-running the cell
# reproduces the same split instead of re-splitting (and re-scaling) its own
# previous output.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    train_data[features], train_data[target], test_size=0.2, random_state=42
)

# Scale the features: fit the scaler on the training split only, then apply
# that same fitted transformation to the validation split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)



In [5]:
# Build a 100-tree random forest with a fixed seed and fit it on the scaled
# training split (fit() returns the estimator, so the two steps are chained)
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict on the held-out validation split and compare with the true labels
y_predictions = model.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_predictions)

# Report overall accuracy, then the per-class precision / recall / F1 table
print(f'Accuracy: {accuracy}')
print(classification_report(y_true=y_val, y_pred=y_predictions))


Accuracy: 0.7877094972067039
              precision    recall  f1-score   support

           0       0.80      0.85      0.82       105
           1       0.76      0.70      0.73        74

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179



In [6]:
# Run the test file through the scaler that was fitted on the training split,
# using the same feature columns, then predict survival for each test passenger
X_test = scaler.transform(test_data.loc[:, features])
test_predictions = model.predict(X_test)

# To Summarize My Model's Performance Above

I trained a Random Forest classifier, and it achieved an overall accuracy of just under 80% when evaluated on a held-out validation set (20% of the training data). This means the model correctly predicts the survival status of Titanic passengers most of the time.
Let's look at some notable metrics:
- **Accuracy:** 78.77%. Accuracy alone, however, may not give a complete picture of the model's effectiveness.

- **Precision and Recall:** It's essential to consider precision, recall, and F1-score to assess the model's performance more comprehensively. In this context:
  - For passengers who did not survive (0), the model achieved a precision of 0.80, meaning that 80% of the predictions for this class were correct. The recall for (0) was 0.85, indicating that the model correctly identified 85% of passengers who did not survive.
  - For passengers who survived (1), the model achieved a precision of 0.76, indicating that 76% of the predictions for this class were correct. The recall for (1) was 0.70, meaning that the model correctly identified 70% of passengers who survived.

Overall, the model's performance is reasonably balanced across both classes for a near-baseline model with no hyperparameter tuning.