In [25]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Load the dataset.
df = pd.read_csv('/content/archive (3).zip')

# Drop the columns that are not used as features.
df_clean = df.drop(columns=['Cabin', 'Name', 'Ticket'])

# Encode 'Sex' (male=1, female=0).
df_clean['Sex'] = df_clean['Sex'].map({'male': 1, 'female': 0})

# One-hot encode 'Embarked'.
df_clean = pd.get_dummies(df_clean, columns=['Embarked'])

# Define the feature matrix and the target variable.
X = df_clean.drop(columns=['PassengerId', 'Survived'])
y = df_clean['Survived']

# Split BEFORE fitting any preprocessing step, so that statistics learned by
# the imputer and scaler come from the training rows only. Fitting them on the
# full dataset would leak information about the test rows into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below do not write into
# (or warn about writing into) slices of X.
X_train = X_train.copy()
X_test = X_test.copy()

numeric_cols = ['Age', 'Fare']

# Fill missing 'Age' and 'Fare' values with the training-set medians.
imputer = SimpleImputer(strategy='median')
X_train[numeric_cols] = imputer.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = imputer.transform(X_test[numeric_cols])

# Normalize the numerical values using the training-set min/max. Test values
# can therefore fall slightly outside [0, 1]; that is expected.
scaler = MinMaxScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Train the random-forest classifier. The target is scored below with
# precision/recall/F1 in their default binary mode, i.e. it is a class label,
# so a classifier is the right estimator: it predicts labels directly instead
# of fractional regression scores that have to be rounded into classes.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict class labels for the held-out test set.
y_pred = rf_classifier.predict(X_test)

# Evaluate the model once and keep each metric so it can be reused.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# NOTE(review): the recorded run of this cell reports 1.0 for every metric.
# A perfect score usually means the target is trivially derivable from one of
# the features (or otherwise leaked) -- verify the dataset's labels before
# trusting these numbers.
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
