# Titanic - Evaluation and Interpretation
## Objective
In this section, we evaluate the performance of our trained models on a held-out
validation set using a confusion matrix, a classification report, and ROC curves.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

# Load the preprocessed Titanic training data (path is relative to the notebook's working directory).
train_df = pd.read_csv('train_cleaned.csv')

# Columns used as model inputs.
# NOTE(review): presumably all of these are already numerically encoded by the
# cleaning step (e.g. 'Sex' as 0/1) - confirm against the preprocessing notebook.
features = [
    'Pclass', 'Sex', 'Age', 'Fare', 'FamilySize', 'IsAlone',
    'Embarked_Q', 'Embarked_S', 'Title_Miss', 'Title_Mr', 'Title_Mrs',
]
X = train_df[features]
y = train_df['Survived']

# Fail early with a readable message rather than with an estimator error during fitting.
assert not X.isna().any().any(), 'Feature matrix contains missing values'
assert not y.isna().any(), 'Target column contains missing values'

# Hold out 20% of the rows for validation. stratify=y keeps the class proportions
# of the target the same in both splits, so validation metrics are not skewed by
# an unlucky draw; random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
assert len(X_train) + len(X_val) == len(X), 'Split lost or duplicated rows'
print('Data split into training and validation sets.')

## Model Training
We will train both Logistic Regression and Random Forest models to evaluate their performance.

In [None]:
# Fit both candidate classifiers on the training split.
# max_iter is raised to 1000 for the logistic regression solver;
# random_state pins the forest so repeated runs give the same model.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard class predictions on the validation split.
y_pred_log, y_pred_rf = log_reg.predict(X_val), rf.predict(X_val)

# Probability column at index 1 of predict_proba for each model
# (presumably the Survived == 1 class - confirm via `classes_`).
y_proba_log, y_proba_rf = (
    clf.predict_proba(X_val)[:, 1] for clf in (log_reg, rf)
)

## Evaluation Metrics
We inspect the confusion matrix and the classification report (precision, recall, F1-score, and accuracy) on the validation set. ROC curves are covered in the next section.

In [None]:
# Evaluate every fitted model the same way so the results can be compared side by side.
# Maps a display name to that model's hard predictions on the validation split.
predictions_by_model = {
    'Logistic Regression': y_pred_log,
    'Random Forest': y_pred_rf,
}

conf_matrices = {}
for model_name, y_pred in predictions_by_model.items():
    conf_matrices[model_name] = confusion_matrix(y_val, y_pred)

    fig, ax = plt.subplots()
    sns.heatmap(conf_matrices[model_name], annot=True, fmt='d', cmap='Blues', ax=ax)
    # confusion_matrix places true labels on rows and predicted labels on columns,
    # so label the axes accordingly; without labels the off-diagonal cells are ambiguous.
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    ax.set_title(f'Confusion Matrix - {model_name}')
    plt.show()

    print(f'{model_name} Classification Report:')
    print(classification_report(y_val, y_pred))

# Keep the original variable name available for any later cell that relies on it.
conf_matrix_rf = conf_matrices['Random Forest']

## ROC Curve
We plot the ROC curves of both models to compare their performance across different classification thresholds.

In [None]:
# Area under the ROC curve for each model, computed from the validation probabilities.
auc_log = roc_auc_score(y_val, y_proba_log)
auc_rf = roc_auc_score(y_val, y_proba_rf)

# False/true positive rates over all thresholds (the thresholds themselves are discarded).
fpr_log, tpr_log, _ = roc_curve(y_val, y_proba_log)
fpr_rf, tpr_rf, _ = roc_curve(y_val, y_proba_rf)

fig, ax = plt.subplots()
ax.plot(fpr_log, tpr_log, label='Logistic Regression (AUC = %0.2f)' % auc_log)
ax.plot(fpr_rf, tpr_rf, label='Random Forest (AUC = %0.2f)' % auc_rf)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend()
plt.show()

## Conclusion
- The confusion matrix shows how many false positives and false negatives each model produces.
- Classification reports provide precision, recall, and F1-score.
- ROC curves give an insight into the trade-off between sensitivity and specificity.