## Model selection and baseline model

### 
We need to choose appropriate models for this multi-class classification task. We start with a simple baseline model (e.g., Logistic Regression, Naive Bayes) to establish a performance benchmark.


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Separate the target column from the predictors; 'index' is dropped
# together with 'role' so that it is not fed to the model as a feature.
y = df_processed['role']
X = df_processed.drop(columns=['role', 'index'])

# Hold out 20% of the rows for testing (seeded so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline model: logistic regression with the iteration cap raised to 1000.
logistic_regression_model = LogisticRegression(max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

# Predict the held-out rows and score them.
y_pred = logistic_regression_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report the baseline metrics, one per line.
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision (weighted): {precision:.4f}",
    f"Recall (weighted): {recall:.4f}",
    f"F1-score (weighted): {f1:.4f}",
    sep="\n",
)

## Improved model experimentation

### 
Experiment with more advanced models (e.g., RandomForest, Gradient Boosting, or potentially neural networks for the text data) to try and improve upon the baseline.


Initialize, train, and evaluate a RandomForestClassifier model to compare its performance against the baseline Logistic Regression model.



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded random forest on the same training split as the baseline.
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)

# Predict the held-out rows and compute the same four metrics as the baseline.
y_pred_rf = random_forest_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf, recall_rf, f1_rf = (
    metric(y_test, y_pred_rf, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Random forest results first ...
print(
    "Random Forest Model Performance:",
    f"Accuracy: {accuracy_rf:.4f}",
    f"Precision (weighted): {precision_rf:.4f}",
    f"Recall (weighted): {recall_rf:.4f}",
    f"F1-score (weighted): {f1_rf:.4f}",
    sep="\n",
)

# ... followed by the baseline numbers computed in the logistic regression
# cell, so the two models can be compared side by side.
print(
    "\nBaseline Logistic Regression Performance:",
    f"Accuracy: {accuracy:.4f}",
    f"Precision (weighted): {precision:.4f}",
    f"Recall (weighted): {recall:.4f}",
    f"F1-score (weighted): {f1:.4f}",
    sep="\n",
)

Initialize, train, and evaluate an XGBoost model, then print its performance metrics alongside the Random Forest metrics for comparison.



In [None]:
from sklearn.preprocessing import LabelEncoder
# XGBClassifier is first used in this cell, so it must be imported here for
# the notebook to run top-to-bottom.
from xgboost import XGBClassifier

# Encode the target into integer class ids before fitting XGBoost.
# The encoder is fitted on the training labels only and then reused for
# the test labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Initialize an XGBClassifier model (seeded, multi-class log-loss metric)
xgboost_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Train the model with encoded labels
xgboost_model.fit(X_train, y_train_encoded)

# Predictions come back as encoded class ids ...
y_pred_xgb_encoded = xgboost_model.predict(X_test)

# ... so decode them to the original labels; this lets the metrics below be
# computed against y_test exactly as for the other models.
y_pred_xgb = label_encoder.inverse_transform(y_pred_xgb_encoded)

# Evaluate the model using original labels
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb, average='weighted')
recall_xgb = recall_score(y_test, y_pred_xgb, average='weighted')
f1_xgb = f1_score(y_test, y_pred_xgb, average='weighted')

# Print evaluation metrics
print("XGBoost Model Performance:")
print(f"Accuracy: {accuracy_xgb:.4f}")
print(f"Precision (weighted): {precision_xgb:.4f}")
print(f"Recall (weighted): {recall_xgb:.4f}")
print(f"F1-score (weighted): {f1_xgb:.4f}")

# Repeat the Random Forest metrics from the previous cell for comparison
print("\nRandom Forest Model Performance:")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"Precision (weighted): {precision_rf:.4f}")
print(f"Recall (weighted): {recall_rf:.4f}")
print(f"F1-score (weighted): {f1_rf:.4f}")

## Hyperparameter Tuning 

### 
We first split the training data, define parameter grids for RandomForest and XGBoost, perform GridSearchCV for both models, print the best parameters, and evaluate the best models on the validation set.



In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

# Tuning relies on the y_train produced by the earlier train/test split cell;
# if it is missing, only an error message is printed.
if 'y_train' in globals():
    # 1. Carve a validation set (20%) out of the training data.
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train,
        y_train,
        test_size=0.2,
        random_state=42,
    )

    # 2. Candidate settings for the RandomForestClassifier.
    param_grid_rf = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
    }

    # 3. Exhaustive 5-fold search, ranked by weighted F1.
    grid_search_rf = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid_rf,
        cv=5,
        scoring='f1_weighted',
    )
    grid_search_rf.fit(X_train_split, y_train_split)

    # 4. Report the winning RandomForest configuration.
    print(
        "Best parameters for RandomForestClassifier:",
        grid_search_rf.best_params_,
        sep="\n",
    )

    # 5. Score the best RandomForest on the validation set.
    best_rf_model = grid_search_rf.best_estimator_
    y_pred_rf_val = best_rf_model.predict(X_val)
    f1_val_rf = f1_score(y_val, y_pred_rf_val, average='weighted')
    print(f"F1-weighted score on validation set (RandomForest): {f1_val_rf:.4f}")

    # 6. XGBoost is fitted on encoded labels: fit the encoder on the reduced
    #    training labels and reuse it for the validation labels.
    label_encoder_xgb = LabelEncoder()
    y_train_encoded_split = label_encoder_xgb.fit_transform(y_train_split)
    y_val_encoded = label_encoder_xgb.transform(y_val)

    # Candidate settings for the XGBClassifier.
    param_grid_xgb = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5],
        'learning_rate': [0.01, 0.1],
    }

    # 7. Same search protocol as for the RandomForest.
    grid_search_xgb = GridSearchCV(
        XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'),
        param_grid_xgb,
        cv=5,
        scoring='f1_weighted',
    )
    grid_search_xgb.fit(X_train_split, y_train_encoded_split)

    # 8. Report the winning XGBoost configuration.
    print(
        "\nBest parameters for XGBClassifier:",
        grid_search_xgb.best_params_,
        sep="\n",
    )

    # 9. Predict the validation rows (encoded class ids) with the best model.
    best_xgb_model = grid_search_xgb.best_estimator_
    y_pred_xgb_encoded_val = best_xgb_model.predict(X_val)

    # 10. Map the encoded predictions back to the original labels.
    y_pred_xgb_val = label_encoder_xgb.inverse_transform(y_pred_xgb_encoded_val)

    # 11. Score against the original (un-encoded) validation labels.
    f1_val_xgb = f1_score(y_val, y_pred_xgb_val, average='weighted')
    print(f"F1-weighted score on validation set (XGBoost): {f1_val_xgb:.4f}")
else:
    print("Error: y_train not found. Please ensure previous steps were executed.")



## Model evaluation

### 
Evaluate the final chosen model using appropriate metrics for multi-class classification (e.g., accuracy, precision, recall, F1-score) on a separate test set.


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Pick the tuned model with the higher validation F1; XGBoost is chosen only
# when it is strictly better, otherwise RandomForest is kept.
if not (f1_val_xgb > f1_val_rf):
    final_model = best_rf_model
    print("Selected RandomForest as the final model based on validation F1 score.")
    y_pred_final = final_model.predict(X_test)
else:
    final_model = best_xgb_model
    print("Selected XGBoost as the final model based on validation F1 score.")
    # Encoded copy of the test labels, produced with the encoder fitted in
    # the tuning cell.
    y_test_encoded_final = label_encoder_xgb.transform(y_test)
    # XGBoost predicts encoded class ids; decode them so they can be compared
    # with the original y_test labels below.
    y_pred_final = label_encoder_xgb.inverse_transform(final_model.predict(X_test))


# Score the chosen model on the held-out test set.
accuracy_test = accuracy_score(y_test, y_pred_final)
precision_test, recall_test, f1_test = (
    metric(y_test, y_pred_final, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report the final test-set metrics.
print(
    "\nFinal Model Performance on Test Set:",
    f"Accuracy: {accuracy_test:.4f}",
    f"Precision (weighted): {precision_test:.4f}",
    f"Recall (weighted): {recall_test:.4f}",
    f"F1-score (weighted): {f1_test:.4f}",
    sep="\n",
)

Evaluate the selected improved model (RandomForestClassifier) with 5-fold cross-validation on the training data to get a more robust performance estimate and to decide whether further hyperparameter tuning is necessary.



In [None]:
from sklearn.model_selection import cross_val_score

# Reuse the random_forest_model instance created in the earlier cell and
# score it with 5-fold cross-validation on the training data (weighted F1).
cv_scores = cross_val_score(
    random_forest_model,
    X_train,
    y_train,
    cv=5,
    scoring='f1_weighted',
)

# Per-fold scores, then their mean and spread.
print("Cross-validation F1-weighted scores:", cv_scores)
print(
    f"Mean CV F1-weighted score: {cv_scores.mean():.4f}",
    f"Standard deviation of CV F1-weighted scores: {cv_scores.std():.4f}",
    sep="\n",
)

Evaluate the trained Random Forest model on the test set using accuracy, weighted precision, weighted recall, and weighted F1-score.



In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict the held-out test rows with the already-fitted random_forest_model.
y_pred_rf_test = random_forest_model.predict(X_test)

# Accuracy plus the three weighted metrics.
# NOTE: these names are the same ones the final-model cell assigns, so
# running this cell replaces those values.
accuracy_test = accuracy_score(y_test, y_pred_rf_test)
precision_test, recall_test, f1_test = (
    metric(y_test, y_pred_rf_test, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report the test-set metrics, one per line.
print(
    "Random Forest Model Performance on Test Set:",
    f"Accuracy: {accuracy_test:.4f}",
    f"Precision (weighted): {precision_test:.4f}",
    f"Recall (weighted): {recall_test:.4f}",
    f"F1-score (weighted): {f1_test:.4f}",
    sep="\n",
)

## Analysis of failure modes and lessons learned

### 
Analyze where the model performs poorly and try to understand the reasons. Document lessons learned throughout the process.



Generate a classification report and a confusion matrix to understand the model's performance on individual classes and to identify patterns in misclassifications.



In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Per-class precision/recall/F1 for the random forest's test predictions.
report = classification_report(y_test, y_pred_rf_test)
print("Classification Report:", report, sep="\n")

# Confusion matrix with rows/columns ordered by the model's own class list,
# so the tick labels on the heatmap line up with the matrix cells.
cm = confusion_matrix(
    y_test,
    y_pred_rf_test,
    labels=random_forest_model.classes_,
)

# Draw the matrix as an annotated heatmap; heatmap() returns the Axes it
# drew on, which is used to set the axis labels and the title.
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=random_forest_model.classes_,
    yticklabels=random_forest_model.classes_,
).set(
    xlabel='Predicted Role',
    ylabel='True Role',
    title='Confusion Matrix',
)
plt.show()