In [2]:
!pip install optuna
import pandas as pd
import time
import numpy as np
import os
import optuna
import logging
import traceback
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from google.colab import drive

# Logging: INFO-level records with timestamp, logger name and level, emitted
# through a single stream handler.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Shared module state. run_rf_optimization() fills these in; objective() and
# train_final_rf_model() read them through `global` declarations.
absolute_path = None
X_train = y_train = None
X_val = y_val = None
X_test = y_test = None

# Function to create and evaluate a Random Forest model with given hyperparameters
def create_and_evaluate_model(params, X_train, y_train, X_val, y_val):
    """Fit a TF-IDF + Random Forest pipeline and score it on the validation split.

    Args:
        params: Mapping that must contain ``max_features``, ``ngram_range`` and
            ``stop_words`` (vectorizer settings) plus ``n_estimators``,
            ``max_depth``, ``min_samples_split``, ``min_samples_leaf``,
            ``rf_max_features``, ``bootstrap`` and ``class_weight`` (forest
            settings).
        X_train, y_train: Training texts and their labels.
        X_val, y_val: Validation texts and their labels.

    Returns:
        ``(pipeline, validation_accuracy, None)`` on success. If anything
        raises, the error and traceback are logged and ``(None, 0.0, None)``
        is returned instead. The third element is always None; it is a
        placeholder with no Random Forest equivalent of a loss value.
    """
    try:
        vectorizer = TfidfVectorizer(**{
            'max_features': params['max_features'],
            'ngram_range': params['ngram_range'],
            'stop_words': params['stop_words'],
        })

        forest = RandomForestClassifier(**{
            'n_estimators': params['n_estimators'],
            'max_depth': params['max_depth'],
            'min_samples_split': params['min_samples_split'],
            'min_samples_leaf': params['min_samples_leaf'],
            # The forest's own max_features is stored under a distinct key so
            # it does not clash with the vectorizer's max_features.
            'max_features': params['rf_max_features'],
            'bootstrap': params['bootstrap'],
            'class_weight': params['class_weight'],
            'random_state': 42,
            'n_jobs': -1,  # use all cores
        })

        model = Pipeline([("tfidf", vectorizer), ("rf", forest)])
        model.fit(X_train, y_train)

        val_accuracy = accuracy_score(y_val, model.predict(X_val))
    except Exception as e:
        logger.error(f"Error in model creation/evaluation: {str(e)}")
        logger.error(traceback.format_exc())
        return None, 0.0, None

    return model, val_accuracy, None

# Objective function for Optuna
def objective(trial):
    """Optuna objective: sample one configuration, fit it, return validation accuracy.

    Reads the module-level train/validation splits and ``absolute_path``, which
    run_rf_optimization() populates before the study starts. Every trial is
    appended to ``OptimizationResults/rf_all_trials.txt`` under
    ``absolute_path``; a trial whose model could not be built is recorded and
    returned with an accuracy of 0.0.
    """
    global X_train, y_train, X_val, y_val, absolute_path

    # Search space (the order of the suggest_* calls is kept as-is).
    params = {
        # --- TF-IDF vectorizer ---
        'max_features': trial.suggest_categorical('max_features', [5000, 10000, 15000, None]),
        'ngram_range': trial.suggest_categorical('ngram_range', [(1, 1), (1, 2), (1, 3)]),
        'stop_words': trial.suggest_categorical('stop_words', [None, 'english']),

        # --- Random Forest ---
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 10, 100, step=10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'rf_max_features': trial.suggest_categorical('rf_max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced', 'balanced_subsample']),
    }

    logger.info(f"Trial {trial.number} hyperparameters: {params}")

    fitted_model, val_accuracy, _ = create_and_evaluate_model(params, X_train, y_train, X_val, y_val)

    # Append this trial's record to the running log of all trials.
    results_dir = os.path.join(absolute_path, 'OptimizationResults')
    os.makedirs(results_dir, exist_ok=True)

    record = [
        f"\n----- Trial {trial.number} -----\n",
        f"Validation Accuracy: {val_accuracy:.4f}\n",
    ]
    record.extend(f"{name}: {setting}\n" for name, setting in params.items())
    record.append("--------------------------\n")

    with open(os.path.join(results_dir, 'rf_all_trials.txt'), 'a') as trial_log:
        trial_log.writelines(record)

    # The study maximizes this value; failed trials get the worst score.
    return 0.0 if fitted_model is None else val_accuracy

# Main function to run optimization
def run_rf_optimization(n_trials=100, timeout=None):
    """Mount Drive, load the datasets and run the Optuna hyperparameter search.

    Side effects:
        * Sets the module-level ``absolute_path`` and the train/validation/test
          globals.
        * (Re)creates ``OptimizationResults/rf_all_trials.txt`` and appends a
          summary to it once the search ends.
        * When at least one trial completed, writes
          ``OptimizationResults/rf_best_model_results.txt``.

    Args:
        n_trials: Maximum number of trials, forwarded to ``study.optimize``.
        timeout: Time budget forwarded to ``study.optimize`` (None = no limit).

    Returns:
        ``(best_params, study)`` when at least one trial completed, otherwise
        ``(None, None)``.

    Raises:
        Exception: Whatever ``drive.mount`` raised, re-raised after logging.
            Errors from reading the CSV files also propagate. Errors raised
            while the study is running are logged and swallowed so that the
            partial results can still be summarised.
    """
    global X_train, y_train, X_val, y_val, X_test, y_test, absolute_path

    start_time = time.time()
    logger.info(f"Starting Random Forest optimization with {n_trials} trials, timeout={timeout}")

    # Mount Google Drive
    try:
        drive.mount('/content/gdrive', force_remount=True)
        logger.info("Google Drive mounted successfully")
    except Exception as e:
        logger.error(f"Error mounting Google Drive: {str(e)}")
        raise

    # Set paths
    absolute_path = "/content/gdrive/My Drive/Projects/Financial-Sentiment/"
    dataset_path = absolute_path + "Datasets/"
    results_dir = os.path.join(absolute_path, 'OptimizationResults')
    os.makedirs(results_dir, exist_ok=True)

    # Start the all-trials file afresh with a header ('w' truncates old runs)
    all_trials_path = os.path.join(results_dir, 'rf_all_trials.txt')
    with open(all_trials_path, 'w') as f:
        f.write("========== RANDOM FOREST HYPERPARAMETER OPTIMIZATION RESULTS ==========\n")
        f.write(f"Started: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Number of trials: {n_trials}\n")
        f.write("=============================================================\n")

    # Load datasets
    logger.info("Loading datasets")
    train_df = pd.read_csv(dataset_path + "train_set.csv")
    val_df = pd.read_csv(dataset_path + "validation_set.csv")
    test_df = pd.read_csv(dataset_path + "test_set.csv")

    # Extract features and labels
    X_train, y_train = train_df["Sentence"], train_df["SentimentNumerical"]
    X_val, y_val = val_df["Sentence"], val_df["SentimentNumerical"]
    X_test, y_test = test_df["Sentence"], test_df["SentimentNumerical"]

    logger.info(f"Train set size: {len(X_train)}")
    logger.info(f"Validation set size: {len(X_val)}")
    logger.info(f"Test set size: {len(X_test)}")

    # Create an Optuna study.
    # NOTE(review): objective() never calls trial.report()/should_prune(), so
    # this pruner currently has nothing to act on. Kept for parity with the
    # original configuration.
    study = optuna.create_study(
        direction='maximize',  # Maximize accuracy
        pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=0)
    )

    # Run the optimization; an interrupt or error still falls through to the
    # summary below so partial results are not lost.
    try:
        study.optimize(objective, n_trials=n_trials, timeout=timeout)
    except KeyboardInterrupt:
        logger.info("Optimization stopped by user")
    except Exception as e:
        logger.error(f"Error during optimization: {str(e)}")
        logger.error(traceback.format_exc())

    completed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

    # Log summary to the all trials file
    with open(all_trials_path, 'a') as f:
        f.write("\n========== OPTIMIZATION SUMMARY ==========\n")
        f.write(f"Completed: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Total optimization time: {(time.time() - start_time) / 60:.2f} minutes\n")
        f.write(f"Number of trials: {len(study.trials)}\n")
        f.write(f"Number of completed trials: {len(completed_trials)}\n")

    if not completed_trials:
        logger.warning("No trials completed successfully")
        return None, None

    best_trial = study.best_trial
    logger.info("Best trial:")
    logger.info(f"  Value (validation accuracy): {best_trial.value:.4f}")
    logger.info("  Params:")
    for key, value in best_trial.params.items():
        logger.info(f"    {key}: {value}")

    # Save the best model results to a detailed file
    best_model_path = os.path.join(results_dir, 'rf_best_model_results.txt')
    with open(best_model_path, 'w') as f:
        f.write("========== RANDOM FOREST BEST MODEL RESULTS ==========\n")
        f.write(f"Date: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write(f"Best Trial Number: {best_trial.number}\n")
        f.write(f"Best Validation Accuracy: {best_trial.value:.4f}\n\n")
        f.write("Hyperparameters:\n")
        f.write("--------------\n")
        for key, value in best_trial.params.items():
            f.write(f"{key}: {value}\n")
        f.write("\n")

        # Add some statistics and analysis
        f.write("Performance Analysis:\n")
        f.write("--------------\n")
        f.write(f"Total trials completed: {len(completed_trials)}\n")
        f.write(f"Optimization time: {(time.time() - start_time) / 60:.2f} minutes\n")

        # Compare to worst trial. objective() scores a failed fit as 0.0 and
        # such trials still count as COMPLETE, so the baseline may be zero;
        # the relative figure is therefore formatted by a zero-safe helper.
        worst_trial = min(completed_trials, key=lambda t: t.value)
        f.write(f"Worst trial accuracy: {worst_trial.value:.4f} (Trial {worst_trial.number})\n")
        f.write(
            f"Improvement over worst: {(best_trial.value - worst_trial.value):.4f} "
            f"({_format_relative_gain(best_trial.value, worst_trial.value)})\n"
        )

        # Calculate average performance
        avg_accuracy = sum(t.value for t in completed_trials) / len(completed_trials)
        f.write(f"Average trial accuracy: {avg_accuracy:.4f}\n")
        f.write(
            f"Improvement over average: {(best_trial.value - avg_accuracy):.4f} "
            f"({_format_relative_gain(best_trial.value, avg_accuracy)})\n\n"
        )

        # Add parameter importance if available (best effort: a failure is
        # recorded in the report instead of aborting it)
        try:
            importance = optuna.importance.get_param_importances(study)
            f.write("Parameter Importance:\n")
            f.write("--------------\n")
            for param, score in importance.items():
                f.write(f"{param}: {score:.4f}\n")
        except Exception as e:
            f.write(f"Could not calculate parameter importance: {str(e)}\n")

        f.write("\n========== END OF REPORT ==========\n")

    logger.info(f"Best model results saved to {best_model_path}")

    # Return the best parameters for final model training
    return best_trial.params, study


def _format_relative_gain(best_value, baseline):
    """Format how much ``best_value`` exceeds ``baseline`` as e.g. ``"12.34%"``.

    Returns ``"n/a"`` when ``baseline`` is zero, where the ratio is undefined.
    """
    if baseline == 0:
        return "n/a"
    return f"{(best_value / baseline - 1) * 100:.2f}%"

# Function to train and evaluate the final model with the best hyperparameters
def train_final_rf_model(best_params):
    """Refit the pipeline with the best hyperparameters and report val/test results.

    Uses the module-level data splits and ``absolute_path`` (set by
    run_rf_optimization). Steps, in order:

    1. Fit via create_and_evaluate_model and append a validation report to
       ``OptimizationResults/rf_best_model_results.txt``.
    2. Predict the test sentences one at a time to measure per-sample latency,
       and save predictions and timings to
       ``Datasets/test_set_with_rf_predictions.csv``.
    3. Append test accuracy and a classification report to the results file.
    4. Save the 50 most important TF-IDF features to
       ``OptimizationResults/rf_feature_importance.csv`` (best effort).
    5. Persist the fitted pipeline with joblib under ``TrainedModels/``
       (best effort).

    Args:
        best_params: Hyperparameter mapping in the format expected by
            create_and_evaluate_model (e.g. ``best_trial.params``).

    Returns:
        None. If the model cannot be trained, an error is logged and the
        function returns early without writing anything.
    """
    global X_train, y_train, X_val, y_val, X_test, y_test, absolute_path

    logger.info("Training final Random Forest model with best hyperparameters")

    # Create and train the final model
    final_model, accuracy, _ = create_and_evaluate_model(
        best_params, X_train, y_train, X_val, y_val
    )

    if final_model is None:
        logger.error("Failed to train final model")
        return

    # Evaluate on validation set
    y_val_pred = final_model.predict(X_val)
    val_report = classification_report(y_val, y_val_pred)

    logger.info(f"Final model validation accuracy: {accuracy:.4f}")
    logger.info(f"Validation Classification Report:\n{val_report}")

    # Save validation report to the best model results file
    best_model_path = os.path.join(absolute_path, 'OptimizationResults', 'rf_best_model_results.txt')
    with open(best_model_path, 'a') as f:
        f.write("\n========== FINAL MODEL EVALUATION ==========\n")
        f.write(f"Validation Accuracy: {accuracy:.4f}\n")
        f.write("Classification Report:\n")
        f.write(val_report)
        f.write("\n")

    # Measure prediction time for each sample in test set.
    # perf_counter() is a monotonic, high-resolution clock meant for timing
    # short intervals; time.time() is a wall clock that can be coarse or be
    # adjusted mid-measurement.
    logger.info("Evaluating prediction time on test set")
    rf_predictions = []
    prediction_times = []

    for sentence in X_test:
        started = time.perf_counter()
        prediction = final_model.predict([sentence])[0]
        elapsed_time = time.perf_counter() - started
        rf_predictions.append(prediction)
        prediction_times.append(elapsed_time)

    # Calculate average prediction time
    avg_prediction_time = sum(prediction_times) / len(prediction_times)
    logger.info(f"Average prediction time per sample: {avg_prediction_time:.6f} seconds")

    # Store predictions and time taken in a fresh copy of the test DataFrame
    # (re-read from disk so every original column is kept)
    dataset_path = absolute_path + "Datasets/"
    test_df_copy = pd.read_csv(dataset_path + "test_set.csv")
    test_df_copy["rf_predictions"] = rf_predictions
    test_df_copy["time_rf"] = prediction_times

    # Save the updated test set with predictions and time
    output_path = dataset_path + "test_set_with_rf_predictions.csv"
    test_df_copy.to_csv(output_path, index=False)
    logger.info(f"Test set with predictions saved to {output_path}")

    # Evaluate on test set
    y_test_pred = final_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_report = classification_report(y_test, y_test_pred)

    logger.info(f"Test set accuracy: {test_accuracy:.4f}")
    logger.info(f"Test Classification Report:\n{test_report}")

    # Add test evaluation to the results file
    with open(best_model_path, 'a') as f:
        f.write("\n========== TEST SET EVALUATION ==========\n")
        f.write(f"Test Accuracy: {test_accuracy:.4f}\n")
        f.write("Classification Report:\n")
        f.write(test_report)
        f.write(f"\nAverage prediction time: {avg_prediction_time:.6f} seconds\n")
        f.write("========== END OF FINAL EVALUATION ==========\n")

    # Feature importance analysis (best effort: failures are logged only)
    try:
        rf_model = final_model.named_steps['rf']
        feature_names = final_model.named_steps['tfidf'].get_feature_names_out()

        # Pair each TF-IDF feature with its importance, most important first
        feature_importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': rf_model.feature_importances_
        }).sort_values('Importance', ascending=False)

        # Save top 50 features
        top_features_path = os.path.join(absolute_path, 'OptimizationResults', 'rf_feature_importance.csv')
        feature_importance_df.head(50).to_csv(top_features_path, index=False)
        logger.info(f"Top 50 features saved to {top_features_path}")

        # Add to the results file
        top_20 = feature_importance_df.head(20)
        with open(best_model_path, 'a') as f:
            f.write("\n========== FEATURE IMPORTANCE ==========\n")
            f.write("Top 20 Most Important Features:\n")
            for feature, importance in zip(top_20['Feature'], top_20['Importance']):
                f.write(f"{feature}: {importance:.6f}\n")
            # Only the top 50 rows are written to the CSV, so say so rather
            # than claiming the complete ranking was saved.
            f.write(f"\nTop 50 feature importances saved to: {top_features_path}\n")
    except Exception as e:
        logger.error(f"Error analyzing feature importance: {str(e)}")
        logger.error(traceback.format_exc())

    # Save model using joblib (best effort: failures are logged only)
    try:
        import joblib
        model_save_path = os.path.join(absolute_path, 'TrainedModels', 'rf_optimized_model.joblib')
        os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
        joblib.dump(final_model, model_save_path)
        logger.info(f"Model saved to {model_save_path}")

        # Add model path to results file
        with open(best_model_path, 'a') as f:
            f.write(f"\nFinal model saved to: {model_save_path}\n")

    except Exception as e:
        logger.error(f"Error saving model: {str(e)}")
        logger.error(traceback.format_exc())

# Visualization function
def visualize_optuna_study(study):
    """Display Optuna's diagnostic plots for a study.

    Shows, in order: optimization history, parameter importances, parallel
    coordinates and slice plots. Does nothing except log a warning when the
    study has no completed trial. Any error raised while plotting is logged
    with its traceback, and the remaining plots are skipped.
    """
    has_completed = any(
        t.state == optuna.trial.TrialState.COMPLETE for t in study.trials
    )
    if not has_completed:
        logger.warning("No completed trials to visualize")
        return

    plot_names = (
        "plot_optimization_history",
        "plot_param_importances",
        "plot_parallel_coordinate",
        "plot_slice",
    )

    try:
        logger.info("Generating visualization plots")
        for plot_name in plot_names:
            # Resolve each plotting function lazily, right before it is used.
            figure = getattr(optuna.visualization, plot_name)(study)
            figure.show()
    except Exception as e:
        logger.error(f"Visualization failed: {str(e)}")
        logger.error(traceback.format_exc())

# Main execution
if __name__ == "__main__":
    # Entry point of the cell: run the search, then refit, evaluate and plot.
    logger.info("========== STARTING RANDOM FOREST HYPERPARAMETER OPTIMIZATION ==========")

    # Optional: Install required packages if needed.
    # `!pip ...` is an IPython/notebook shell escape, so this block only works
    # when executed in a notebook, not as a plain Python script.
    # NOTE(review): optuna is already imported unconditionally at the top of
    # this cell, so the ImportError branch below looks unreachable — confirm.
    try:
        import optuna
    except ImportError:
        logger.info("Installing optuna...")
        !pip install optuna
        import optuna

    # joblib is used by train_final_rf_model to persist the fitted pipeline
    # (that function also imports it locally).
    try:
        import joblib
    except ImportError:
        logger.info("Installing joblib...")
        !pip install joblib
        import joblib

    # Run optimization with 100 trials (adjust as needed)
    try:
        # Returns (None, None) when no trial completed.
        best_params, study = run_rf_optimization(n_trials=100, timeout=None)

        # If optimization was successful, train the final model
        if best_params:
            # Train final model with best parameters
            train_final_rf_model(best_params)

            # Visualize the study results if the study object exists
            if study:
                visualize_optuna_study(study)

    except Exception as e:
        # Top-level boundary: log the failure with its traceback instead of
        # re-raising, so the closing log line below is always emitted.
        logger.error(f"Error in main execution: {str(e)}")
        logger.error(traceback.format_exc())

    logger.info("========== RANDOM FOREST HYPERPARAMETER OPTIMIZATION COMPLETED ==========")

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 383.6/383.6 kB 7.3 MB/s eta 0:00:00
Downloading alembic-1.15.2-py3-none-any.whl (231 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 231.9/231.9 kB 13.6 MB/s eta 0:00:00
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.2.1
Mounted at /content/gdrive


[I 2025-04-13 02:37:46,071] A new study created in memory with name: no-name-1ab21a2a-393f-4caa-a555-fd1020796ee7
[I 2025-04-13 02:37:46,699] Trial 0 finished with value: 0.6201550387596899 and parameters: {'max_features': 10000, 'ngram_range': (1, 3), 'stop_words': 'english', 'n_estimators': 55, 'max_depth': 80, 'min_samples_split': 12, 'min_samples_leaf': 6, 'rf_max_features': 'sqrt', 'bootstrap': False, 'class_weight': 'balanced_subsample'}. Best is trial 0 with value: 0.6201550387596899.
[I 2025-04-13 02:38:42,093] Trial 1 finished with value: 0.5 and parameters: {'max_features': 10000, 'ngram_range': (1, 3), 'stop_words': 'english', 'n_estimators': 262, 'max_depth': 80, 'min_samples_split': 15, 'min_samples_leaf': 6, 'rf_max_features': None, 'bootstrap': False, 'class_weight': 'balanced_subsample'}. Best is trial 0 with value: 0.6201550387596899.
[I 2025-04-13 02:38:47,799] Trial 2 finished with value: 0.5891472868217055 and parameters: {'max_features': None, 'ngram_range': (1, 3)