In [None]:
# NOTE: Please read README.md first!

%matplotlib inline
import os
import time
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import importlib.util
import sys
from IPython.display import display, Markdown

# Function to import a module from file path
def import_module_from_file(module_name, file_path):
    """Load a Python source file as a module and register it in ``sys.modules``.

    Args:
        module_name: Name under which the module is registered in
            ``sys.modules`` (and which becomes its ``__name__``).
        file_path: Path to the source file to execute.

    Returns:
        The fully executed module object.

    Raises:
        ImportError: If no import spec/loader can be built for ``file_path``
            (for example, the file suffix is not one Python can import).
        Exception: Whatever the module's own top-level code raises; in that
            case the partially initialised module is removed from
            ``sys.modules`` before the exception propagates.
    """
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # spec_from_file_location returns None when it cannot pick a loader;
    # fail with a clear message instead of an AttributeError further down.
    if spec is None or spec.loader is None:
        raise ImportError(
            f"Cannot load module {module_name!r} from {file_path!r}",
            name=module_name,
            path=str(file_path),
        )

    module = importlib.util.module_from_spec(spec)
    # Register before executing so the module can be found by name while its
    # own top-level code runs.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind for later lookups.
        sys.modules.pop(module_name, None)
        raise
    return module

# Check if all required files exist
required_files = [
    'data_preprocessing.py',
    'model.py',
    'training.py',
    'main_bert.py',
    'main_roberta.py',
    'main_distilbert.py'
]

missing_files = [file for file in required_files if not os.path.exists(file)]
if missing_files:
    print(f"The following required files are missing: {missing_files}")
    print("Please make sure all model files are in the current directory")
    exit(1)

# Check if Tweets.csv exists
if not os.path.exists('Tweets.csv'):
    print("Tweets.csv not found. Please upload the dataset first.")
    print("You can upload it using: from google.colab import files; uploaded = files.upload()")
    exit(1)

# Install requirements if needed
try:
    import torch
    import transformers
except ImportError:
    print("Installing required packages...")
    !pip install torch transformers pandas scikit-learn matplotlib seaborn

# Function to run a model and capture its output
def run_model(module_name, script_name, model_name):
    """Import a model script, run its ``main()``, and capture what it prints.

    The script is imported via ``import_module_from_file`` (with stdout left
    untouched), then ``module.main()`` is executed with ``sys.stdout``
    redirected into an in-memory buffer. The captured text is echoed once the
    run finishes so it still shows up in the notebook.

    Args:
        module_name: Name to register the imported script under.
        script_name: Path of the script file to import.
        model_name: Human-readable model name used in progress messages.

    Returns:
        A ``(output, total_time)`` tuple. ``output`` is the text printed by
        ``main()`` on success, or ``"Error executing <model_name>: <error>"``
        if importing or running the script raised. ``total_time`` is the
        wall-clock duration in seconds, measured around import + run + echo.
    """
    import traceback
    from contextlib import redirect_stdout
    from io import StringIO

    print(f"\n{'='*80}")
    print(f"Running {model_name} model...")
    print(f"{'='*80}\n")

    start_time = time.time()
    string_buffer = StringIO()

    try:
        # Import the module (its import-time prints go straight to stdout).
        module = import_module_from_file(module_name, script_name)

        # redirect_stdout restores sys.stdout even if main() raises; a manual
        # swap without try/finally would leave stdout pointing at the buffer
        # and silently swallow the error message and all later output.
        with redirect_stdout(string_buffer):
            module.main()
    except Exception as e:
        # Show whatever the script managed to print before failing.
        partial_output = string_buffer.getvalue()
        if partial_output:
            print(partial_output)
        output = f"Error executing {model_name}: {e}"
        print(f"Error: {e}")
        traceback.print_exc()
    else:
        output = string_buffer.getvalue()
        # Print captured output
        print(output)

    end_time = time.time()
    total_time = end_time - start_time

    print(f"\n{model_name} completed in {total_time:.2f} seconds\n")

    return output, total_time

# Per-model results, keyed by display name; each value holds the elapsed
# time and the captured output returned by run_model.
results = {}

# --- BERT ---
print("Starting BERT model fine-tuning...")
bert_output, bert_time = run_model(
    module_name='main_bert',
    script_name='main_bert.py',
    model_name='BERT',
)
results['BERT'] = dict(time=bert_time, output=bert_output)

# --- RoBERTa ---
print("Starting RoBERTa model fine-tuning...")
roberta_output, roberta_time = run_model(
    module_name='main_roberta',
    script_name='main_roberta.py',
    model_name='RoBERTa',
)
results['RoBERTa'] = dict(time=roberta_time, output=roberta_output)

# --- DistilBERT ---
print("Starting DistilBERT model fine-tuning...")
distilbert_output, distilbert_time = run_model(
    module_name='main_distilbert',
    script_name='main_distilbert.py',
    model_name='DistilBERT',
)
results['DistilBERT'] = dict(time=distilbert_time, output=distilbert_output)

# Function to extract accuracy from output
def extract_accuracy(output):
    """Extract the test accuracy from a model run's captured output.

    Scans ``output`` line by line for the first ``"Test Accuracy:"`` marker
    that is followed by a number. Text after the number (for example
    ``" | Test Loss: 0.30"``) is ignored. A value written as a percentage
    (``"85.3%"``) is converted to a fraction (``0.853``) so every result is
    on the same 0-1 scale.

    Args:
        output: Text printed by a model run.

    Returns:
        The accuracy as a float, or ``0.0`` (after printing a warning) when
        no parsable accuracy line is found.
    """
    import re

    # Number (int, decimal or exponent form) right after the marker, with an
    # optional trailing percent sign.
    accuracy_pattern = re.compile(
        r"Test Accuracy:\s*([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)\s*(%)?"
    )

    for line in output.split('\n'):
        match = accuracy_pattern.search(line)
        if match is None:
            # Marker absent or not followed by a number: keep looking.
            continue
        value = float(match.group(1))
        # Percentages are normalised to fractions.
        return value / 100.0 if match.group(2) else value

    print("Warning: Could not extract accuracy from output")
    return 0.0  # Return a default value instead of None

# Gather one accuracy and one elapsed time per model from the run results.
accuracies = {}
times = {}

for model, data in results.items():
    acc = extract_accuracy(data['output'])
    # Fall back to 0.0 if no accuracy could be extracted.
    accuracies[model] = 0.0 if acc is None else acc
    times[model] = data['time']

# Both dicts were filled in the same loop, so their orders line up.
models = list(accuracies)
acc_values = list(accuracies.values())
time_values = list(times.values())

# Side-by-side bar charts: accuracy on the left, training time on the right.
fig, (ax_accuracy, ax_time) = plt.subplots(1, 2, figsize=(12, 5))

ax_accuracy.bar(models, acc_values, color=['blue', 'green', 'red'])
ax_accuracy.set_title('Model Accuracy Comparison')
ax_accuracy.set_ylabel('Test Accuracy')
ax_accuracy.set_ylim(0, 1)

ax_time.bar(models, time_values, color=['blue', 'green', 'red'])
ax_time.set_title('Training Time Comparison')
ax_time.set_ylabel('Time (seconds)')

plt.tight_layout()
plt.savefig('model_comparison.png')
plt.show()

# Same numbers as a table.
comparison_df = pd.DataFrame({
    'Model': models,
    'Accuracy': acc_values,
    'Training Time (s)': time_values
})
display(comparison_df)

print("\nModel comparison complete!")