# LLM Sentiment Analysis

Project by Abhi Vellore

Imports

In [1]:
from pre_process import AmazonDatasetPreprocessor, KaggleDatasetPreprocessor, SentenceDatasetPreprocessor
from config import PROCESSED_DATA_PATH
import os
import pandas as pd
from gpt import ChatGPTSession
import glob
from llama import LLaMaSession

import sys
sys.path.insert(0, '..')

from credentials import openai_api_key, llama_api_key

# Preprocess Data

In [None]:
# Initialize and process the Amazon review data
amazon_processor = AmazonDatasetPreprocessor('Amazon_Fashion_Review_Data.json')
amazon_processor.preprocess()
amazon_processor.to_csv('processed_amazon_data.csv')

# Initialize and process the Sentence data
sentence_processor = SentenceDatasetPreprocessor('Sentences_75Agree.txt')
sentence_processor.preprocess()
sentence_processor.to_csv('processed_sentence_data.csv')

# Initialize and process the first Kaggle dataset
kaggle1_processor = KaggleDatasetPreprocessor('kaggle_train.csv')
kaggle1_processor.preprocess()

# Initialize and process the second Kaggle dataset
kaggle2_processor = KaggleDatasetPreprocessor('kaggle_test.csv')
kaggle2_processor.preprocess()

# Concatenate the preprocessed DataFrames
processed_kaggle1_df = kaggle1_processor.df
processed_kaggle2_df = kaggle2_processor.df
combined_df = pd.concat([processed_kaggle1_df, processed_kaggle2_df], ignore_index=True)

# Sample up to 3000 rows from the combined DataFrame.
# DataFrame.sample raises ValueError when n is larger than the number of rows
# (sampling without replacement), so cap the sample size at the frame length.
sample_size = min(3000, len(combined_df))
sampled_df = combined_df.sample(n=sample_size, random_state=42)  # random_state for reproducibility

# Save the sampled DataFrame to a new CSV file in the processed directory
combined_csv_path = os.path.join(PROCESSED_DATA_PATH, 'processed_kaggle_combined_data.csv')
sampled_df.to_csv(combined_csv_path, index=False)
print(f"Processed combined Kaggle data saved to {combined_csv_path}")

# Models and Experiments

### Set up and Save Functions

Create a series of functions with checks to prevent hitting OpenAI rate limit safeguards and track expenses of running the datasets.


In [32]:
import time

# Define state-saving functions
def save_state(state_file, last_processed_index):
    """Record the index of the most recently processed row.

    The file at ``state_file`` is overwritten so that it contains only the
    string form of ``last_processed_index`` (no trailing newline).

    Args:
        state_file: Path of the checkpoint file to (re)write.
        last_processed_index: Row index to store.
    """
    with open(state_file, 'w') as checkpoint:
        # end='' keeps the file free of a trailing newline.
        print(last_processed_index, end='', file=checkpoint)

def load_state(state_file):
    """Return the row index stored in ``state_file``, or 0 if there is none.

    Args:
        state_file: Path of the checkpoint file to read.

    Returns:
        The stored index as an int. 0 is returned when the file does not
        exist or contains only whitespace.

    Raises:
        ValueError: If the file holds text that is not an integer.
    """
    try:
        with open(state_file, 'r') as file:
            contents = file.read().strip()
    except FileNotFoundError:
        return 0

    # Opening a file with mode 'w' truncates it before anything is written,
    # so an interrupted save can leave an empty file behind. Treat that the
    # same as a missing checkpoint instead of failing on int('').
    if not contents:
        return 0
    return int(contents)


In [33]:
# Maximum number of rows a single processing run sends to a model before it stops.
daily_limit = 10000
# The processing loops sleep for 60 seconds after every this-many processed rows.
rate_limit_per_minute = 500
# File in which the index of the last processed row is stored between runs.
state_file_path = 'last_processed_line.txt'


### GPT Model

Create GPT models and use them to perform sentiment analysis

#### Helper Function to process a dataset with GPT - saves into new dataset

In [34]:
def process_dataset_with_GPTModel(input_csv_path, output_csv_path, chat_session, column_name, state_file,
                                  text_column='Text'):
    """Send each row's text to a chat session and store the responses in a CSV.

    The run is resumable: the index of the last processed row is written to
    ``state_file`` after every row, and a later call picks up from there. At
    most ``daily_limit`` rows are processed per call, and the loop sleeps for
    60 seconds after every ``rate_limit_per_minute`` processed rows (both are
    module-level settings).

    Args:
        input_csv_path: CSV file holding the rows to score.
        output_csv_path: CSV file the scored rows are written to.
        chat_session: Object whose ``send_prompt(text)`` returns the response.
        column_name: Column in which the responses are stored.
        state_file: Path of the checkpoint file used by ``load_state`` /
            ``save_state``.
        text_column: Column holding the text to send. Defaults to ``'Text'``.
            NOTE(review): the Llama helper in this notebook reads ``'text'``;
            confirm which spelling the processed CSVs actually use.

    Returns:
        The DataFrame that was written to ``output_csv_path``.

    Raises:
        KeyError: If ``text_column`` is not a column of the loaded CSV.
    """
    start_index = load_state(state_file)  # Last row handled by an earlier run (0 if none)

    # When resuming, continue from the partially scored output so that scores
    # from earlier runs are kept rather than overwritten with blanks.
    resuming = start_index > 0 and os.path.exists(output_csv_path)
    if resuming:
        source_path = output_csv_path
        # Read earlier responses back as text so they are written out unchanged.
        df = pd.read_csv(source_path, dtype={column_name: object})
    else:
        source_path = input_csv_path
        df = pd.read_csv(source_path)

    # Fail before spending any API calls if the text column is missing.
    if text_column not in df.columns:
        raise KeyError(f"Column {text_column!r} not found in {source_path}")

    # Responses are stored exactly as returned, so keep the column as object
    # dtype regardless of what was loaded.
    if column_name in df.columns:
        df[column_name] = df[column_name].astype(object)
    else:
        df[column_name] = None

    processed_count = 0
    completed = False
    try:
        for index, row in df.iloc[start_index:].iterrows():
            # Check if we've reached the daily limit before processing the next row
            if processed_count >= daily_limit:
                print("Reached the daily limit, stopping...")
                break

            df.at[index, column_name] = chat_session.send_prompt(row[text_column])

            # Save the state after each line is processed
            save_state(state_file, index)
            processed_count += 1

            # Handle rate limiting
            if processed_count % rate_limit_per_minute == 0:
                print("Rate limit reached, sleeping for 60 seconds...")
                time.sleep(60)
        else:
            # The loop ran to the end without hitting the daily limit.
            completed = True
    finally:
        # Always persist what has been scored so far, even if an API call failed.
        df.to_csv(output_csv_path, index=False)

    if completed:
        # Reset the checkpoint so the next dataset or model that shares this
        # state file starts from its first row instead of skipping rows.
        save_state(state_file, 0)
        print(f"Processing completed. Data saved to {output_csv_path}")
    else:
        print(f"Partial results saved to {output_csv_path}; run again to resume.")
    return df

#### Initialize GPT Models

Sets context to minimize tokens being used.

In [35]:
# Instruction given to every GPT session; it asks for a bare 1-5 rating.
sentiment_context = "As a sentiment analysis model, rate the sentiment of the following text from 1 to 5, where 1 is very negative and 5 is very positive. Provide only the number as a response."

# One session per OpenAI model, each with its own per-minute rate limit
session_gpt_3_5 = ChatGPTSession(
    api_key=openai_api_key,
    model='gpt-3.5-turbo',
    rate_limit_per_minute=1000,
)
session_gpt_4 = ChatGPTSession(
    api_key=openai_api_key,
    model='gpt-4',
    rate_limit_per_minute=300,
)

# Both sessions get the same context
for gpt_session in (session_gpt_3_5, session_gpt_4):
    gpt_session.set_context(sentiment_context)

#### Processing Financial Data

Split into two separate commands to ensure accuracy. Will use a for loop in the future.

In [None]:
# Process 'processed_sentence_data.csv' with GPT-3.5
sentence_data_filename = '../data/processed/processed_sentence_data.csv'
# The suffix uses underscores ('_with_gpt_3_5.csv') because that is the file
# name the GPT-4.0 step reads as its input.
output_sentence_path_3_5 = sentence_data_filename.replace('.csv', '_with_gpt_3_5.csv')
# Use the GPT-3.5 session and score column for this pass.
df_sentence = process_dataset_with_GPTModel(sentence_data_filename, output_sentence_path_3_5, session_gpt_3_5, "GPT 3.5 Score", state_file_path)
print(f"Processed {sentence_data_filename} with GPT-3.5 and saved to {output_sentence_path_3_5}")


In [None]:
# Process 'processed_sentence_data_with_gpt_3_5.csv' with GPT-4.0.
# The input path is the '_with_gpt_3_5.csv' file; the result is written next to it
# as 'processed_sentence_data_with_gpt_scores.csv' with a "GPT 4.0 Score" column.
sentence_data_filename = '../data/processed/processed_sentence_data_with_gpt_3_5.csv'
output_sentence_path_4 = sentence_data_filename.replace('_with_gpt_3_5.csv', '_with_gpt_scores.csv')
df_sentence = process_dataset_with_GPTModel(sentence_data_filename, output_sentence_path_4, session_gpt_4, "GPT 4.0 Score", state_file_path)
print(f"Processed {sentence_data_filename} with GPT-4.0 and saved to {output_sentence_path_4}")


#### Processing other datasets

After confirming the setup works for the financial dataset, process the other datasets.

In [None]:
# Process other datasets with GPT-3.5 and GPT-4.0
other_datasets = ['../data/processed/processed_amazon_data.csv',
                  '../data/processed/processed_kaggle_combined_data.csv']

for dataset in other_datasets:
    # Process with GPT-3.5
    output_path_3_5 = dataset.replace('.csv', '_with_gpt_3_5.csv')
    df = process_dataset_with_GPTModel(dataset, output_path_3_5, session_gpt_3_5, "GPT 3.5 Score", state_file_path)
    print(f"Dataset processed with GPT-3.5 and saved to {output_path_3_5}")

    # Check if daily limit reached after GPT-3.5 processing
    current_index = load_state(state_file_path)
    if current_index + 1 >= daily_limit:
        print("Daily limit reached, please run GPT-4.0 processing tomorrow.")
        continue  # Continue to the next dataset

    # Process with GPT-4.0. Read the GPT-3.5 output (not the raw dataset) so
    # that the '_with_gpt_scores.csv' file carries both score columns, as in
    # the sentence-data cells.
    output_path_4 = dataset.replace('.csv', '_with_gpt_scores.csv')
    df = process_dataset_with_GPTModel(output_path_3_5, output_path_4, session_gpt_4, "GPT 4.0 Score", state_file_path)
    print(f"Dataset processed with GPT-4.0 and saved to {output_path_4}")

    # Check if daily limit reached after GPT-4.0 processing
    current_index = load_state(state_file_path)
    if current_index + 1 >= daily_limit:
        print("Daily limit reached, please run again tomorrow.")

### Llama Models
Create Llama models and use them to perform sentiment analysis

#### Helper Function to process a dataset with Llama-7b

In [36]:
def process_dataset_with_llama_model(input_csv_path, output_csv_path, llama_session, column_name, state_file,
                                     text_column='text'):
    """Send each row's text to a Llama session and store the responses in a CSV.

    The run is resumable: the index of the last processed row is written to
    ``state_file`` after every row, and a later call picks up from there. At
    most ``daily_limit`` rows are processed per call, and the loop sleeps for
    60 seconds after every ``rate_limit_per_minute`` processed rows (both are
    module-level settings).

    Args:
        input_csv_path: CSV file holding the rows to score.
        output_csv_path: CSV file the scored rows are written to.
        llama_session: Object whose ``send_prompt(text)`` returns the response.
        column_name: Column in which the responses are stored.
        state_file: Path of the checkpoint file used by ``load_state`` /
            ``save_state``.
        text_column: Column holding the text to send. Defaults to ``'text'``.
            NOTE(review): the GPT helper in this notebook reads ``'Text'``;
            confirm which spelling the processed CSVs actually use.

    Returns:
        The DataFrame that was written to ``output_csv_path``.

    Raises:
        KeyError: If ``text_column`` is not a column of the loaded CSV.
    """
    start_index = load_state(state_file)  # Last row handled by an earlier run (0 if none)

    # When resuming, continue from the partially scored output so that scores
    # from earlier runs are kept rather than overwritten with blanks.
    resuming = start_index > 0 and os.path.exists(output_csv_path)
    if resuming:
        source_path = output_csv_path
        # Read earlier responses back as text so they are written out unchanged.
        df = pd.read_csv(source_path, dtype={column_name: object})
    else:
        source_path = input_csv_path
        df = pd.read_csv(source_path)

    # Fail before spending any API calls if the text column is missing.
    if text_column not in df.columns:
        raise KeyError(f"Column {text_column!r} not found in {source_path}")

    # Responses are stored exactly as returned, so keep the column as object
    # dtype regardless of what was loaded.
    if column_name in df.columns:
        df[column_name] = df[column_name].astype(object)
    else:
        df[column_name] = None

    processed_count = 0
    completed = False
    try:
        for index, row in df.iloc[start_index:].iterrows():
            # Check if we've reached the daily limit before processing the next row
            if processed_count >= daily_limit:
                print("Reached the daily limit, stopping...")
                break

            df.at[index, column_name] = llama_session.send_prompt(row[text_column])

            # Save the state after each line is processed
            save_state(state_file, index)
            processed_count += 1

            # Handle rate limiting
            if processed_count % rate_limit_per_minute == 0:
                print("Rate limit reached, sleeping for 60 seconds...")
                time.sleep(60)
        else:
            # The loop ran to the end without hitting the daily limit.
            completed = True
    finally:
        # Always persist what has been scored so far, even if an API call failed.
        df.to_csv(output_csv_path, index=False)

    if completed:
        # Reset the checkpoint so the next dataset or model that shares this
        # state file starts from its first row instead of skipping rows.
        save_state(state_file, 0)
        print(f"Processing completed. Data saved to {output_csv_path}")
    else:
        print(f"Partial results saved to {output_csv_path}; run again to resume.")
    return df


In [None]:
# Initialize the Llama session: model 'llama-7b-chat', authenticated with
# llama_api_key, created with rate_limit_per_minute=1000.
session_llama_7b = LLaMaSession(api_key=llama_api_key, model='llama-7b-chat', rate_limit_per_minute=1000)


#### Processing Financial Data

Split into two separate commands to ensure accuracy.

In [None]:
# Process 'processed_sentence_data_with_gpt_scores.csv' with Llama-7B
sentence_data_filename = '../data/processed/processed_sentence_data_with_gpt_scores.csv'
output_sentence_path_7b = sentence_data_filename.replace('_with_gpt_scores.csv', '_all_scores.csv')
# Pass the sentence file itself; `dataset` is only a leftover loop variable
# from the GPT cells and would point at a different file (or be undefined).
process_dataset_with_llama_model(sentence_data_filename, output_sentence_path_7b, session_llama_7b, "Llama 7B Score", state_file=state_file_path)
print(f"Processed {sentence_data_filename} with llama and saved to {output_sentence_path_7b}")


#### Processing other datasets

After confirming the setup works for the financial dataset, process the other datasets.

In [None]:
other_datasets = ['../data/processed/processed_amazon_data_with_gpt_scores.csv',
                  '../data/processed/processed_kaggle_combined_data_with_gpt_scores.csv']

# Process each GPT-scored dataset with Llama-7B
for dataset in other_datasets:
    # Swap the whole '_with_gpt_scores.csv' suffix so the result is named
    # '<dataset>_all_scores.csv', matching the sentence-data output name
    # (replacing only '.csv' would give '..._with_gpt_scores_all_scores.csv').
    output_path_7b = dataset.replace('_with_gpt_scores.csv', '_all_scores.csv')
    process_dataset_with_llama_model(dataset, output_path_7b, session_llama_7b, "Llama 7B Score", state_file=state_file_path)
    print(f"Dataset processed with Llama-7B and saved to {output_path_7b}")

    # Stop for the day once the checkpoint shows the daily limit was reached
    current_index = load_state(state_file_path)
    if current_index + 1 >= daily_limit:
        print("Daily limit reached, please run again tomorrow.")
        break


# Evaluation Metrics

In [38]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt


Functions to evaluate performance

In [39]:
def evaluate_model(predictions, actuals, model_name, dataset_name):
    """Print and return accuracy and weighted F1 for one model on one dataset.

    Args:
        predictions: Labels produced by the model.
        actuals: Ground-truth labels, aligned with ``predictions``.
        model_name: Model label used in the printed messages.
        dataset_name: Dataset label used in the printed messages.

    Returns:
        A ``(accuracy, f1)`` tuple, where the F1 score uses weighted averaging.
    """
    acc = accuracy_score(actuals, predictions)
    weighted_f1 = f1_score(actuals, predictions, average='weighted')

    print(f"Accuracy for {model_name} on {dataset_name}: {acc:.2f}")
    print(f"F1 Score for {model_name} on {dataset_name}: {weighted_f1:.2f}")

    # Per-class precision/recall/F1 breakdown
    report = classification_report(actuals, predictions)
    print(report)

    return acc, weighted_f1

def plot_confusion_matrix(predictions, actuals, model_name, dataset_name):
    """Show an annotated confusion-matrix heatmap for one model on one dataset.

    Args:
        predictions: Labels produced by the model.
        actuals: Ground-truth labels, aligned with ``predictions``.
        model_name: Model label used in the plot title.
        dataset_name: Dataset label used in the plot title.
    """
    matrix = confusion_matrix(actuals, predictions)

    plt.figure(figsize=(10, 8))
    # heatmap draws on the current axes and returns them
    axes = sns.heatmap(matrix, annot=True, fmt='g', cmap='Blues')
    axes.set_xlabel('Predicted')
    axes.set_ylabel('Actual')
    axes.set_title(f'Confusion Matrix for {model_name} - {dataset_name}')
    plt.show()


Iterate over all datasets and plot everything

In [None]:
# NOTE(review): the processing cells in this notebook write their outputs under
# '../data/processed/' and name the score columns "GPT 3.5 Score",
# "GPT 4.0 Score" and "Llama 7B Score". Confirm that the file names, the model
# column names and the 'Actual' column below match the files being evaluated.
datasets = ['processed_sentence_data_all_scores.csv', 'processed_kaggle_combined_all_scores.csv', 'processed_amazon_data_all_scores.csv']
models = ['GPT_3_5', 'GPT_4', 'LLaMA_7B', 'LLaMA_13B']  # Replace with your actual model names

# Collect one record per (dataset, model) pair and build the DataFrame once at
# the end; DataFrame.append was removed in pandas 2.0.
metric_rows = []

for dataset_file in datasets:
    df = pd.read_csv(dataset_file)
    actuals = df['Actual']
    for model in models:
        predictions = df[model]
        accuracy, f1score = evaluate_model(predictions, actuals, model, dataset_file)
        plot_confusion_matrix(predictions, actuals, model, dataset_file)
        metric_rows.append({
            'Dataset': dataset_file,
            'Model': model,
            'Accuracy': accuracy,
            'F1_Score': f1score,
        })

# Explicit columns keep the frame usable for the later groupby even if no rows were collected.
metrics_df = pd.DataFrame(metric_rows, columns=['Dataset', 'Model', 'Accuracy', 'F1_Score'])


Combine the metrics from all datasets into aggregated per-model scores

In [None]:
# Aggregate results: mean accuracy and mean F1 score per model
aggregated_metrics = metrics_df.groupby('Model')[['Accuracy', 'F1_Score']].mean().reset_index()

# One bar chart per aggregated metric (accuracy first, then F1 score)
for metric_column, chart_title in (
    ('Accuracy', 'Aggregated Accuracy for Each Model'),
    ('F1_Score', 'Aggregated F1 Score for Each Model'),
):
    plt.figure(figsize=(12, 6))
    sns.barplot(x='Model', y=metric_column, data=aggregated_metrics)
    plt.title(chart_title)
    plt.show()


In [None]:
# Combine all datasets into a single DataFrame for correlation analysis.
# pd.concat replaces the per-file DataFrame.append calls, which were removed
# in pandas 2.0 (and copied the whole frame on every iteration).
combined_df = pd.concat(
    [pd.read_csv(dataset_file) for dataset_file in datasets],
    ignore_index=True,
)

# Calculate correlations
correlation_matrix = combined_df[models].corr()  # Use model names as column names

# Plot the correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Model Predictions')
plt.show()
