Imports

In [1]:
from pre_process import AmazonDatasetPreprocessor, KaggleDatasetPreprocessor, SentenceDatasetPreprocessor
from config import PROCESSED_DATA_PATH
import os
import pandas as pd
from gpt import ChatGPTSession
import glob
from llama import LLaMaSession

import sys
sys.path.insert(0, '..')

from credentials import openai_api_key, llama_api_key



Preprocess Data

In [2]:
# Initialize and process the Amazon review data
amazon_processor = AmazonDatasetPreprocessor('Amazon_Fashion_Review_Data.json')
amazon_processor.preprocess()
amazon_processor.to_csv('processed_amazon_data.csv')

# Initialize and process the Sentence data
sentence_processor = SentenceDatasetPreprocessor('Sentences_75Agree.txt')
sentence_processor.preprocess()
sentence_processor.to_csv('processed_sentence_data.csv')

# Initialize and process the first Kaggle dataset
kaggle1_processor = KaggleDatasetPreprocessor('kaggle_train.csv')
kaggle1_processor.preprocess()

# Initialize and process the second Kaggle dataset
kaggle2_processor = KaggleDatasetPreprocessor('kaggle_test.csv')
kaggle2_processor.preprocess()

# Concatenate the preprocessed DataFrames
processed_kaggle1_df = kaggle1_processor.df
processed_kaggle2_df = kaggle2_processor.df
combined_df = pd.concat([processed_kaggle1_df, processed_kaggle2_df], ignore_index=True)

# Sample up to 3000 rows from the combined DataFrame. The size is capped at the
# number of available rows because DataFrame.sample (without replacement) raises
# ValueError when asked for more rows than the frame holds.
kaggle_sample_size = min(3000, len(combined_df))
sampled_df = combined_df.sample(n=kaggle_sample_size, random_state=42)  # random_state for reproducibility

# Save the sampled rows (not the full combined frame) to a CSV file in the processed directory
combined_csv_path = os.path.join(PROCESSED_DATA_PATH, 'processed_kaggle_combined_data.csv')
sampled_df.to_csv(combined_csv_path, index=False)
print(f"Processed combined Kaggle data saved to {combined_csv_path}")

Processed data saved to ../data/processed/processed_amazon_data.csv
Processed data saved to ../data/processed/processed_sentence_data.csv
Processed combined Kaggle data saved to ../data/processed/processed_kaggle_combined_data.csv


# State-Saving Functions

In [None]:
import time

# Define state-saving functions
def save_state(state_file, last_processed_index):
    """Overwrite ``state_file`` with the text form of ``last_processed_index``.

    Args:
        state_file: Path of the checkpoint file; it is created if missing and
            truncated if it already exists.
        last_processed_index: Value to record; it is written as ``str(value)``.
    """
    with open(state_file, mode='w') as checkpoint:
        serialized_index = str(last_processed_index)
        checkpoint.write(serialized_index)

def load_state(state_file):
    """Read the integer checkpoint stored in ``state_file``.

    Args:
        state_file: Path of the checkpoint file written by ``save_state``.

    Returns:
        The stored integer, or ``0`` when the file does not exist or contains
        only whitespace.

    Raises:
        ValueError: If the file holds non-empty text that is not an integer.
    """
    try:
        with open(state_file, 'r') as file:
            content = file.read().strip()
    except FileNotFoundError:
        return 0

    # Opening a file in 'w' mode truncates it before anything is written, so an
    # interrupted save can leave an empty file behind. Treat that like a missing
    # checkpoint instead of crashing on int('').
    if not content:
        return 0
    return int(content)


In [None]:
# Upper bound on the number of rows a single process_dataset_* call sends to the API.
daily_limit = 10_000

# After this many rows in one call, the process_dataset_* functions sleep for 60 seconds.
rate_limit_per_minute = 500

# Checkpoint file handed to save_state / load_state by the driver cells.
state_file_path = 'last_processed_line.txt'


# GPT Models

In [None]:
def process_dataset_with_GPTModel(input_csv_path, output_csv_path, chat_session, column_name, state_file):
    """Send each ``text`` row of a CSV to ``chat_session`` and save the replies.

    The reply for every processed row is stored in ``column_name`` and the whole
    frame is written to ``output_csv_path``. Progress is checkpointed through
    ``save_state`` / ``load_state`` so an interrupted or daily-limited run can be
    continued by calling the function again with the same arguments.

    Uses the notebook-level ``daily_limit`` (maximum rows handled per call) and
    ``rate_limit_per_minute`` (rows between 60-second pauses).

    Args:
        input_csv_path: CSV file to read; must contain a ``text`` column.
        output_csv_path: CSV file to write. When resuming, replies already
            stored there are carried over.
        chat_session: Object exposing ``send_prompt(text)``.
        column_name: Name of the column that receives the replies.
        state_file: Checkpoint file. While a run is in flight it holds the index
            of the next row to process; it is reset to 0 once every row is done.

    Raises:
        ValueError: If the input CSV has no ``text`` column.
    """
    df = pd.read_csv(input_csv_path)
    if 'text' not in df.columns:
        # Fail before any API call is made rather than on the first row.
        raise ValueError("The dataset must have a 'text' column.")

    # A negative checkpoint cannot be a valid resume point; start from the top.
    start_index = max(load_state(state_file), 0)

    # When resuming, bring back the replies saved by the earlier run. The input
    # file is re-read on every call, so without this the rows skipped below
    # would be blank in the new output.
    if start_index > 0 and os.path.exists(output_csv_path):
        previous = pd.read_csv(output_csv_path)
        if column_name in previous.columns and len(previous) == len(df):
            df[column_name] = previous[column_name]

    # Keep replies in an object column so any reply type can be stored without
    # pandas having to change the column dtype mid-run.
    if column_name in df.columns:
        df[column_name] = df[column_name].astype(object)
    else:
        df[column_name] = pd.Series([None] * len(df), index=df.index, dtype=object)

    processed_count = 0
    stopped_at_daily_limit = False
    try:
        # read_csv yields a default RangeIndex, so labels equal positions here.
        for index, text in df['text'].iloc[start_index:].items():
            # Check if we've reached the daily limit before processing the next row
            if processed_count >= daily_limit:
                print("Reached the daily limit, stopping...")
                stopped_at_daily_limit = True
                break

            df.at[index, column_name] = chat_session.send_prompt(text)

            # Record the NEXT row to process: load_state's value is used as the
            # first index to handle, so storing `index` would re-send this row.
            save_state(state_file, index + 1)
            processed_count += 1

            # Handle rate limiting
            if processed_count % rate_limit_per_minute == 0:
                print("Rate limit reached, sleeping for 60 seconds...")
                time.sleep(60)
    finally:
        # Persist whatever was collected even if send_prompt raised, so replies
        # obtained so far are not lost.
        df.to_csv(output_csv_path, index=False)

    if not stopped_at_daily_limit:
        # Every row is done: clear the checkpoint so the next dataset/model that
        # shares this state file starts at row 0 instead of skipping rows.
        save_state(state_file, 0)

    print(f"Processing completed. Data saved to {output_csv_path}")

In [None]:
sentiment_context = "As a sentiment analysis model, rate the sentiment of the following text from 1 to 5, where 1 is very negative and 5 is very positive. Provide only the number as a response."

# Initialize sessions for GPT-3.5-Turbo and GPT-4.0
session_gpt_3_5 = ChatGPTSession(api_key=openai_api_key, model='gpt-3.5-turbo', rate_limit_per_minute=1000)
# NOTE(review): confirm that ChatGPTSession accepts 'gpt-4.0' as a model name.
session_gpt_4 = ChatGPTSession(api_key=openai_api_key, model='gpt-4.0', rate_limit_per_minute=300)

session_gpt_3_5.set_context(sentiment_context)
session_gpt_4.set_context(sentiment_context)

# Collect the datasets to score. Result files are written next to the inputs
# with a '_with_*' suffix, so they must be filtered out here; otherwise a re-run
# (needed after hitting the daily limit) would treat earlier results as new
# inputs. Sorting keeps the processing order identical between runs, which
# glob alone does not promise.
datasets_path = '../data/processed/*.csv'
generated_suffixes = ('_with_gpt_3_5', '_with_gpt_4', '_with_7b', '_with_all')
datasets = sorted(
    path for path in glob.glob(datasets_path)
    if not os.path.splitext(os.path.basename(path))[0].endswith(generated_suffixes)
)

# (session, output-file suffix, result column) for each model, in run order
gpt_runs = [
    (session_gpt_3_5, '_with_gpt_3_5', "GPT 3.5 Score"),
    (session_gpt_4, '_with_gpt_4', "GPT 4.0 Score"),
]

daily_limit_reached = False
for dataset in datasets:
    # splitext only touches the final extension, unlike str.replace('.csv', ...)
    dataset_root, dataset_ext = os.path.splitext(dataset)

    for gpt_session, output_suffix, score_column in gpt_runs:
        output_path = f"{dataset_root}{output_suffix}{dataset_ext}"
        process_dataset_with_GPTModel(dataset, output_path, gpt_session, score_column, state_file=state_file_path)

        # If the script has hit the daily limit, it will stop and needs to be run again the next day.
        # The check is based on the row index stored in the state file.
        current_index = load_state(state_file_path)
        if current_index + 1 >= daily_limit:
            print("Daily limit reached, please run again tomorrow.")
            daily_limit_reached = True
            break

    if daily_limit_reached:
        break  # Stop processing files if the daily limit has been reached
   


# LLaMA Models

In [None]:
def process_dataset_with_llama_model(input_csv_path, output_csv_path, llama_session, column_name, state_file):
    """Send each ``text`` row of a CSV to ``llama_session`` and save the replies.

    The reply for every processed row is stored in ``column_name`` and the whole
    frame is written to ``output_csv_path``. Progress is checkpointed through
    ``save_state`` / ``load_state`` so an interrupted or daily-limited run can be
    continued by calling the function again with the same arguments.

    Uses the notebook-level ``daily_limit`` (maximum rows handled per call) and
    ``rate_limit_per_minute`` (rows between 60-second pauses).

    Args:
        input_csv_path: CSV file to read; must contain a ``text`` column.
        output_csv_path: CSV file to write. When resuming, replies already
            stored there are carried over.
        llama_session: Object exposing ``send_prompt(text)``.
        column_name: Name of the column that receives the replies.
        state_file: Checkpoint file. While a run is in flight it holds the index
            of the next row to process; it is reset to 0 once every row is done.

    Raises:
        ValueError: If the input CSV has no ``text`` column.
    """
    df = pd.read_csv(input_csv_path)
    if 'text' not in df.columns:
        # Fail before any API call is made rather than on the first row.
        raise ValueError("The dataset must have a 'text' column.")

    # A negative checkpoint cannot be a valid resume point; start from the top.
    start_index = max(load_state(state_file), 0)

    # When resuming, bring back the replies saved by the earlier run. The input
    # file is re-read on every call, so without this the rows skipped below
    # would be blank in the new output.
    if start_index > 0 and os.path.exists(output_csv_path):
        previous = pd.read_csv(output_csv_path)
        if column_name in previous.columns and len(previous) == len(df):
            df[column_name] = previous[column_name]

    # Keep replies in an object column so any reply type can be stored without
    # pandas having to change the column dtype mid-run.
    if column_name in df.columns:
        df[column_name] = df[column_name].astype(object)
    else:
        df[column_name] = pd.Series([None] * len(df), index=df.index, dtype=object)

    processed_count = 0
    stopped_at_daily_limit = False
    try:
        # read_csv yields a default RangeIndex, so labels equal positions here.
        for index, text in df['text'].iloc[start_index:].items():
            # Check if we've reached the daily limit before processing the next row
            if processed_count >= daily_limit:
                print("Reached the daily limit, stopping...")
                stopped_at_daily_limit = True
                break

            df.at[index, column_name] = llama_session.send_prompt(text)

            # Record the NEXT row to process: load_state's value is used as the
            # first index to handle, so storing `index` would re-send this row.
            save_state(state_file, index + 1)
            processed_count += 1

            # Handle rate limiting
            if processed_count % rate_limit_per_minute == 0:
                print("Rate limit reached, sleeping for 60 seconds...")
                time.sleep(60)
    finally:
        # Persist whatever was collected even if send_prompt raised, so replies
        # obtained so far are not lost.
        df.to_csv(output_csv_path, index=False)

    if not stopped_at_daily_limit:
        # Every row is done: clear the checkpoint so the next dataset/model that
        # shares this state file starts at row 0 instead of skipping rows.
        save_state(state_file, 0)

    print(f"Processing completed. Data saved to {output_csv_path}")


In [None]:
sentiment_context = "As a sentiment analysis model, rate the sentiment of the following text from 1 to 5, where 1 is very negative and 5 is very positive. Provide only the number as a response."
# NOTE(review): sentiment_context is defined here but never handed to the LLaMA
# sessions below — confirm how LLaMaSession receives its instructions.

# Initialize sessions for LLaMA 7B chat and LLaMA 13B chat
session_llama_7b = LLaMaSession(api_key=llama_api_key, model='llama-7b-chat', rate_limit_per_minute=1000)
session_llama_13b = LLaMaSession(api_key=llama_api_key, model='llama-13b-chat', rate_limit_per_minute=300)

# Collect the datasets to score. Result files are written next to the inputs
# with a '_with_*' suffix, so they must be filtered out here; otherwise result
# files from earlier runs would be treated as new inputs. Sorting keeps the
# processing order identical between runs, which glob alone does not promise.
datasets_path = '../data/processed/*.csv'
generated_suffixes = ('_with_gpt_3_5', '_with_gpt_4', '_with_7b', '_with_all')
datasets = sorted(
    path for path in glob.glob(datasets_path)
    if not os.path.splitext(os.path.basename(path))[0].endswith(generated_suffixes)
)

# (session, output-file suffix, result column) for each model, in run order.
# NOTE(review): the 13B results go to '*_with_all.csv', but that file is built
# from the input dataset and so holds only the 13B column — confirm the name.
llama_runs = [
    (session_llama_7b, '_with_7b', "Llama 7B Score"),
    (session_llama_13b, '_with_all', "Llama 13B Score"),
]

# Process each dataset with both models
for dataset in datasets:
    # splitext only touches the final extension, unlike str.replace('.csv', ...)
    dataset_root, dataset_ext = os.path.splitext(dataset)

    for llama_model_session, output_suffix, score_column in llama_runs:
        output_path = f"{dataset_root}{output_suffix}{dataset_ext}"
        process_dataset_with_llama_model(dataset, output_path, llama_model_session, score_column, state_file=state_file_path)

# Evaluation Metrics

# OLD STUFF

In [6]:
# Example usage
api_key = llama_api_key
model = "llama-7b-chat"
# Kept under its own name on purpose: assigning to `rate_limit_per_minute` here
# would overwrite the notebook-wide setting that the process_dataset_* functions
# read, silently changing how often they pause.
example_rate_limit_per_minute = 120

# Initialize the LLaMa API session
llama_session = LLaMaSession(api_key, model, example_rate_limit_per_minute)

# Load the experiment data scored by the statements further down
df = pd.read_csv('test_experiment.csv')

# Wrapper used to score one text with the notebook-level `llama_session`
def apply_sentiment_analysis(text):
    """Return ``llama_session.analyze_sentiment(text)``, or ``None`` on failure.

    Any exception raised by the session is reported with ``print`` and replaced
    by ``None`` so that one failing text does not abort a whole batch.
    """
    try:
        score = llama_session.analyze_sentiment(text)
    except Exception as error:
        print(f"Error while processing text '{text}': {error}")
        score = None
    return score

# Score every value of the 'Text' column with apply_sentiment_analysis and store
# the results in a new 'llama score' column (rows whose call failed get None).
# NOTE(review): this cell reads 'Text' (capitalised) while the process_dataset_*
# helpers read 'text' — confirm the column name in test_experiment.csv.
df['llama score'] = df['Text'].apply(apply_sentiment_analysis)

# Write the frame, including the new "llama score" column, to test_result.csv
# without the index column
df.to_csv('test_result.csv', index=False)

In [7]:
# Helper that runs a chat session over a whole dataset in one pass
def process_dataset_with_GPTmodel(input_csv_path, output_csv_path, chat_session, column_name):
    """Score the ``text`` column of a CSV with ``chat_session`` and save the result.

    Args:
        input_csv_path: CSV file to read; must contain a ``text`` column.
        output_csv_path: CSV file that receives the frame plus the new column.
        chat_session: Object exposing ``send_prompt(text)``.
        column_name: Name of the column that stores the replies.

    Raises:
        ValueError: If the input CSV has no ``text`` column.
    """
    frame = pd.read_csv(input_csv_path)

    has_text_column = "text" in frame.columns
    if not has_text_column:
        raise ValueError("The dataset must have a 'text' column.")

    # One send_prompt call per row; the replies become the new column
    replies = frame['text'].apply(lambda prompt: chat_session.send_prompt(prompt))
    frame[column_name] = replies

    # Write the augmented frame without the index column
    frame.to_csv(output_csv_path, index=False)

