In [None]:
!pip install datasets


In [None]:
from datasets import load_dataset
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import multiprocessing as mp
import numpy as np

In [None]:
from google.colab import files

# Load the Yelp review dataset by name through `datasets.load_dataset`.
ds = load_dataset(path='yelp_review_full')

In [None]:
# Display the DatasetDict summary (available splits, their features and row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [None]:
# Transform the dataset splits to pandas DataFrames for manipulation.
# `Dataset.to_pandas()` is the documented conversion and works on the whole
# table at once; `pd.DataFrame(dataset)` instead has to walk the dataset and
# build the frame from one Python dict per row.
df_train = ds['train'].to_pandas()
df_test = ds['test'].to_pandas()

In [None]:
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split


In [None]:
# Split the train set into train and validation sets.
# The split always starts from the untouched `ds['train']` instead of from
# `df_train`: because this cell rebinds `df_train`, splitting `df_train` itself
# made every re-run carve another 10% off the already-split frame and silently
# shrink the training set. With a fixed `random_state` the result is now the
# same no matter how many times the cell is executed.
df_train, df_val = train_test_split(
    ds['train'].to_pandas(),
    test_size=0.1,
    random_state=42,
)

# Convert back to Dataset.
# `preserve_index=False` stops the shuffled pandas index from being stored as
# an extra `__index_level_0__` column, so all three splits share the same
# ['label', 'text'] features.
train_dataset = Dataset.from_pandas(df_train, preserve_index=False)
val_dataset = Dataset.from_pandas(df_val, preserve_index=False)

# Create a new DatasetDict with train, validation, and test sets
new_dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': Dataset.from_pandas(df_test, preserve_index=False)
})

# Print the new dataset dict to confirm the split
print(new_dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 526500
    })
    validation: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 58500
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Tokenizer and classifier are both loaded from the same pretrained checkpoint.
MODEL_CHECKPOINT = "juliensimon/reviews-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)

# Put the model in evaluation (inference) mode before predicting.
model.eval()

# Function to tokenize and pad text sequences
def tokenize_and_pad(texts, tokenizer, max_length=512):
    """Encode `texts` with `tokenizer`, truncating and padding to `max_length`.

    Args:
        texts: The text (or batch of texts) handed to the tokenizer unchanged.
        tokenizer: Callable tokenizer; it is invoked with the truncation,
            padding, max_length and return_tensors options set below.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        Whatever the tokenizer returns for these options (PyTorch tensors are
        requested through ``return_tensors='pt'``).
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **tokenizer_options)

# Function to process in batches
def process_in_batches(texts, tokenizer, model, batch_size=32, max_length=512):
    """Classify `texts` in mini-batches and return the predicted class ids.

    The model is moved to the GPU when CUDA is available, otherwise to the CPU,
    and stays on that device after the call. Gradients are disabled during the
    forward passes.

    Args:
        texts: Sliceable sequence of input strings (e.g. a list).
        tokenizer: Tokenizer forwarded to `tokenize_and_pad`.
        model: Sequence-classification model whose output exposes `.logits`.
        batch_size: Number of texts per forward pass; must be at least 1.
        max_length: Truncation/padding length forwarded to `tokenize_and_pad`.

    Returns:
        list: One predicted class id per input text, in input order, as plain
        Python ints (assumes logits are shaped (batch, num_classes)). Returning
        built-in ints rather than numpy scalars keeps the list picklable
        without tying the pickle to numpy.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Decide on the device once and use it for both the model and the inputs.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    all_preds = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        tokenized_batch = tokenize_and_pad(batch_texts, tokenizer, max_length)
        input_ids = tokenized_batch['input_ids'].to(device)
        attention_mask = tokenized_batch['attention_mask'].to(device)

        # Inference only: no autograd bookkeeping needed.
        with torch.no_grad():
            logits = model(input_ids, attention_mask=attention_mask).logits

        # argmax over the class dimension; .tolist() copies to host memory and
        # converts to built-in ints in one step.
        all_preds.extend(torch.argmax(logits, dim=-1).tolist())

    return all_preds

# Extract the text data and the true labels from the test set
test_texts = df_test['text'].tolist()
true_labels = df_test['label'].tolist()  # True labels

# Sanity check: predicted class ids are only comparable with the dataset labels
# when the model's output space has the same number of classes.
# NOTE(review): the recorded run only ever predicted classes 0 and 1 (precision
# and recall are 0.00 for classes 2-4), which suggests this checkpoint has fewer
# output classes than the dataset has labels - confirm, and remap the labels or
# pick a matching checkpoint before trusting the accuracy below.
num_model_labels = model.config.num_labels
num_data_labels = df_test['label'].nunique()
if num_model_labels != num_data_labels:
    print(
        f"Warning: the model has {num_model_labels} output classes but the test set has "
        f"{num_data_labels} distinct labels; predicted ids may not correspond to dataset labels."
    )

# Process the texts in batches
preds_list = process_in_batches(test_texts, tokenizer, model, batch_size=32)

# Evaluate the performance of the model
accuracy = accuracy_score(true_labels, preds_list)
# zero_division=0 reports 0.0 for classes that were never predicted (the value
# the default already falls back to) without raising UndefinedMetricWarning.
report = classification_report(true_labels, preds_list, zero_division=0)

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")



Accuracy: 0.20488
Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.98      0.50     10000
           1       0.02      0.04      0.03     10000
           2       0.00      0.00      0.00     10000
           3       0.00      0.00      0.00     10000
           4       0.00      0.00      0.00     10000

    accuracy                           0.20     50000
   macro avg       0.07      0.20      0.10     50000
weighted avg       0.07      0.20      0.10     50000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from google.colab import drive
# Mount Google Drive so results can be written to and read from it.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import pickle

# Save predictions to a file
def save_predictions(preds, file_path):
    """Pickle `preds` to `file_path`, replacing any existing file.

    Args:
        preds: Object holding the predictions; it is pickled as-is.
        file_path: Destination path, opened in binary write mode.
    """
    with open(file_path, mode='wb') as out_file:
        pickle.Pickler(out_file).dump(preds)

# Save evaluation metrics to a file
def save_evaluation(accuracy, report, file_path):
    """Write the evaluation metrics to `file_path` as a Python dict literal.

    The file contains the text form of
    ``{'accuracy': <float>, 'classification_report': <report>}``.

    Args:
        accuracy: Accuracy score; anything `float()` accepts (Python float,
            numpy scalar, ...).
        report: Classification report, stored unchanged.
        file_path: Destination path, opened in text write mode (overwrites).
    """
    evaluation = {
        # Coerce to a built-in float: a numpy scalar can render as
        # `np.float64(...)` inside a dict, which is not a plain literal and
        # cannot be parsed back without numpy being in scope.
        'accuracy': float(accuracy),
        'classification_report': report
    }
    with open(file_path, 'w') as f:
        f.write(str(evaluation))

# Load predictions from a file
def load_predictions(file_path):
    """Return the object unpickled from `file_path`.

    Args:
        file_path: Path of a pickle file, opened in binary read mode.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code - only load files that
    # this notebook (or another trusted source) wrote.
    with open(file_path, mode='rb') as in_file:
        return pickle.Unpickler(in_file).load()

# Load evaluation metrics from a file
def load_evaluation(file_path):
    """Read back an evaluation dict written as a Python literal.

    The file content is parsed with `ast.literal_eval`, which only accepts
    Python literals (dicts, lists, strings, numbers, ...). The previous
    `eval()` would have executed any expression found in the file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The parsed Python object (a dict with 'accuracy' and
        'classification_report' keys when the file came from
        `save_evaluation`).

    Raises:
        ValueError, SyntaxError: If the file does not contain a plain Python
            literal.
    """
    import ast  # local import: only this function needs it

    with open(file_path, 'r') as f:
        return ast.literal_eval(f.read())

# Evaluate the performance of the model
true_labels = df_test['label'].tolist()
accuracy = accuracy_score(true_labels, preds_list)
# zero_division=0 reports 0.0 for classes that were never predicted (the value
# the default already falls back to) without raising UndefinedMetricWarning.
report = classification_report(true_labels, preds_list, zero_division=0)

# Save the predictions and evaluation metrics.
# The Drive folder is spelled out once so both files are guaranteed to land in
# the same directory.
RESULTS_DIR = '/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results'
save_predictions(preds_list, f'{RESULTS_DIR}/Pre-trained_predictions.pkl')
save_evaluation(accuracy, report, f'{RESULTS_DIR}/Pre-trained_evaluation.txt')


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Reload previously saved predictions and evaluation metrics, if they exist.
predictions_path = '/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/Pre-trained_predictions.pkl'
evaluation_path = '/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/Pre-trained_evaluation.txt'

try:
    preds_list = load_predictions(predictions_path)
    eval_data = load_evaluation(evaluation_path)
except FileNotFoundError:
    # Nothing saved yet: the inference cell has to be run first.
    print("Files not found. Please run the model to generate predictions and evaluation metrics.")
else:
    # Both files were read; expose the stored metrics and show them.
    accuracy = eval_data['accuracy']
    report = eval_data['classification_report']

    print(f"Loaded Accuracy: {accuracy}")
    print(f"Loaded Classification Report:\n{report}")


Loaded Accuracy: 0.20488
Loaded Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.98      0.50     10000
           1       0.02      0.04      0.03     10000
           2       0.00      0.00      0.00     10000
           3       0.00      0.00      0.00     10000
           4       0.00      0.00      0.00     10000

    accuracy                           0.20     50000
   macro avg       0.07      0.20      0.10     50000
weighted avg       0.07      0.20      0.10     50000

