# Assignment 4: Text Classification

    Author: Group F - Gaurav, Xiaowen Sun, Jheel Harnish Kamdar, Ruijia Xiong
    Created at: 04/09/2024

In [None]:
import time
import warnings
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from transformers import pipeline
from transformers import pipeline, BartTokenizer, BartForConditionalGeneration
from transformers.pipelines.pt_utils import KeyDataset
# Suppress every warning from here on to keep cell output short.
# NOTE: this also hides warnings that may be useful while debugging.
warnings.filterwarnings("ignore")

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Fixed seed so that dataset shuffling is reproducible between runs.
seed = 9

## 1. Data Exploration

In [None]:
# Load the train and test splits of the Yelp review dataset and print
# their summaries.
train_dataset = load_dataset('yelp_review_full', split='train')
test_dataset = load_dataset('yelp_review_full', split='test')
print(train_dataset)
print(test_dataset)

# Count rows per label in each split to check class balance.
# `Counter` is provided by `collections` (imported at the top of the file).
for split_name, split in (('Train', train_dataset), ('Test', test_dataset)):
    print(f'{split_name} dataset label values', Counter(split['label']))



*   The yelp_review_full dataset has a train dataset and a test dataset. Both datasets have two columns: text and label. Both datasets are well-labeled.
*    The train dataset has 650,000 rows of data, and the test dataset has 50,000 rows. Both datasets are balanced - each of the 5 classes has the same number of rows.
*   **For Our Assignment 4, we'll be using 10,000 rows of data from the test dataset.**






In [None]:
# Draw a reproducible 100-review working sample: shuffle the train split with
# the fixed seed, keep the first 100 rows, and mirror them in a DataFrame.
# (A larger slice such as split='train[:20%]' was used here previously.)
# NOTE(review): the sample is taken from the *train* split, while the printed
# label below and the notes above refer to the test split - confirm which one
# is intended.
dataset = load_dataset('yelp_review_full', split='train')
dataset = dataset.shuffle(seed=seed)
dataset = dataset.select(range(100))
df = dataset.to_pandas()
print(f'test data frame shape: {df.shape}')

In [None]:
# Number of sampled reviews per rating label, as text.
print(df['label'].value_counts())

# The same distribution as a bar chart, drawn on a larger canvas with a grid
# so the counts are easier to read.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='label', ax=ax)
ax.set_title('Distribution of Ratings in Yelp Reviews', fontsize=16)
ax.set_xlabel('Rating', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
ax.grid(True)
plt.show()

In [None]:
# Length of each review in whitespace-separated words, shown as a histogram.
lens = df['text'].map(lambda review: len(review.split())).tolist()
plt.hist(lens)

# 2. Sentiment Analysis

In [None]:
# Collapse the rating label into a binary sentiment target:
# labels below 3 map to 0 (negative), all other labels map to 1 (positive).
df['binary label'] = (~(df['label'] < 3)).astype(int)

In [None]:
# Sentiment-analysis pipelines keyed by the short model name used in this
# notebook. Each pipeline loads its tokenizer from the same checkpoint as its
# model, truncates and pads its inputs, and runs on `device`.
pipelines = {
    name: pipeline(
        'sentiment-analysis',
        model=checkpoint,
        tokenizer=checkpoint,
        truncation=True,
        padding=True,
        device=device,
    )
    for name, checkpoint in {
        'BERT': 'textattack/bert-base-uncased-SST-2',
        'BERT2': 'textattack/bert-base-uncased-yelp-polarity',
        'tuned_BERT': 'LiYuan/amazon-review-sentiment-analysis',
        'RoBERTa': 'textattack/roberta-base-SST-2',
        'DistilBERT': 'distilbert-base-uncased-finetuned-sst-2-english',
    }.items()
}

In [None]:
%%time
def sentiment_analysis(example):
    """Label one review with the BERT, BERT2, RoBERTa and DistilBERT pipelines.

    Args:
        example: A dataset row; its ``'text'`` field is the review to classify.

    Returns:
        A dict holding the original ``'text'`` plus one entry per model name
        containing the raw label string of that model's first prediction.
    """
    predictions = {'text': example['text']}
    for model_name in ('BERT', 'BERT2', 'RoBERTa', 'DistilBERT'):
        top_prediction = pipelines[model_name](example['text'])[0]
        predictions[model_name] = top_prediction['label']
    return predictions


# Apply the four models to every sampled review; each model's label becomes a
# new column of the resulting dataset.
dataset_labels = dataset.map(sentiment_analysis)

* Convert labels to integers: 0-Negative, 1-Positive  

In [None]:
# Copy the raw predicted labels from the mapped dataset into the DataFrame.
for col in ('BERT', 'BERT2', 'RoBERTa', 'DistilBERT'):
    df[col] = dataset_labels[col]

# For these three models the class id is taken from the last character of the
# label string.
for col in ('BERT', 'BERT2', 'RoBERTa'):
    df[col] = df[col].map(lambda raw_label: int(raw_label[-1]))

# DistilBERT labels are compared against 'NEGATIVE': that value maps to 0,
# anything else maps to 1.
df['DistilBERT'] = (df['DistilBERT'] != 'NEGATIVE').astype(int)

In [None]:
# One count plot per model showing how many reviews it assigned to each class.
# The title and x-axis label name the model and the predicted class: the plots
# previously all carried the same generic "Ratings" title and axis label, so
# the four figures could not be told apart.
for col in ['BERT', 'BERT2', 'RoBERTa', 'DistilBERT']:
    plt.figure(figsize=(10, 6))
    sns.countplot(x=col, data=df)
    plt.title(f'Distribution of {col} Predictions in Yelp Reviews', fontsize=16)
    plt.xlabel('Predicted Label (0 = Negative, 1 = Positive)', fontsize=14)
    plt.ylabel('Count', fontsize=14)
    plt.grid(True)
    plt.show()

In [None]:
# Print the first 50 reviews together with their word count and the raw label
# predicted by each model.
# `Dataset.select` expects an iterable of row indices, so a range is passed
# (capped at the dataset size) instead of a bare integer.
preview_size = min(50, len(dataset_labels))
for example in dataset_labels.select(range(preview_size)):
    print('===============================================================')
    print(len(example['text'].split()))
    print("Original Text:", example['text'])
    print('\n')
    print(f"bert: {example['BERT']}")
    print(f"bert2: {example['BERT2']}")
    print(f"roberta: {example['RoBERTa']}")
    print(f"distilbert: {example['DistilBERT']}")

In [None]:
def sentiment_intensity_analysis():
    """Plot each model's predictions split by the intensity of the true rating.

    Adds an ``'intensity'`` column to the global ``df`` ('Low' for labels
    below 2, 'Medium' for label 2, 'High' otherwise) and then shows one count
    plot per model, with the predicted class on the x-axis and the intensity
    as hue.

    Only models from ``pipelines`` that have a prediction column in ``df``
    are plotted; a pipeline without stored predictions is skipped instead of
    raising ``KeyError``.
    """
    # Creating a new column for sentiment intensity based on original ratings
    df['intensity'] = df['label'].apply(
        lambda x: 'Low' if x < 2 else 'Medium' if x == 2 else 'High'
    )

    # Not every configured pipeline necessarily has predictions stored in df.
    model_columns = [name for name in pipelines if name in df.columns]

    # Plotting model performance by sentiment intensity
    for model_name in model_columns:
        sns.countplot(x=df[model_name], hue=df['intensity'])
        plt.title(f'Sentiment Intensity Distribution for {model_name}')
        plt.xlabel('Model Prediction')
        plt.ylabel('Count')
        plt.legend(title='Sentiment Intensity')
        plt.show()


sentiment_intensity_analysis()

In [None]:
# Comparative accuracy across models by original star ratings
def comparative_accuracy_by_rating():
    """Plot, for every model, its accuracy within each original rating label.

    For each distinct value of ``df['label']`` the rows with that rating are
    selected, and each model's predictions are scored against the binary
    target stored in the ``'binary label'`` column. The result is drawn as
    one accuracy-vs-rating line per model.

    Only models from ``pipelines`` that have a prediction column in ``df``
    are evaluated; a pipeline without stored predictions is skipped instead
    of raising ``KeyError``.
    """
    ratings = sorted(df['label'].unique())
    models = [name for name in pipelines if name in df.columns]
    accuracies = {model: [] for model in models}

    for rating in ratings:
        subset = df[df['label'] == rating]
        for model in models:
            # The binary target column is created as 'binary label' (with a
            # space), so that exact name must be used here.
            acc = accuracy_score(subset['binary label'], subset[model])
            accuracies[model].append(acc)

    # Plotting
    for model, acc_list in accuracies.items():
        plt.plot(ratings, acc_list, label=model)

    plt.title('Model Accuracy by Original Star Ratings')
    plt.xlabel('Star Rating')
    plt.ylabel('Accuracy')
    plt.xticks(ratings)
    plt.legend()
    plt.grid(True)
    plt.show()


comparative_accuracy_by_rating()

# 3. Classification Tasks
## 3.a Classification Based on Summaries

## 3.b Zero-Shot Classification

In [None]:
def sum_sentiment_analysis(example):
    """Label one review summary with the BERT, BERT2, RoBERTa and DistilBERT pipelines.

    Args:
        example: A dataset row; its ``'summary'`` field is the text to classify.

    Returns:
        A dict holding the original ``'summary'`` plus one entry per model
        name (``'BERT'``, ``'BERT2'``, ``'RoBERTa'``, ``'DistilBERT'``)
        containing the raw label string of that model's first prediction.
    """
    predictions = {'summary': example['summary']}
    for model_name in ('BERT', 'BERT2', 'RoBERTa', 'DistilBERT'):
        top_prediction = pipelines[model_name](example['summary'])[0]
        predictions[model_name] = top_prediction['label']
    return predictions


# NOTE(review): `dataset_with_summaries` is not defined in the visible part of
# this notebook - presumably a dataset with a 'summary' column built in the
# summarisation step; confirm it exists before running this cell.
sum_dataset_labels = dataset_with_summaries.map(sum_sentiment_analysis)

# The mapped dataset stores predictions under the plain model names (the keys
# returned above); copy them into `sum_`-prefixed DataFrame columns.
for model in ['BERT', 'BERT2', 'RoBERTa', 'DistilBERT']:
    col = f'sum_{model}'
    df[col] = sum_dataset_labels[model]

# Convert the raw labels to integers: the class id is the last character of
# the label for the first three models; DistilBERT's 'NEGATIVE' maps to 0 and
# anything else to 1.
for col in ['sum_BERT', 'sum_BERT2', 'sum_RoBERTa']:
    df[col] = df[col].apply(lambda x: int(x[-1]))
df['sum_DistilBERT'] = np.where(df['sum_DistilBERT'] == 'NEGATIVE', 0, 1)

In [None]:
%%time
# Zero-shot classification pipelines keyed by a short model name.
zero_pipelines = {
    'BART': pipeline(
        task="zero-shot-classification",
        model="facebook/bart-large-mnli",
        device=device,
    ),
    'DeBERTa': pipeline(
        task="zero-shot-classification",
        model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
        tokenizer="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
        device=device,
    ),
    # sentence transformer: cross encoder
    'CrossEncoder': pipeline(
        task="zero-shot-classification",
        model="cross-encoder/nli-MiniLM2-L6-H768",
        tokenizer="cross-encoder/nli-MiniLM2-L6-H768",
        device=device,
    ),
}


def zero_shot_classification(example, candidate_tags, pipeline):
    """Run one zero-shot pipeline on a single review.

    Args:
        example: A dataset row; its ``'text'`` field is the text to classify.
        candidate_tags: Candidate labels handed to the pipeline.
        pipeline: The zero-shot classification pipeline to call. The name
            shadows the imported ``transformers.pipeline`` factory inside this
            function; it is kept so existing keyword callers keep working.

    Returns:
        A dict with the original ``'text'`` and the pipeline's raw output
        under ``'zeroshot'``.
    """
    zeroshot = pipeline(example['text'], candidate_tags)
    return {'text': example['text'], 'zeroshot': zeroshot}


# NOTE(review): `dataset_100` and `candidate_tags` are not defined in the
# visible part of this notebook - confirm they are created before this cell.
# `defaultdict` is provided by `collections` (imported at the top of the file).
outcome = defaultdict(list)
for name, zero_pipeline in zero_pipelines.items():
    # Bind the mapped dataset to a real name instead of `_`, which IPython
    # reserves for the last cell output.
    results = dataset_100.map(
        lambda example: zero_shot_classification(example, candidate_tags, zero_pipeline)
    )
    # The pipeline returns its labels ordered by descending score, so the
    # first label is the predicted class.
    outcome[name].extend(element['labels'][0] for element in results['zeroshot'])