**Load Data File**

Downloads the dataset CSV and saves it to the notebook's working directory as `reddit_comments.csv`.

In [17]:
import requests

# Local filename the downloaded CSV is written to; later cells load the dataset from it.
data_file = "reddit_comments.csv"

# A timeout keeps the cell from hanging forever on a stalled connection.
request = requests.get("https://drive.google.com/uc?export=download&id=1grbBKQ8SEcujIYSTiKaDbOXbFpuhTijv", timeout=60)
# Fail loudly on an HTTP error instead of silently saving an error page as the CSV.
request.raise_for_status()
with open(data_file, "wb") as file:
    file.write(request.content)

**Prepare Dataset and General Setup**

You only need to run the `pip install` code block once per session.

In [None]:
!pip install datasets transformers==4.28.0

Set Seed

Run this before the Load and Partition Dataset section so that the train, valid and test partitions are identical on every run.

In [19]:
from transformers import set_seed

# Fixed seed so the random train/valid/test partitioning done later is repeatable.
SEED = 42
set_seed(SEED)

Load and Partition Dataset

In [None]:
from datasets import load_dataset, DatasetDict, Features, ClassLabel, Value

# Load the downloaded CSV (path held in data_file) as one dataset via the "train" split.
dataset = load_dataset('csv', data_files=data_file, split="train", download_mode="reuse_cache_if_exists")

# Hold out 20% of the rows. No seed is passed here, so getting the same partitions
# on every run depends on the Set Seed cell having been executed first.
train_testvalid = dataset.train_test_split(test_size=0.2)

# Split the held-out 20% in half, giving an overall 80/10/10 division.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)

# Note the mapping: 'test' is the 'test' half of the second split and
# 'valid' is its 'train' half.
final_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

print(final_dataset)

In [21]:
def count_dataset_divide(dataset_partition, partition_name):
  """Print how many entries of a partition are negative and how many are positive.

  An entry whose "label" equals 0 is counted as negative; every other entry
  is counted as positive.

  Args:
    dataset_partition: iterable of entries that can be indexed with "label".
    partition_name: display name printed alongside the counts.
  """
  labels = [entry["label"] for entry in dataset_partition]
  negatives = sum(1 for label in labels if label == 0)
  # Anything that is not label 0 falls into the positive bucket.
  positives = len(labels) - negatives
  print("partition_name:", partition_name)
  print("positive_count: ", positives)
  print("negative_count: ", negatives)

Examine Final Dataset

In [None]:
# For each partition: print its label counts, then its first entry.
for split_key, split_title in (("train", "Train"), ("test", "Test"), ("valid", "Valid")):
  partition = final_dataset[split_key]
  count_dataset_divide(partition, split_title)
  print(partition[0], "\n")

**Test**

GPU/CPU

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
gpu_ready = torch.cuda.is_available()
device = torch.device("cuda" if gpu_ready else "cpu")

if gpu_ready:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [24]:
# Checkpoint name that the Tokenizer cell passes to AutoTokenizer.from_pretrained.
model_checkpoint = "distilbert-base-uncased"

Tokenizer

In [25]:
from transformers import AutoTokenizer

# Load the tokenizer for model_checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

Encode Dataset

In [None]:
def preprocess_function(entries):
    """Tokenize the "comment" field of a batch of dataset entries.

    Runs the notebook-level ``tokenizer`` with padding and truncation enabled
    and returns whatever the tokenizer returns.
    """
    comments = entries["comment"]
    return tokenizer(comments, padding=True, truncation=True)

# Encode the test partition in batches, recomputing rather than loading a cached result.
encoded_test_dataset = final_dataset["test"].map(
    preprocess_function,
    batched=True,
    load_from_cache_file=False,
)

**Test**

In [27]:
from transformers import AutoModelForSequenceClassification, Trainer

# Load the sequence-classification model published under model_name.
model_name = "AG6019/reddit-comment-sentiment-final"
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# A Trainer with default arguments, used below only to run predictions.
test_trainer = Trainer(model=model)

In [28]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

def compute_metrics(y_pred, labels):
    """Score predictions against the true labels.

    Args:
        y_pred: predicted class labels.
        labels: true class labels.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1" entries, each
        computed by the matching scikit-learn scorer.
    """
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(y_true=labels, y_pred=y_pred) for name, scorer in scorers.items()}

Make predictions on test set

In [None]:
# Keep only the model outputs from the prediction result. Reading the named field
# (rather than unpacking into `_`) avoids overwriting IPython's `_` last-output variable.
raw_pred = test_trainer.predict(encoded_test_dataset).predictions

Evaluate test set

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Predicted class for each example = index of the highest score in its row.
y_pred = np.argmax(raw_pred, axis=1)

labels = np.array(final_dataset["test"]["label"])

metrics = compute_metrics(y_pred, labels)
print("Evaluation Metrics")
print(metrics, "\n")

# Rows are true labels, columns are predicted labels. scikit-learn is used so this
# cell does not pull TensorFlow in just to tabulate a 2x2 matrix; labels=[0, 1]
# keeps the matrix 2x2 even if one class is absent.
conf_matrix = confusion_matrix(labels, y_pred, labels=[0, 1])
print("Confusion Matrix")
print(conf_matrix, "\n")

Examples from testing set

In [None]:
# Positions in the test partition to display.
test_ex_indices = [1, 2, 9, 10, 11, 58]

test_partition = final_dataset["test"]

print("Test Examples")
for i in test_ex_indices:
  # Index the row first; ["comment"][i] would materialise the whole column on every pass.
  print(test_partition[i]["comment"])
  print("raw prediction:", raw_pred[i])
  print("actual:", labels[i])
  print("predicted:", y_pred[i])
  print()