In [1]:
from pathlib import Path

import pandas as pd

# Draw a reproducible 300-comment sample and load the sample file for evaluation.
SAMPLE_PATH = Path("comment_sample.csv")
sample_size = 300

# Loading the comments dataset
comments_df = pd.read_csv("comments.csv")

# Fixed random_state so the same rows are drawn on every run.
comment_sample = comments_df.sample(n=sample_size, random_state=42)

# Only create the sample file when it does not exist yet. The notebook reads a
# "Label" column back from this file and describes the labels as assigned by
# hand, so re-running this cell must not overwrite an already-labeled file.
# NOTE(review): presumably the labels are added directly to this CSV — confirm.
# Delete the file deliberately if a fresh sample (e.g. a new sample_size) is needed.
if not SAMPLE_PATH.exists():
    comment_sample.to_csv(SAMPLE_PATH, index=False)

df = pd.read_csv(SAMPLE_PATH)

In [10]:
# Tally how many sampled comments carry each sentiment label.
label_counts = df.loc[:, "Label"].value_counts()

# Printed (rather than displayed) so the counts appear as plain text.
print(label_counts)

Label
2    151
0     77
1     72
Name: count, dtype: int64



### Sample Labeling for Testing

In preparation for testing my fine-tuned DistilBERT model, I manually labeled a sample of 300 YouTube comments. The labeling process involved categorizing comments into three sentiment classes: 2 for neutral, 1 for positive, and 0 for negative. However, as the model was fine-tuned specifically for positive and negative sentiment analysis, I plan to remove the neutral class during evaluation to focus on the target sentiments.

The positive and negative classes turned out to be roughly balanced: the sample contains 72 positive and 77 negative comments, while the remaining 151 comments were labeled as neutral. Once the neutral comments are removed, the 149 positive and negative comments will be used to assess the model's performance on YouTube comments, providing insight into how effectively it classifies sentiment.

In [19]:
# Label value the manual labeling used for neutral comments (0 = negative, 1 = positive).
NEUTRAL_LABEL = 2

# Build the evaluation frame in a single, re-runnable chain:
#   1. drop the "Comment ID" column,
#   2. keep only the positive/negative rows,
#   3. take an explicit copy so that columns added to df_test later are written
#      to an independent frame instead of a slice of `df` (assigning to a
#      filtered slice raises pandas' SettingWithCopyWarning).
# The original row index is kept as-is.
df_test = (
    df.drop(columns="Comment ID")
    .loc[lambda frame: frame["Label"] != NEUTRAL_LABEL]
    .copy()
)

df_test.head()

Unnamed: 0,Comment,Label
5,You guys are clueless about buyers agents. The...,0
7,This guy sounds like ChatGpt wrote him 😂,0
10,"These woke, sorry “red pill” takers, are going...",0
12,Jcal is killing it lately lol🤣,1
15,I love you JCAL!!,1


In [22]:
import tensorflow as tf
from transformers import DistilBertTokenizer

# Fixed sequence length: every comment is padded or truncated to this many tokens.
# NOTE(review): presumably this matches the length used during fine-tuning — confirm.
MAX_SEQ_LENGTH = 103

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Extracting the comments from the test dataset
youtube_comments = df_test["Comment"].tolist()

# Tokenizing the YouTube comments using the same tokenizer.
# padding="max_length" pads every row to MAX_SEQ_LENGTH, so the tensor width is
# always MAX_SEQ_LENGTH instead of depending on the longest comment that happens
# to be in this sample (which is what padding=True would give).
tokenized_youtube_comments = tokenizer(
    youtube_comments,
    padding="max_length",
    truncation=True,
    max_length=MAX_SEQ_LENGTH,
    return_tensors="tf",
)

# Extracting input IDs and attention mask for the test dataset
input_ids_test = tokenized_youtube_comments["input_ids"]
attention_mask_test = tokenized_youtube_comments["attention_mask"]

# Adding tokenized data to the DataFrame for the test dataset.
# .assign builds a new frame rather than writing into a filtered slice, which
# avoids SettingWithCopyWarning and keeps the cell safe to re-run.
df_test = df_test.assign(
    input_ids=input_ids_test.numpy().tolist(),
    attention_mask=attention_mask_test.numpy().tolist(),
)

In [23]:
# Preview the first rows; df_test now also carries the input_ids and attention_mask columns.
df_test.head()

Unnamed: 0,Comment,Label,input_ids,attention_mask
5,You guys are clueless about buyers agents. The...,0,"[101, 2017, 4364, 2024, 9789, 3238, 2055, 1739...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
7,This guy sounds like ChatGpt wrote him 😂,0,"[101, 2023, 3124, 4165, 2066, 11834, 21600, 21...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ..."
10,"These woke, sorry “red pill” takers, are going...",0,"[101, 2122, 8271, 1010, 3374, 1523, 2417, 1735...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
12,Jcal is killing it lately lol🤣,1,"[101, 29175, 2389, 2003, 4288, 2009, 9906, 100...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
15,I love you JCAL!!,1,"[101, 1045, 2293, 2017, 29175, 2389, 999, 999,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."


In [24]:
# Pull the per-row token ID and mask lists back out of the DataFrame columns
input_ids_test = list(df_test["input_ids"])
attention_mask_test = list(df_test["attention_mask"])

# Stack the row lists into int32 TensorFlow tensors
input_ids_tensor = tf.convert_to_tensor(value=input_ids_test, dtype=tf.int32)
attention_mask_tensor = tf.convert_to_tensor(
    value=attention_mask_test, dtype=tf.int32
)

# Sanity check: both tensors should report the same shape
print("Input IDs Tensor Shape:", input_ids_tensor.shape)
print("Attention Mask Tensor Shape:", attention_mask_tensor.shape)

Input IDs Tensor Shape: (149, 103)
Attention Mask Tensor Shape: (149, 103)


In [31]:
# Location of the exported SavedModel directory, relative to this notebook.
model_path = "../NLP_model/best_sentiment_model"

# Restore the SavedModel from disk for inference.
loaded_model = tf.saved_model.load(export_dir=model_path)