In [1]:
import pandas as pd

# Loading the comments dataset
comments_df = pd.read_csv("comments.csv")

In [2]:
df = pd.read_csv("comment_sample.csv")
label_counts = df["Label"].value_counts()

print(label_counts)

Label
2    151
0     77
1     72
Name: count, dtype: int64



### Sample Labeling for Testing

In preparation for testing my fine-tuned DistilBERT model, I manually labeled a sample of 300 YouTube comments. The labeling process involved categorizing comments into three sentiment classes: 2 for neutral, 1 for positive, and 0 for negative. However, as the model was fine-tuned specifically for positive and negative sentiment analysis, I plan to remove the neutral class during evaluation to focus on the target sentiments.

The positive and negative sentiment classes turned out to be randomly distributed, with approximately equal weighting. In the sample, there were 72 positive comments and 77 negative comments, while the remaining comments were labeled as neutral. This labeled sample will be used to assess the model's performance on YouTube comments, providing valuable insights into its ability to classify sentiments effectively.

In [3]:
df_test = df.drop(columns="Comment ID")

df_test = df_test[df_test["Label"] != 2]

df_test.head()

Unnamed: 0,Comment,Label
5,You guys are clueless about buyers agents. The...,0
7,This guy sounds like ChatGpt wrote him 😂,0
10,"These woke, sorry “red pill” takers, are going...",0
12,Jcal is killing it lately lol🤣,1
15,I love you JCAL!!,1


In [4]:
import pandas as pd
import re

df_test["Comment"] = df_test["Comment"].astype(str)

# Preprocess the "Comment" column in one line:
df_test["Comment"] = df_test["Comment"].str.lower().str.replace(r"[^a-z0-9\s]", "", regex=True).str.strip().str.replace(r"\s+", " ", regex=True)

In [5]:
import tensorflow as tf
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Extracting the comments from the test dataset
youtube_comments = df_test["Comment"].tolist()

# Tokenizing the YouTube comments using the same tokinzer
tokenized_youtube_comments = tokenizer(
    youtube_comments,
    padding=True,
    truncation=True,
    max_length=103,
    return_tensors="tf"
)

# Extracing input IDs and attenetion mask for the test dataset
input_ids_test = tokenized_youtube_comments["input_ids"]
attention_mask_test = tokenized_youtube_comments["attention_mask"]

# Adding tokenized data to the original DataFrame for the test dataset
df_test["input_ids"] = input_ids_test.numpy().tolist()
df_test["attention_mask"] = attention_mask_test.numpy().tolist()




  from .autonotebook import tqdm as notebook_tqdm


In [7]:
df_test.head()

df_test.to_csv("../Youtube_comment_scraper/df_test.csv", index=False)

In [9]:
import numpy as np

input_ids_test = df_test["input_ids"].tolist()
attention_mask_test = df_test["attention_mask"].tolist()

# Converting lists to NumPy arrays for YouTube comment data
input_ids_array = np.array(input_ids_test)
attention_mask_array = np.array(attention_mask_test)

# "Label" containts the ground truth for the YouTube comment data
label_test = df_test["Label"].values

# Creating a dictionary for YouTube comment data
test_data_np = {
    "input_ids": input_ids_array,
    "attention_mask": attention_mask_array
}

# Creating a TensorFlow dataset for the YouTube comment data
test_dataset = tf.data.Dataset.from_tensor_slices((test_data_np, label_test))

# Batching the YouTube comment data
test_dataset = test_dataset.batch(batch_size=32).prefetch(tf.data.AUTOTUNE)

In [10]:
from transformers import TFDistilBertModel

model_path = "../NLP_model/best_sentiment_model.keras"

# Loading the best model from local directory
loaded_model = tf.keras.models.load_model(
    model_path,
    custom_objects={"TFDistilBertModel": TFDistilBertModel}
)






In [11]:
loaded_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 103)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 103)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_distil_bert_model (TFDi  TFBaseModelOutput(last_hid   6636288   ['input_ids[0][0]',           
 stilBertModel)              den_state=(None, 103, 768)   0          'attention_mask[0][0]']      
                             , hidden_states=None, atte                                       

The fine-tuned DistilBERT model, specifically trained for binary sentiment classification, is then loaded, retaining its task-specific configuration. This enables the model to predict sentiment based on its training with similar data.

In the subsequent sections, the loaded DistilBERT model is applied to predict sentiment in the prepared YouTube comments. The analysis aims to reveal insights into the model's effectiveness in discerning sentiment within the context of user-generated content on the YouTube platform.

In [12]:
# Checking wheather the weights have been loaded correctly
for layer in loaded_model.layers:
    for weight in layer.weights:
        print(weight.name, weight.shape)

tf_distil_bert_model/distilbert/embeddings/word_embeddings/weight:0 (30522, 768)
tf_distil_bert_model/distilbert/embeddings/position_embeddings/embeddings:0 (512, 768)
tf_distil_bert_model/distilbert/embeddings/LayerNorm/gamma:0 (768,)
tf_distil_bert_model/distilbert/embeddings/LayerNorm/beta:0 (768,)
tf_distil_bert_model/distilbert/transformer/layer_._0/attention/q_lin/kernel:0 (768, 768)
tf_distil_bert_model/distilbert/transformer/layer_._0/attention/q_lin/bias:0 (768,)
tf_distil_bert_model/distilbert/transformer/layer_._0/attention/k_lin/kernel:0 (768, 768)
tf_distil_bert_model/distilbert/transformer/layer_._0/attention/k_lin/bias:0 (768,)
tf_distil_bert_model/distilbert/transformer/layer_._0/attention/v_lin/kernel:0 (768, 768)
tf_distil_bert_model/distilbert/transformer/layer_._0/attention/v_lin/bias:0 (768,)
tf_distil_bert_model/distilbert/transformer/layer_._0/attention/out_lin/kernel:0 (768, 768)
tf_distil_bert_model/distilbert/transformer/layer_._0/attention/out_lin/bias:0 (768

In [13]:
loaded_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])




In [None]:
# Making predictions on the YouTube dataset
predictions_test = loaded_model.predict(test_dataset)

# Print logits for debugging
print("Logits for the YouTube dataset:", predictions_test)

# Converting logits to probabilities using softmax
probabilities_labels = tf.nn.softmax(predictions_test, axis=-1)

# Print probabilities for debugging
print("Probabilities for YouTube dataset:", probabilities_labels)

# Getting the predicted label values
predicted_labels = tf.argmax(probabilities_labels, axis=-1)

print("Predicted Labels for YouTube comment dataset:", predicted_labels.numpy())

In [16]:
test_loss, test_accuracy = loaded_model.evaluate(test_dataset, verbose=2)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")



5/5 - 5s - loss: 0.5232 - accuracy: 0.8658 - 5s/epoch - 1s/step
Test Loss: 0.5232, Test Accuracy: 0.8658


In [15]:
y, idx, count = tf.unique_with_counts(predicted_labels)

for label, count in zip(y.numpy(), count.numpy()):
    print(f"Label {label}: {count} occurences")

Label 0: 149 occurences
