# <font color="red"> ACCESSING YT API | YT ID | RETRIEVE COMMENTS


In [None]:
# %%capture
# !pip install -r requirements.txt

In [2]:
# Filter out warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
from googleapiclient.discovery import build
import os

API_KEY = os.getenv("API_KEY")
youtube = build('youtube', 'v3', developerKey=API_KEY)

In [4]:
import numpy as np
import pandas as pd
import requests

In [11]:
VIDEO_IDs = ['XqgMWVZvi7w','5530I_pYjbo','P6FORpg0KVo','iNyUmbmQQZg','TjPFZaMe2yw']

video_dict = {}
max_results = 100

def get_video_details(video_dict, max_results):
    for video_id in VIDEO_IDs:
        params = {
            'part': 'snippet',
            'videoId': video_id,
            'maxResults': max_results,  # Max per page (use pagination to get more)
            'textFormat': 'plainText',
            'key': API_KEY
        }

        response = requests.get('https://www.googleapis.com/youtube/v3/commentThreads', params=params)
        data = response.json()

        # Add error handling to check if the request was successful and if 'items' exists
        if response.status_code == 200 and 'items' in data:
            def get_comments(data):
                comments = []
                for d in data['items']:
                    comments.append(d['snippet']['topLevelComment']['snippet']['textDisplay'])
                return comments

            comments = get_comments(data)
            video_dict[video_id] = comments
        else:
            print(f"Error for video {video_id}: {data.get('error', {}).get('message', 'Unknown error')}")
            pass

get_video_details(video_dict, max_results)

# Remove videos where the number of comments is less than max_results
keys_to_remove = [k for k, v in video_dict.items() if len(v) != max_results]
for k in keys_to_remove:
    video_dict.pop(k)


In [15]:
record = []

for video_id, comments in video_dict.items():
    for indx, comment in enumerate(comments):
        record.append({
            "video_id": video_id,
            "index": indx,
            "comment": comment,
            "Sentiment score": None
        })

df = pd.DataFrame(record)
df.head(20)

Unnamed: 0,video_id,index,comment,Sentiment score
0,XqgMWVZvi7w,0,"–ó–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ, –º—ã –Ω–∞—à–ª–∏ –æ—Ç–ª–∏—á–Ω—ã–π –≤—ã–±–æ—Ä Audi R8 ...",
1,XqgMWVZvi7w,1,why do you want to feel the shift?,
2,XqgMWVZvi7w,2,,
3,XqgMWVZvi7w,3,in 7 gear the audi speed limiter got slipped,
4,XqgMWVZvi7w,4,rip r8,
5,XqgMWVZvi7w,5,11:21 6 or 7,
6,XqgMWVZvi7w,6,Ese sonido ü•∞ü•∞ü•∞ü•∞,
7,XqgMWVZvi7w,7,That bassey v10 on this GT is perfect!!!! Soun...,
8,XqgMWVZvi7w,8,I never knew why audo only built these at 600h...,
9,XqgMWVZvi7w,9,Audi has killed itself.,


# <font color="red"> PRE-PROCESSING DATA


In [None]:
df1 = df.copy()

# TEXT PREPROCESSING FUNCTION

import contractions
import re

def preprocess_comment(text):
    if not isinstance(text, str):
        return ""
    
    # Expand contractions (don't -> do not)
    text = contractions.fix(text)

    # Convert to lower case
    # text = text.lower()

    # Remove URLS
    text = re.sub(r'https?://S+|www\.\S+', '', text)

    # Clean repeated/excessive punchuations
    text = re.sub(fr'([!?.]){2,}', r'\1', text)

    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # NEVER REMOVE: 
    # - Stopwords (transformers need full context)
    # - Emojis (they carry sentiment)
    # - Negations (critical for sentiment)
    return text


# APPLY PREPROCESSING
df1['comment'] = df1['comment'].apply(preprocess_comment)
print(f"Processed {len(df1)} comments")
df1.head(20)


Processed 400 comments


Unnamed: 0,video_id,index,comment,Sentiment score
0,XqgMWVZvi7w,0,"–∑–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ, –º—ã –Ω–∞—à–ª–∏ –æ—Ç–ª–∏—á–Ω—ã–π –≤—ã–±–æ—Ä audi r8 ...",
1,XqgMWVZvi7w,1,why do you want to feel the shift?,
2,XqgMWVZvi7w,2,,
3,XqgMWVZvi7w,3,in 7 gear the audi speed limiter got slipped,
4,XqgMWVZvi7w,4,rip r8,
5,XqgMWVZvi7w,5,11:21 6 or 7,
6,XqgMWVZvi7w,6,ese sonido ü•∞ü•∞ü•∞ü•∞,
7,XqgMWVZvi7w,7,that bassey v10 on this gt is perfect!!!! soun...,
8,XqgMWVZvi7w,8,i never knew why audo only built these at 600h...,
9,XqgMWVZvi7w,9,audi has killed itself.,


In [20]:
# FILTER ONLY ENGLISH COMMENTS

# from langdetect import detect, DetectorFactory
# DetectorFactory.seed = 0 # Make it determinstic

# def is_english(text):
#     try:
#         return detect(text) == "en"
#     except:
#         return False

# df1['is_english'] = df1['comment'].apply(is_english)
# english_df = df1[df1['is_english']]
# skipped = len(df1) - len(english_df)
# print(f"Kept {len(english_df)} English comments, skipped {skipped} non-English comments.")
# df1.head(20)



# # FILTER OUT NON-ENGLISH COMMENTS
# def filter_non_english_words_keep_emojis(text):
#     # Emoji unicode range
#     emoji_ranges = (
#         "\U0001F600-\U0001F64F"  # Emoticons
#         "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
#         "\U0001F680-\U0001F6FF"  # Transport & Map
#         "\U0001F1E0-\U0001F1FF"  # Flags
#         "\U00002700-\U000027BF"  # Dingbats
#         "\U0001F900-\U0001F9FF"  # Supplemental Symbols
#         "\U00002600-\U000026FF"  # Misc symbols
#     )

#     # Pattern to keep English words and emojis
#     emoji_pattern = f"[{emoji_ranges}]"
#     english_word_pattern = r"\b[a-zA-Z]+\b"

#     # Find all English words and emojis
#     english_words = re.findall(english_word_pattern, text)
#     emojis = re.findall(emoji_pattern, text)

#     return ' '.join(english_words + emojis)

# df['comment'] = df['comment'].map(lambda x: filter_non_english_words_keep_emojis(x) if isinstance(x, str) else x)
# # df

### <font color="aqua"> 4. TOKENIZE COMMENTS

In [16]:
# import nltk
# nltk.download('punkt_tab')
# from nltk.tokenize import word_tokenize

In [None]:
# df["comment"] = df["comment"].apply(lambda x: word_tokenize(x) if isinstance(x, str) else x)

### <font color="aqua"> 5. REMOVE STOPWORDS

In [None]:
# from nltk.corpus import stopwords
# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# tmp = df.copy()
# negations = {
#     "not", "nor", "no",
#     "do", "are", "cannot", "could", "did", "does",
#     "had", "has", "have", "is", "might", "must",
#     "need", "should", "was", "were", "would", "can"
# }
# stop_words = set(stopwords.words('english'))

# stop_words = [word for word in stop_words if word not in negations]

# # df["comment"] = df["comment"].map(lambda x: [word for word in x if word not in stop_words] if isinstance(x, list) else x)
# tmp = tmp.map(lambda x: [word for word in x if word not in stop_words] if isinstance(x, list) else x)

# df = tmp.copy()

# <font color="red">Build/Train the Model
## (Use BERT and HuggingFace or VADAR), GPT say BERT is more impressive


### <font color="cyan"> 1. LOADING PRE-TRAINED MODEL FROM HUGGINGFACE

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model = TFAutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

In [None]:
df = df.map(lambda x: " ".join(x) if isinstance(x, list) else x)
df

Unnamed: 0,video_id,index,comment,Sentiment score
0,XqgMWVZvi7w,0,rev counter can not catch gearbox work think i...,
1,XqgMWVZvi7w,1,do not understand do not think gt is rwd,
2,XqgMWVZvi7w,2,camera angle is unfortunately bad although und...,
3,XqgMWVZvi7w,3,god car,
4,XqgMWVZvi7w,4,dream ‚ù§,
...,...,...,...,...
395,TjPFZaMe2yw,95,discuss secret group sucess,
396,TjPFZaMe2yw,96,also stay sober do not go benders disrupts bra...,
397,TjPFZaMe2yw,97,medhya churna planet ayurveda can help enhanci...,
398,TjPFZaMe2yw,98,hehe gave hehehehehehehehehehehehehehe,


In [None]:
tokenized = tokenizer(
    df["comment"].tolist(),
    return_tensors="tf",
    padding=True,
    truncation=True,
    max_length=128
)
tokenized

{'input_ids': <tf.Tensor: shape=(400, 128), dtype=int32, numpy=
array([[    0, 23559,  3231, ...,     1,     1,     1],
       [    0,  5016,    45, ...,     1,     1,     1],
       [    0, 25092, 11792, ...,     1,     1,     1],
       ...,
       [    0,  4567, 18321, ...,     1,     1,     1],
       [    0,   700,   700, ...,     1,     1,     1],
       [    0, 12805,   225, ...,     1,     1,     1]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(400, 128), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [None]:
# # Get model outputs
# outputs = model(tokenized)
# outputs

dataset = tf.data.Dataset.from_tensor_slices(tokenized).batch(32) # Choose a batch size that fits in memory

predictions = []
confidences = []

# Iterate over the dataset in batches and get predictions
for batch in dataset:
    outputs = model(batch)
    prediction = tf.nn.softmax(outputs.logits, axis=1)
    predictions.append(prediction.numpy())

# Concatenate the predictions from all batches
predictions = np.concatenate(predictions, axis=0)

# Get the predicted classes and confidence scores
pred_classes = tf.argmax(predictions, axis=1).numpy()
labels = ['negative', 'neutral', 'positive']
pred_labels = [labels[i] for i in pred_classes]
confidences = predictions.max(axis=1)

# %%
results_df = pd.DataFrame({
    "comment": df["comment"],  # Convert first row of df to list
    "sentiment": pred_labels,
    "confidence": confidences
})
results_df

Unnamed: 0,comment,sentiment,confidence
0,rev counter can not catch gearbox work think i...,negative,0.377174
1,do not understand do not think gt is rwd,negative,0.384964
2,camera angle is unfortunately bad although und...,negative,0.365956
3,god car,negative,0.388468
4,dream ‚ù§,negative,0.406690
...,...,...,...
395,discuss secret group sucess,positive,0.385340
396,also stay sober do not go benders disrupts bra...,negative,0.388172
397,medhya churna planet ayurveda can help enhanci...,negative,0.389105
398,hehe gave hehehehehehehehehehehehehehe,positive,0.389801


In [None]:
model.save_pretrained("roberta_web_model")
tokenizer.save_pretrained("roberta_web_model")

('roberta_web_model/tokenizer_config.json',
 'roberta_web_model/special_tokens_map.json',
 'roberta_web_model/vocab.json',
 'roberta_web_model/merges.txt',
 'roberta_web_model/added_tokens.json',
 'roberta_web_model/tokenizer.json')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# prompt: how to save the model to google drive

import os

# Define the path to save the model in your Google Drive
model_save_path = '/content/drive/MyDrive/my_sentiment_model'

# Create the directory if it doesn't exist
if not os.path.exists(model_save_path):
    os.makedirs(model_save_path)

# Save the tokenizer
tokenizer.save_pretrained(model_save_path)

# Save the TensorFlow model
model.save_pretrained(model_save_path)

print(f"Model and tokenizer saved to {model_save_path}")


Model and tokenizer saved to /content/drive/MyDrive/my_sentiment_model


In [None]:
import shutil
# shutil.make_archive('/content/drive/MyDrive/my_sentiment_model', 'zip', '/content/roberta_web_model')
shutil.make_archive("roberta_web_model", 'zip', "roberta_web_model")


'/content/roberta_web_model.zip'

In [None]:
import streamlit as st

st.title("YouTube Sentiment Analyzer")
url = st.text_input("Enter YouTube URL")

if st.button("Fetch Comments"):
    video_id = get_video_details(url)
    if video_id:
        with st.spinner("Fetching comments..."):
            comments = get_video_details(video_id)
            st.success(f"Fetched {len(comments)} comments.")
            for c in comments:
                st.write(f"- {c}")
    else:
        st.error("Invalid YouTube URL")
