Importing necessary libraries

In [1]:
import googleapiclient.discovery
import googleapiclient.errors
import pandas as pd
import re
import string

# Import API key from config.py
from config import API_KEY

Setting up the YouTube API

In [2]:
api_service_name = "youtube"
api_version = "v3"

youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey=API_KEY)

# Request video comments and store in dataframe
request = youtube.commentThreads().list(
    part="snippet",
    videoId="pL064KP46UE",
    maxResults=100
)
response = request.execute()

comments = []

# Loop through response and store comments in list. Retrieves the snippet of the top level comment and the text display
# top level comment is the comment that is not a reply to another comment
for item in response['items']:
    comment_text = item['snippet']['topLevelComment']['snippet']['textDisplay']
    comments.append(comment_text)

df = pd.DataFrame(comments, columns=['comment'])

# df.to_csv('comments.csv', index=False)

df.head(100)

Unnamed: 0,comment
0,Test your Office know-how with The Office Holi...
1,&quot;I threw the keys in out of anger&quot; <...
2,phyllis snorting &amp; dwight opening the door...
3,Tho half of this actually made the final cut ..
4,"<a href=""https://www.youtube.com/watch?v=pL064..."
...,...
95,I threw my keys in out of anger
96,When Creed says does it have to be clean..... ...
97,"<a href=""https://www.youtube.com/watch?v=pL064..."
98,That toilet mesh invention needs to be a real ...


In [3]:
# data type of the dataframe
print(df.dtypes)

comment    object
dtype: object


Downloading resources and setting up the model

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:992)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:992)>
[nltk_data] Error loading vader_lexicon: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:992)>


False

Preprocessing the data

In [6]:
def preprocess_comment(comment):
    # Remove HTML tags
    comment = re.sub(r'<.*?>', '', comment)

    # Replace HTML character codes with ASCII equivalent
    comment = re.sub(r'&amp;', '&', comment)
    comment = re.sub(r'&quot;', '"', comment)
    comment = re.sub(r'&#39;', "'", comment)

    # Convert text to lowercase
    comment = comment.lower()

    # Remove URLs
    comment = re.sub(r'http\S+', '', comment)

    # Remove punctuation (optional: consider keeping emoticons)
    comment = comment.translate(str.maketrans('', '', string.punctuation))

    # Tokenize and remove stopwords
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(comment)
    filtered_comment = [word for word in word_tokens if word not in stop_words]

    return ' '.join(filtered_comment)

In [7]:
# Apply preprocessing to each comment
df['processed_comment'] = df['comment'].apply(preprocess_comment)

Defining and applying sentiment analysis function

In [8]:
def analyze_sentiment_vader(comment):
    sia = SentimentIntensityAnalyzer()
    score = sia.polarity_scores(comment)
    compound = score['compound']
    if compound >= 0.05:
        return 'Positive'
    elif compound <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'


Collating the results and displaying the dataframe

In [9]:
# Apply sentiment analysis on the processed comments
df['sentiment'] = df['processed_comment'].apply(analyze_sentiment_vader)

# Display the DataFrame
display(df.head(10))

Unnamed: 0,comment,processed_comment,sentiment
0,Test your Office know-how with The Office Holi...,test office knowhow office holiday quiz premie...,Positive
1,&quot;I threw the keys in out of anger&quot; <...,threw keys anger killed 😂,Negative
2,phyllis snorting &amp; dwight opening the door...,phyllis snorting dwight opening door w butt to...,Neutral
3,Tho half of this actually made the final cut ..,tho half actually made final cut,Negative
4,"<a href=""https://www.youtube.com/watch?v=pL064...",141 love dwight exhibits objects like art gall...,Positive
5,what do you mean so we don&#39;t have to?..the...,mean dont tothese good,Negative
6,How is Michael more mature in the deleted scenes,michael mature deleted scenes,Positive
7,Why were these scenes deleted is all I want to...,scenes deleted want knowanyone,Positive
8,Not Michael calling Pam pervy.. 😭😂,michael calling pam pervy 😭😂,Neutral
9,"I love the self awareness @<a href=""https://ww...",love self awareness 412 🤣,Positive
