# Topic: Analysing Public Perception of Mr. Beast's Controversial Videos on YouTube Through Text Mining

## 1. Load libraries

In [None]:
import nltk
import re
import numpy as np
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string,emoji
from cleantext import clean
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from wordcloud import WordCloud, STOPWORDS
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Download the NLTK resources used below:
#   punkt / punkt_tab -> word_tokenize (NLTK >= 3.9 loads 'punkt_tab' instead of 'punkt')
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# NOTE(review): copy-on-write is switched off here but switched on again in the
# next cell before any dataframe is created -- confirm which setting is intended.
pd.set_option("mode.copy_on_write", False)

## 2. Read data and select target columns

In [None]:
# Turn on pandas copy-on-write mode
pd.options.mode.copy_on_write = True

# Load the tab-separated comment export into a dataframe
Yt_comments = pd.read_csv(r'./YouTube_comments_20240616.txt', sep='\t')

# Keep only the columns used in the analysis (in this order)
selected_columns = [
    'VideoID',
    'CommentPublished',
    'CommentTextDisplay',
    'CommentAuthorName',
    'CommentLikeCount',
]
Yt_comments = Yt_comments.loc[:, selected_columns]

## 3. Data pre-processing

In [None]:
# Explore data structure: the bare expression displays the dataframe as the cell output
Yt_comments

In [None]:
# Summarise column dtypes and non-null counts
print("Information of Data set")
Yt_comments.info()

# Number of missing values per column
print(Yt_comments.isna().sum())

# Drop every row that has a missing value in any of the selected columns
Yt_comments = Yt_comments.dropna(axis=0, how='any')

# Parse the publication timestamp, then derive a 'YYYY-MM' label for monthly grouping
Yt_comments['CommentPublished'] = pd.to_datetime(Yt_comments['CommentPublished'])
Yt_comments['CommentPublished_YM'] = (
    Yt_comments['CommentPublished'].dt.strftime('%Y-%m')
)

### 3.1 Set filter on date

In [None]:
# Bar chart: number of comments per publication year-month
gb_date = (
    Yt_comments
    .groupby('CommentPublished_YM')['CommentTextDisplay']
    .count()
    .reset_index(name="count_comments")
)

plt.figure(figsize=(10, 6))
plt.bar(gb_date['CommentPublished_YM'], gb_date["count_comments"], color='skyblue')
plt.title('Count of comments by year and month')
plt.xlabel('Year-month')
plt.ylabel('Count of comments')
plt.grid(axis='y')

# Write each bar's count just above the bar (bars sit at x = 0, 1, 2, ...)
for bar_position, bar_height in enumerate(gb_date["count_comments"]):
    plt.text(bar_position, bar_height + 0.5, str(bar_height), ha='center')

plt.show()



# Keep only the comments published in November 2023
Yt_comments = Yt_comments.loc[Yt_comments['CommentPublished_YM'] == '2023-11']

# Two-digit day of month, used as the grouping key for the daily counts
Yt_comments['CommentPublished_D'] = Yt_comments['CommentPublished'].dt.strftime('%d')

gb_date_cms = (
    Yt_comments
    .groupby('CommentPublished_D')['CommentTextDisplay']
    .count()
    .reset_index(name='count_comments')
)

# Line chart: number of comments per day in November 2023
plt.figure(figsize=(10, 6))
plt.plot(
    gb_date_cms['CommentPublished_D'],
    gb_date_cms['count_comments'],
    marker='o',
    linestyle='--',
)
plt.title('Count of comments by day in Nov-2023')
plt.xlabel('day')
plt.ylabel('Count of comments')
plt.show()

### 3.2 Text Preprocessing 


In [None]:
# Define function to detect non-English comments
def is_english(text):
    """Return True when langdetect classifies ``text`` as English ('en').

    ``text`` is converted with ``str()`` before detection. If langdetect
    raises ``LangDetectException`` the comment is treated as non-English
    and False is returned.
    """
    # langdetect is non-deterministic unless a seed is set; pin it so that
    # re-running the notebook keeps/removes the same comments.
    DetectorFactory.seed = 0
    try:
        return detect(str(text)) == 'en'
    except LangDetectException:
        return False
  
# Flag every comment as English (True) or not (False)
Yt_comments['is_eng'] = Yt_comments['CommentTextDisplay'].apply(is_english)

# Pie chart: share of English vs. non-English comments
eng_valid_cms = (
    Yt_comments
    .groupby('is_eng')['CommentTextDisplay']
    .count()
    .reset_index(name='count_comments')
)
eng_valid_cms['Percentage'] = (
    eng_valid_cms["count_comments"]
    .div(eng_valid_cms["count_comments"].sum())
    .mul(100)
)

plt.pie(
    eng_valid_cms['Percentage'],
    labels=eng_valid_cms['is_eng'],
    autopct='%.1f%%',
    startangle=140,
)
plt.show()

# Keep only the English comments and report how many rows were dropped
Yt_comments_eng_cms = Yt_comments.loc[Yt_comments['is_eng']]
removed_num = (
    Yt_comments['CommentTextDisplay'].count()
    - Yt_comments_eng_cms['CommentTextDisplay'].count()
)
print(f"Remove {removed_num} rows, left {Yt_comments_eng_cms['CommentTextDisplay'].count()} rows")

In [None]:
# Patterns for links and @-mentions
_URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
_USERNAME_PATTERN = re.compile(r'@\w+')


def remove_urls_usernames(text):
    """Return ``text`` with URLs and ``@username`` mentions deleted.

    A URL is any run of non-whitespace characters starting with ``http`` or
    ``www``; a username is ``@`` followed by word characters. Matches are
    replaced by the empty string, so surrounding whitespace is left as is.
    """
    without_urls = _URL_PATTERN.sub('', text)
    return _USERNAME_PATTERN.sub('', without_urls)

# Lower-cases text.
def to_lowercase(text):
    """Return a lower-cased copy of ``text`` (delegates to ``text.lower()``)."""
    lowered = text.lower()
    return lowered

# Translation table that deletes every character listed in string.punctuation
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def rm_punctuation(text):
    """Return ``text`` with all ``string.punctuation`` characters removed.

    Characters outside ``string.punctuation`` (e.g. typographic quotes) are
    left untouched.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strips emojis from text.
def remove_emoji(text):
    """Return ``text`` passed through ``cleantext.clean`` with ``no_emoji=True``.

    NOTE(review): ``clean`` is called with its other options left at their
    defaults, so it may normalise more than emojis -- confirm against the
    cleantext documentation.
    """
    cleaned = clean(text, no_emoji=True)
    return cleaned

# Splits text into word tokens.
def tokenise(text):
    """Return the tokens produced by NLTK's ``word_tokenize`` for ``text``."""
    tokens = word_tokenize(text)
    return tokens

from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, built on first use and cached."""
    return frozenset(stopwords.words('english'))


# Removes stop words.
def remove_stopwords(tokens):
    """Return the tokens that are not NLTK English stop words, in original order.

    Matching is an exact string comparison. The stop-word set is built once
    and reused, because this function is called once per comment.
    """
    stop_words = _english_stop_words()
    return [word for word in tokens if word not in stop_words]

# Reduces each token to its WordNet lemma.
def lemmatize(tokens):
    """Return a list with every token passed through ``WordNetLemmatizer.lemmatize``.

    The lemmatizer is called with the token only, i.e. with its default
    part-of-speech setting.
    """
    to_lemma = nltk.WordNetLemmatizer().lemmatize
    return list(map(to_lemma, tokens))

# Glues tokens back together with single spaces.
def rejoin_tokens(tokens):
    """Return the tokens joined into one string, separated by a single space."""
    separator = ' '
    return separator.join(tokens)

# Preprocessing pipeline for topic modelling.
def preprocess_text_nmf(text):
    """Clean one raw comment for TF-IDF / NMF and return it as a single string.

    Steps, applied in this order: strip URLs and @-mentions, lower-case,
    remove punctuation, remove emojis, tokenise, lemmatise, remove stop
    words, re-join the remaining tokens with spaces.

    NOTE(review): lemmatising before stop-word removal may alter some stop
    words so that they no longer match the stop-word list -- confirm whether
    the two steps should be swapped. Order kept to preserve existing results.
    """
    pipeline = (
        remove_urls_usernames,
        to_lowercase,
        rm_punctuation,
        remove_emoji,
        tokenise,
        lemmatize,
        remove_stopwords,
        rejoin_tokens,
    )
    value = text
    for step in pipeline:
        value = step(value)
    return value

# Preprocessing pipeline for VADER.
def preprocess_text_vader(text):
    """Return ``text`` with only URLs and @-mentions removed.

    Case, punctuation and emojis are left in place; only
    ``remove_urls_usernames`` is applied.
    """
    return remove_urls_usernames(text)
    

In [None]:
# Add the two cleaned text columns: one for NMF, one for VADER
Yt_comments_eng_cms = Yt_comments_eng_cms.assign(
    preprocessed_text_nmf=Yt_comments_eng_cms['CommentTextDisplay'].apply(preprocess_text_nmf),
    preprocessed_text_vader=Yt_comments_eng_cms['CommentTextDisplay'].apply(preprocess_text_vader),
)

# Drop comments whose NMF text is missing or became empty after preprocessing
Yt_comments_eng_cms = Yt_comments_eng_cms.loc[
    Yt_comments_eng_cms['preprocessed_text_nmf'].notna()
    & Yt_comments_eng_cms['preprocessed_text_nmf'].ne('')
]


## 4. Data Analysis

### 4.1 Exploratory Data Analysis

In [None]:
# Word cloud of the 20 most frequent words in the preprocessed comments
text = " ".join(Yt_comments_eng_cms['preprocessed_text_nmf'])

wordcloud = WordCloud(
    max_words=20,
    width=800,
    height=400,
    background_color='white',
    min_font_size=10,
)
wordcloud = wordcloud.generate(text)

plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
from nltk import ngrams
from collections import Counter

# Count bigrams comment by comment, so that the last word of one comment and
# the first word of the next are never counted as a bigram.
bigram_counts = Counter(
    bigram
    for comment in Yt_comments_eng_cms['preprocessed_text_nmf']
    for bigram in ngrams(comment.split(), 2)
)

# All bigrams sorted by frequency (descending)
sorted_bigrams = bigram_counts.most_common()

# Split into parallel sequences for plotting; guard against an empty corpus
if sorted_bigrams:
    bigrams, frequencies = zip(*sorted_bigrams)
else:
    bigrams, frequencies = (), ()

# Convert bigram tuples to strings for x-axis labels
bigrams = [" ".join(bigram) for bigram in bigrams]

# Bar chart of the 15 most frequent bigrams
plt.figure(figsize=(10, 5))
plt.bar(bigrams[0:15], frequencies[0:15])
plt.xlabel('Bigrams')
plt.ylabel('Frequency')
plt.title('Bigram Frequency Distribution')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Count trigrams comment by comment, so that no trigram mixes words from
# two different comments.
trigram_counts = Counter(
    trigram
    for comment in Yt_comments_eng_cms['preprocessed_text_nmf']
    for trigram in ngrams(comment.split(), 3)
)

# All trigrams sorted by frequency (descending)
sorted_trigrams = trigram_counts.most_common()

# Split into parallel sequences for plotting; guard against an empty corpus
if sorted_trigrams:
    trigrams, frequencies = zip(*sorted_trigrams)
else:
    trigrams, frequencies = (), ()

# Convert trigram tuples to strings for x-axis labels
trigrams = [" ".join(trigram) for trigram in trigrams]

# Bar chart of the 10 most frequent trigrams
plt.figure(figsize=(10, 5))
plt.bar(trigrams[0:10], frequencies[0:10])
plt.xlabel('Trigrams')
plt.ylabel('Frequency')
plt.title('Trigram Frequency Distribution')
plt.xticks(rotation=45)
plt.show()

### 4.2 Research Question 1: What potential topics are discussed in the comments of Mr Beast's YouTube video, "I Built 100 Wells in Africa"?

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
# Find the best number of topics by comparing C_v coherence across NMF models
# TF-IDF matrix of the preprocessed comments
tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                   min_df=5,
                                   stop_words='english')

tfidf_matrix = tfidf_vectorizer.fit_transform(Yt_comments_eng_cms['preprocessed_text_nmf'])

# Inputs that do not depend on the number of topics are prepared once,
# instead of being rebuilt in every iteration of the loop below.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
dictionary = Dictionary([tfidf_feature_names])
# Tokenised corpus as a list of token lists, used as coherence reference texts
tokenized_comments = (Yt_comments_eng_cms['preprocessed_text_nmf']
                      .apply(word_tokenize)
                      .tolist())

# Define the range of topic numbers to explore
topic_nums = range(5, 15)
coherence_values = []

for num_topics in topic_nums:

    # Train the NMF model (fixed seed so the curve is reproducible)
    nmf = NMF(n_components=num_topics,
              random_state=1)
    nmf.fit(tfidf_matrix)

    # Ten highest-weighted words of every topic
    top_words = [
        [tfidf_feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
        for topic in nmf.components_
    ]

    # Calculate coherence (using 'c_v' measure)
    cm = CoherenceModel(topics=top_words,
                        texts=tokenized_comments,
                        coherence='c_v',
                        dictionary=dictionary)

    coherence_values.append(cm.get_coherence())

# Plot results
plt.figure(figsize=(10, 5))
plt.plot(topic_nums, coherence_values)
plt.grid()
plt.xlabel("Number of Topics")
plt.ylabel("C_v Coherence Score")
plt.title("NMF Topic Coherence with C_v Measure")
plt.show()

In [None]:
# TF-IDF representation of the preprocessed comments
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=5,
    stop_words='english',
)
tfidf = tfidf_vectorizer.fit_transform(Yt_comments_eng_cms['preprocessed_text_nmf'])

# Fit the final NMF topic model with 7 topics; nmf_topics holds one row of
# topic weights per comment
num_topics = 7
nmf_model = NMF(n_components=num_topics, random_state=1)
nmf_topics = nmf_model.fit_transform(tfidf)

# Collect the highest-weighted words of each topic
def get_top_words(model, feature_names, n_top_words):
    """Return, for every row of ``model.components_``, its top-weighted words.

    Each inner list holds the ``n_top_words`` entries of ``feature_names``
    with the largest weights in that topic, ordered from highest to lowest
    weight.
    """
    return [
        [feature_names[i] for i in topic.argsort()[::-1][:n_top_words]]
        for topic in model.components_
    ]

# Top 11 words of every topic of the fitted model
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
top_words = get_top_words(
    model=nmf_model,
    feature_names=tfidf_feature_names,
    n_top_words=11,
)

# One line per topic: "Topic <index>: word1, word2, ..."
for i, words in enumerate(top_words):
    print(f"Topic {i}: {', '.join(words)}")

In [None]:
# Assign every comment to the topic with the largest NMF weight
Yt_comments_eng_cms['topic'] = np.argmax(nmf_topics, axis=1)

# Human-readable label for each topic index (0-6)
topic_annotation = {
    0: "People's Needs and Negative Reactions",
    1: "Opinions about Mr Beast",
    2: "Moral Judgment of Actions",
    3: "Access to Clean Water",
    4: "Racism and Savior Complex",
    5: "MrBeast's Video and Money for Charity",
    6: "Sociopolitical Issues and Aid in Africa",
}

# Attach the label of the assigned topic to every comment
Yt_comments_eng_cms['topic_tag'] = Yt_comments_eng_cms['topic'].map(topic_annotation)

In [None]:
# Create one horizontal bar chart per topic showing its 11 top words and weights
feature_names = tfidf_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(nmf_model.components_):
    # Indices of the 11 largest weights, highest first
    strongest = topic.argsort()[:-11 - 1:-1]
    top_words = [feature_names[i] for i in strongest]
    top_weights = [topic[i] for i in strongest]

    plt.figure(figsize=(6, 4))
    plt.barh(top_words, top_weights)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.title(f"{topic_annotation[topic_idx]}_(Topic {topic_idx})")
    plt.xlabel("Weights")

    plt.tight_layout()
    plt.show()

In [None]:
# Horizontal bar chart: number of comments assigned to each topic label
comments_g_topic = Yt_comments_eng_cms['topic_tag'].value_counts()

plt.figure(figsize=(6, 4))
plt.barh(
    comments_g_topic.index,
    comments_g_topic.values,
    color='darkblue',
)
plt.title('Count of Comments by Topic')
plt.xlabel('Count of Comments')
plt.ylabel('Topic')
plt.grid(alpha=0.5)
plt.show()

### 4.3 Research Question 2: How does the range of audience sentiment vary across the key topics identified (RQ1) in the comments of Mr Beast's video?

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _vader_analyzer():
    """Create the VADER analyzer on first use and reuse the same instance afterwards."""
    return SentimentIntensityAnalyzer()


def analyze_sentiment(text):
    """Return the VADER ``compound`` polarity score of ``text``.

    The analyzer is created once and shared between calls, because this
    function is applied to every comment and does not need a fresh
    analyzer each time.
    """
    scores = _vader_analyzer().polarity_scores(text)
    return scores['compound']

def categorize_sentiment(compound_score):
    """Map a compound score to 'positive', 'negative' or 'neutral'.

    Scores of 0.05 or more are 'positive', scores of -0.05 or less are
    'negative', and everything else is 'neutral'.
    """
    threshold = 0.05
    if compound_score >= threshold:
        return 'positive'
    if compound_score <= -threshold:
        return 'negative'
    return 'neutral'

In [None]:
# Add the compound sentiment score, then the category derived from that score
Yt_comments_eng_cms = Yt_comments_eng_cms.assign(
    sentiment=lambda frame: frame['preprocessed_text_vader'].apply(analyze_sentiment),
    sentiment_category=lambda frame: frame['sentiment'].apply(categorize_sentiment),
)

In [None]:
# Bar chart: number of comments in each sentiment category
sentiment_counts = Yt_comments_eng_cms['sentiment_category'].value_counts()

plt.figure(figsize=(6, 4))
plt.bar(
    sentiment_counts.index,
    sentiment_counts.values,
    color='skyblue',
)
plt.title('The Results of Sentiment Analysis')
plt.xlabel('Sentiment')
plt.ylabel('Count of Comments')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Copy-on-write is switched off for this cell (original note: "Avoid value error")
pd.set_option("mode.copy_on_write", False)

# Independent copy of the positive/negative comments with their scores
sentiment_PN = Yt_comments_eng_cms.loc[
    Yt_comments_eng_cms['sentiment_category'] != 'neutral',
    ['sentiment_category', 'sentiment'],
].copy()

# Absolute score, so both polarities share the same x-axis
sentiment_PN['adj_sentiment'] = sentiment_PN['sentiment'].abs()

# Histogram of absolute scores, coloured by sentiment category
sns.histplot(
    data=sentiment_PN,
    x="adj_sentiment",
    hue="sentiment_category",
)
plt.xlabel('Sentiments Score')
plt.ylabel('Frequency')
plt.title('Distribution of Sentiments')
plt.show()

In [None]:

# Count the non-neutral comments per (topic, sentiment) pair in long form:
# one row per pair with the columns topic_tag, sentiment_category and counts.
# The frame is passed to seaborn as is; calling .unstack() on it would turn
# it into a MultiIndex Series that no longer has these columns.
topic_sentiment_counts = (
    Yt_comments_eng_cms[Yt_comments_eng_cms['sentiment_category'] != 'neutral']
    .groupby(['topic_tag', 'sentiment_category'])
    .size()
    .reset_index(name='counts')
)

# Grouped horizontal bars: one pair of bars (one per sentiment) for every topic
plt.figure(figsize=(6, 4))
sns.barplot(x='counts',
            y='topic_tag',
            hue='sentiment_category',
            data=topic_sentiment_counts,
            palette=['#ee854a', '#4878d0'], orient='h')

plt.grid(alpha=0.5)
plt.xticks(rotation=70)
plt.xlabel('count of comments')
plt.ylabel('Topic')
plt.show()

### 4.4 Research Question 3: What does the distribution of topics and sentiments (RQ2) imply about the primary audience reactions to Mr Beast's videos?

In [None]:
# From KL divergence find best perplexity
from sklearn.manifold import TSNE
# Document-topic weights from the fitted NMF model
W = nmf_topics

# Fit t-SNE once per candidate perplexity and record the final KL divergence.
# A fixed seed is used (as for the other models in this notebook) so that the
# recorded divergences can be reproduced.
perplexity = [50, 100, 200, 300, 400]
divergence = []
for candidate in perplexity:
    model = TSNE(n_components=2, init="pca", perplexity=candidate, random_state=1)
    model.fit(W)  # only kl_divergence_ is needed, not the embedding itself
    divergence.append(model.kl_divergence_)

# Persist the results, then reload them for plotting
KL_record = pd.DataFrame({"perplexity": perplexity, 'divergence': divergence})
KL_record.to_csv("./KL_record.csv")
KL_record = pd.read_csv("./KL_record.csv")

plt.plot(KL_record['perplexity'], KL_record['divergence'], marker='*', color='red')
plt.title('KL Divergence metric')
plt.grid()
plt.xlabel('Perplexity')
plt.ylabel('Divergence')
plt.show()

In [None]:
from sklearn.manifold import TSNE
# 1. Document-topic weights from the fitted NMF model
W = nmf_topics

# 2. Project the topic weights to two dimensions with t-SNE
tsne_200 = TSNE(n_components=2, perplexity=200, random_state=1)
W_tsne200 = tsne_200.fit_transform(W)

# Index of the strongest topic for every comment
dominant_topic = np.argmax(W, axis=1)

# 3. Scatter plot, one colour per dominant topic
plt.figure(figsize=(8, 6))

df = pd.DataFrame({
    'x': W_tsne200[:, 0],
    'y': W_tsne200[:, 1],
    'label': dominant_topic,
})
groups = df.groupby('label')

for name, group in groups:
    plt.plot(group.x, group.y,
             marker='.', markersize=2, linestyle='', label=name)

# Mark every topic at the mean position of its points and list the topic
# names in a column that starts at (xy_1, -20) and moves down 5 units per topic
xy_1 = 80
xy_2 = (-15)
for topic_num in np.unique(dominant_topic):
    members = dominant_topic == topic_num
    x = W_tsne200[members, 0].mean()
    y = W_tsne200[members, 1].mean()
    plt.text(x, y, f"{topic_num}", fontsize=15, bbox=dict(facecolor='yellow', alpha=0.8))
    xy_2 -= 5
    plt.annotate(f' ({topic_num}) {topic_annotation[topic_num]}', xy=[xy_1, xy_2])

# Legend, heading of the topic-name column, and axis decorations
plt.legend(title='Topics', bbox_to_anchor=(1.15, 1), loc='upper right', markerscale=12)
plt.annotate(' Topic:', xy=[xy_1, -15])
plt.title('t-SNE Visualisation of NMF Topics')
plt.grid()
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.show()