# Topic Modeling using LDA

## Importing Libraries

In [None]:
import re
import string
import nltk
import spacy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

%matplotlib inline
plt.style.use('ggplot')

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Loading Dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
SPECIFIC_VIDEO_COMMENTS_DATASET_FILE = "/content/drive/MyDrive/NLP-Powered-YouTube-Analytics/specific_video_comments.csv"

df = pd.read_csv(SPECIFIC_VIDEO_COMMENTS_DATASET_FILE)
print("Shape of Dataframe: ", df.shape)
df.head()

Shape of Dataframe:  (112, 1)


Unnamed: 0,comments
0,"I have a very good command of CSS, thanks to t..."
1,UNREAL!!!!
2,This is my first time of leaving a comment on ...
3,"I attest to the quality of Bob's teaching, I u..."
4,Hands down the BEST react tutorial I have ever...


## Preprocess the data

### **Install  Required Libraries**

In [None]:
!pip install langdetect



In [None]:
!pip install emoji



In [None]:
!pip install tqdm



### Preprocessing Class

In [None]:
import pandas as pd
import re
import string
import emoji
import spacy
from tqdm import tqdm
from langdetect import detect
from contractions import fix
from nltk.corpus import stopwords

class TextPreprocessor:
    def __init__(self, df):
        self.df = df

    # Add tqdm to show progress
    def tqdm_apply(self, series, func, func_name):
        """Lazily apply *func* to every element of *series* with a progress bar.

        Args:
            series: Sized iterable of items to transform.
            func: Callable invoked once per item.
            func_name: Label shown next to the progress bar.

        Yields:
            ``func(item)`` for each item, in iteration order.
        """
        progress = tqdm(total=len(series), desc=func_name)
        with progress:
            for element in series:
                yield func(element)
                # Advance only after the consumer has asked for the next item.
                progress.update(1)

    # Function to detect language
    def detect_language(self, text):
        """Return the language code detected for *text*, or ``'unknown'``.

        Detection is best-effort: any ordinary error raised while detecting
        (for example on empty or non-string input) yields ``'unknown'``
        instead of propagating.

        Args:
            text: The value to inspect.

        Returns:
            Whatever ``detect`` returns for *text*, or the string
            ``'unknown'`` when detection fails.
        """
        try:
            return detect(text)
        except Exception:
            # Deliberately broad, but no longer a bare ``except:`` so that
            # KeyboardInterrupt / SystemExit still stop a long-running job.
            return 'unknown'

    # Function to remove HTML tags
    def remove_html_tags(self, text):
        """Delete every ``<...>`` span from *text*.

        The match is non-greedy, so each ``<`` pairs with the nearest
        following ``>``; nothing is inserted in place of the removed tag.
        """
        tag_regex = '<.*?>'
        return re.sub(tag_regex, '', text)

    # Function to remove URLs
    def remove_url(self, text):
        """Delete ``http://``, ``https://`` and ``www.`` links from *text*.

        A link runs up to the next whitespace character; it is removed
        without inserting a replacement.
        """
        url_regex = r'https?://\S+|www\.\S+'
        return re.sub(url_regex, '', text)

    # Function to remove newlines
    def remove_newlines(self, text):
        """Replace every line break in *text* with a single space.

        Handles Unix (``\\n``), Windows (``\\r\\n``) and bare carriage-return
        (``\\r``) line endings, so no stray ``\\r`` is left behind.

        Args:
            text: The string to flatten onto one line.

        Returns:
            *text* with each line break replaced by one space.
        """
        # Replace the two-character Windows ending first so it produces one
        # space rather than two.
        return text.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')

    # Function to remove dates
    def remove_dates(self, text):
        """Remove common date expressions from *text*.

        Recognised forms: ``YYYY-MM-DD``, ``DD/MM/YY``, ``DD/MM/YYYY`` and
        ``D[D] <month name> YYYY``.

        Args:
            text: The string to clean.

        Returns:
            *text* with every matched date deleted (nothing is inserted in
            its place).
        """
        date_pattern = r'\b\d{4}-\d{2}-\d{2}\b|\b\d{2}/\d{2}/\d{2}\b|\b\d{2}/\d{2}/\d{4}\b|\b\d{1,2} (?:january|february|march|april|may|june|july|august|september|october|november|december) \d{4}\b'
        # Match month names case-insensitively: the pattern spells them in
        # lowercase, so without the flag a capitalised "12 January 2024"
        # would not be removed.
        return re.sub(date_pattern, '', text, flags=re.IGNORECASE)

    # Function to convert emojis to text
    def convert_emojis_to_text(self, text):
        """Replace emojis with their ``emoji.demojize`` names.

        After demojizing, every ``:`` in the string is turned into a space
        (this includes colons that were already in the text, not only the
        ones around emoji names), and runs of whitespace are collapsed to a
        single space.
        """
        demojized = emoji.demojize(text)
        without_colons = demojized.replace(":", " ")
        return re.sub(r'\s+', ' ', without_colons)

    # Function to convert emoticons to text
    def convert_emoticons_to_text(self, text):
        """Replace emoticons in *text* with an English description.

        Each key of the table below is used as a regular-expression pattern
        and each value as its replacement; the substitutions are applied one
        after another in the table's order.

        Args:
            text: The string to process.

        Returns:
            *text* after all substitutions have been applied.
        """
        # The table is rebuilt on every call because it is a local variable.
        #
        # NOTE(review): the keys are regex patterns, and a few contain
        # metacharacters that are not escaped (for example the `$` in ":$",
        # the `?` in "\(?_?\)", the `^` in "\(^\^\)" and the `.` in
        # ":-###.."), so those entries do not match the literal emoticon.
        # NOTE(review): the patterns carry no word-boundary guards, so short
        # keys can also match inside longer tokens (e.g. ":3" inside "10:30").
        # NOTE(review): the keys are plain (non-raw) string literals that use
        # backslash escapes such as "\)"; raw strings would avoid Python's
        # invalid-escape-sequence warnings.
        # NOTE(review): several keys appear to contain mis-encoded characters
        # -- confirm the file's encoding before relying on those entries.
        EMOTICONS = {
            u":‚Äë\)": "Happy face smiley",
            u":\)": "Happy face smiley",
            u":-\]": "Happy face smiley",
            u":\]": "Happy face smiley",
            u":-3": "Happy face smiley",
            u":3": "Happy face smiley",
            u":->": "Happy face smiley",
            u":>": "Happy face smiley",
            u"8-\)": "Happy face smiley",
            u":o\)": "Happy face smiley",
            u":-\}": "Happy face smiley",
            u":\}": "Happy face smiley",
            u":-\)": "Happy face smiley",
            u":c\)": "Happy face smiley",
            u":\^\)": "Happy face smiley",
            u"=\]": "Happy face smiley",
            u"=\)": "Happy face smiley",
            u":‚ÄëD": "Laughing, big grin or laugh with glasses",
            u":D": "Laughing, big grin or laugh with glasses",
            u"8‚ÄëD": "Laughing, big grin or laugh with glasses",
            u"8D": "Laughing, big grin or laugh with glasses",
            u"X‚ÄëD": "Laughing, big grin or laugh with glasses",
            u"XD": "Laughing, big grin or laugh with glasses",
            u"=D": "Laughing, big grin or laugh with glasses",
            u"=3": "Laughing, big grin or laugh with glasses",
            u"B\^D": "Laughing, big grin or laugh with glasses",
            u":-\)\)": "Very happy",
            u":‚Äë\(": "Frown, sad, andry or pouting",
            u":-\(": "Frown, sad, andry or pouting",
            u":\(": "Frown, sad, andry or pouting",
            u":‚Äëc": "Frown, sad, andry or pouting",
            u":c": "Frown, sad, andry or pouting",
            u":‚Äë<": "Frown, sad, andry or pouting",
            u":<": "Frown, sad, andry or pouting",
            u":‚Äë\[": "Frown, sad, andry or pouting",
            u":\[": "Frown, sad, andry or pouting",
            u":-\|\|": "Frown, sad, andry or pouting",
            u">:\[": "Frown, sad, andry or pouting",
            u":\{": "Frown, sad, andry or pouting",
            u":@": "Frown, sad, andry or pouting",
            u">:\(": "Frown, sad, andry or pouting",
            u":'‚Äë\(": "Crying",
            u":'\(": "Crying",
            u":'‚Äë\)": "Tears of happiness",
            u":'\)": "Tears of happiness",
            u"D‚Äë':": "Horror",
            u"D:<": "Disgust",
            u"D:": "Sadness",
            u"D8": "Great dismay",
            u"D;": "Great dismay",
            u"D=": "Great dismay",
            u"DX": "Great dismay",
            u":‚ÄëO": "Surprise",
            u":O": "Surprise",
            u":‚Äëo": "Surprise",
            u":o": "Surprise",
            u":-0": "Shock",
            u"8‚Äë0": "Yawn",
            u">:O": "Yawn",
            u":-\*": "Kiss",
            u":\*": "Kiss",
            u":X": "Kiss",
            u";‚Äë\)": "Wink or smirk",
            u";\)": "Wink or smirk",
            u"\*-\)": "Wink or smirk",
            u"\*\)": "Wink or smirk",
            u";‚Äë\]": "Wink or smirk",
            u";\]": "Wink or smirk",
            u";\^\)": "Wink or smirk",
            u":‚Äë,": "Wink or smirk",
            u";D": "Wink or smirk",
            u":‚ÄëP": "Tongue sticking out, cheeky, playful or blowing a raspberry",
            u":P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
            u"X‚ÄëP": "Tongue sticking out, cheeky, playful or blowing a raspberry",
            u"XP": "Tongue sticking out, cheeky, playful or blowing a raspberry",
            u":‚Äë√û": "Tongue sticking out, cheeky, playful or blowing a raspberry",
            u":√û": "Tongue sticking out, cheeky, playful or blowing a raspberry",
            u":b": "Tongue sticking out, cheeky, playful or blowing a raspberry",
            u"d:": "Tongue sticking out, cheeky, playful or blowing a raspberry",
            u"=p": "Tongue sticking out, cheeky, playful or blowing a raspberry",
            u">:P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
            u":‚Äë/": "Skeptical, annoyed, undecided, uneasy or hesitant",
            u":/": "Skeptical, annoyed, undecided, uneasy or hesitant",
            u":-[.]": "Skeptical, annoyed, undecided, uneasy or hesitant",
            u">:[(\\\)]": "Skeptical, annoyed, undecided, uneasy or hesitant",
            u">:/": "Skeptical, annoyed, undecided, uneasy or hesitant",
            u":[(\\\)]": "Skeptical, annoyed, undecided, uneasy or hesitant",
            u"=/": "Skeptical, annoyed, undecided, uneasy or hesitant",
            u"=[(\\\)]": "Skeptical, annoyed, undecided, uneasy or hesitant",
            u":L": "Skeptical, annoyed, undecided, uneasy or hesitant",
            u"=L": "Skeptical, annoyed, undecided, uneasy or hesitant",
            u":S": "Skeptical, annoyed, undecided, uneasy or hesitant",
            u":‚Äë\|": "Straight face",
            u":\|": "Straight face",
            u":$": "Embarrassed or blushing",
            u":‚Äëx": "Sealed lips or wearing braces or tongue-tied",
            u":x": "Sealed lips or wearing braces or tongue-tied",
            u":‚Äë#": "Sealed lips or wearing braces or tongue-tied",
            u":#": "Sealed lips or wearing braces or tongue-tied",
            u":‚Äë&": "Sealed lips or wearing braces or tongue-tied",
            u":&": "Sealed lips or wearing braces or tongue-tied",
            u"O:‚Äë\)": "Angel, saint or innocent",
            u"O:\)": "Angel, saint or innocent",
            u"0:‚Äë3": "Angel, saint or innocent",
            u"0:3": "Angel, saint or innocent",
            u"0:‚Äë\)": "Angel, saint or innocent",
            u"0:\)": "Angel, saint or innocent",
            u":‚Äëb": "Tongue sticking out, cheeky, playful or blowing a raspberry",
            u"0;\^\)": "Angel, saint or innocent",
            u">:‚Äë\)": "Evil or devilish",
            u">:\)": "Evil or devilish",
            u"\}:‚Äë\)": "Evil or devilish",
            u"\}:\)": "Evil or devilish",
            u"3:‚Äë\)": "Evil or devilish",
            u"3:\)": "Evil or devilish",
            u">;\)": "Evil or devilish",
            u"\|;‚Äë\)": "Cool",
            u"\|‚ÄëO": "Bored",
            u":‚ÄëJ": "Tongue-in-cheek",
            u"#‚Äë\)": "Party all night",
            u"%‚Äë\)": "Drunk or confused",
            u"%\)": "Drunk or confused",
            u":-###..": "Being sick",
            u":###..": "Being sick",
            u"<:‚Äë\|": "Dump",
            u"\(>_<\)": "Troubled",
            u"\(>_<\)>": "Troubled",
            u"\(';'\)": "Baby",
            u"\(\^\^>``": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
            u"\(\^_\^;\)": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
            u"\(-_-;\)": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
            u"\(~_~;\) \(„Éª\.„Éª;\)": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
            u"\(-_-\)zzz": "Sleeping",
            u"\(\^_-\)": "Wink",
            u"\(\(\+_\+\)\)": "Confused",
            u"\(\+o\+\)": "Confused",
            u"\(o\|o\)": "Ultraman",
            u"\^_\^": "Joyful",
            u"\(\^_\^\)/": "Joyful",
            u"\(\^O\^\)Ôºè": "Joyful",
            u"\(\^o\^\)Ôºè": "Joyful",
            u"\(__\)": "Kowtow as a sign of respect, or dogeza for apology",
            u"_\(\._\.\)_": "Kowtow as a sign of respect, or dogeza for apology",
            u"<\(_ _\)>": "Kowtow as a sign of respect, or dogeza for apology",
            u"<m\(__\)m>": "Kowtow as a sign of respect, or dogeza for apology",
            u"m\(__\)m": "Kowtow as a sign of respect, or dogeza for apology",
            u"m\(_ _\)m": "Kowtow as a sign of respect, or dogeza for apology",
            u"\('_'\)": "Sad or Crying",
            u"\(/_;\)": "Sad or Crying",
            u"\(T_T\) \(;_;\)": "Sad or Crying",
            u"\(;_;": "Sad of Crying",
            u"\(;_:\)": "Sad or Crying",
            u"\(;O;\)": "Sad or Crying",
            u"\(:_;\)": "Sad or Crying",
            u"\(ToT\)": "Sad or Crying",
            u";_;": "Sad or Crying",
            u";-;": "Sad or Crying",
            u";n;": "Sad or Crying",
            u";;": "Sad or Crying",
            u"Q\.Q": "Sad or Crying",
            u"T\.T": "Sad or Crying",
            u"QQ": "Sad or Crying",
            u"Q_Q": "Sad or Crying",
            u"\(-\.-\)": "Shame",
            u"\(-_-\)": "Shame",
            u"\(‰∏Ä‰∏Ä\)": "Shame",
            u"\(Ôºõ‰∏Ä_‰∏Ä\)": "Shame",
            u"\(=_=\)": "Tired",
            u"\(=\^\¬∑\^=\)": "cat",
            u"\(=\^\¬∑\¬∑\^=\)": "cat",
            u"=_\^=	": "cat",
            u"\(\.\.\)": "Looking down",
            u"\(\._\.\)": "Looking down",
            u"\^m\^": "Giggling with hand covering mouth",
            u"\(\„Éª\„Éª?": "Confusion",
            u"\(?_?\)": "Confusion",
            u">\^_\^<": "Normal Laugh",
            u"<\^!\^>": "Normal Laugh",
            u"\^/\^": "Normal Laugh",
            u"\Ôºà\*\^_\^\*Ôºâ": "Normal Laugh",
            u"\(\^<\^\) \(\^\.\^\)": "Normal Laugh",
            u"\(^\^\)": "Normal Laugh",
            u"\(\^\.\^\)": "Normal Laugh",
            u"\(\^_\^\.\)": "Normal Laugh",
            u"\(\^_\^\)": "Normal Laugh",
            u"\(\^\^\)": "Normal Laugh",
            u"\(\^J\^\)": "Normal Laugh",
            u"\(\*\^\.\^\*\)": "Normal Laugh",
            u"\(\^‚Äî\^\Ôºâ": "Normal Laugh",
            u"\(#\^\.\^#\)": "Normal Laugh",
            u"\Ôºà\^‚Äî\^\Ôºâ": "Waving",
            u"\(;_;\)/~~~": "Waving",
            u"\(\^\.\^\)/~~~": "Waving",
            u"\(-_-\)/~~~ \($\¬∑\¬∑\)/~~~": "Waving",
            u"\(T_T\)/~~~": "Waving",
            u"\(ToT\)/~~~": "Waving",
            u"\(\*\^0\^\*\)": "Excited",
            u"\(\*_\*\)": "Amazed",
            u"\(\*_\*;": "Amazed",
            u"\(\+_\+\) \(@_@\)": "Amazed",
            u"\(\*\^\^\)v": "Laughing,Cheerful",
            u"\(\^_\^\)v": "Laughing,Cheerful",
            u"\(\(d[-_-]b\)\)": "Headphones,Listening to music",
            u'\(-"-\)': "Worried",
            u"\(„Éº„Éº;\)": "Worried",
            u"\(\^0_0\^\)": "Eyeglasses",
            u"\(\ÔºæÔΩñ\Ôºæ\)": "Happy",
            u"\(\ÔºæÔΩï\Ôºæ\)": "Happy",
            u"\(\^\)o\(\^\)": "Happy",
            u"\(\^O\^\)": "Happy",
            u"\(\^o\^\)": "Happy",
            u"\)\^o\^\(": "Happy",
            u":O o_O": "Surprised",
            u"o_0": "Surprised",
            u"o\.O": "Surpised",
            u"\(o\.o\)": "Surprised",
            u"oO": "Surprised",
            u"\(\*Ôø£mÔø£\)": "Dissatisfied",
            u"\(‚ÄòA`\)": "Snubbed or Deflated"
        }
        # Apply every pattern in table order; earlier replacements can change
        # what later patterns see.
        for emoticon, text_rep in EMOTICONS.items():
            text = re.sub(emoticon, text_rep, text)
        return text

    # Function to expand contractions
    def expand_contractions(self, text):
        """Return *text* after passing it through ``contractions.fix``."""
        expanded = fix(text)
        return expanded

    # Function to remove punctuations
    def remove_punctuation(self, text):
        """Drop every ASCII punctuation character from *text*.

        Characters listed in ``string.punctuation`` are deleted outright;
        no space is inserted where they stood, so ``"a,b"`` becomes
        ``"ab"``.
        """
        punctuation_chars = set(string.punctuation)
        kept = [char for char in text if char not in punctuation_chars]
        return ''.join(kept)

    # Function to remove stopwords
    def remove_stopwords(self, text):
        stop_words = set(stopwords.words('english'))
        return ' '.join(word for word in text.split() if word.lower() not in stop_words)

    # Function to perform lemmatization
    def lemmatize_text(self, text):
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(text)
        lemmas = [token.lemma_ for token in doc]
        return ' '.join(lemmas)

    # Function to preprocess the text
    def preprocess_text(self):
        """Run the whole cleaning pipeline on ``self.df['comments']``.

        Steps, in order: language detection, keep rows detected as ``'en'``,
        strip HTML tags, URLs, newlines and dates, convert emojis and
        emoticons to text, lowercase, expand contractions, remove
        punctuation and stop words, lemmatize.

        Side effects:
            A ``detected_language`` column is added to the DataFrame that was
            passed to the constructor; ``self.df`` is then rebound to the
            filtered, cleaned frame.

        Returns:
            The cleaned DataFrame (English rows only).
        """
        run = self.tqdm_apply  # progress-bar wrapper used by every step

        # Detect language
        self.df['detected_language'] = list(run(self.df['comments'], self.detect_language, 'Language Detection'))

        # Filter rows where language is English. The explicit copy gives an
        # independent frame, so the column assignments below neither write to
        # a slice of the original nor raise SettingWithCopyWarning.
        self.df = self.df[self.df['detected_language'] == 'en'].copy()

        # Steps that see the text in its original case.
        # NOTE(review): convert_emojis_to_text replaces every ':' with a
        # space, so colon-based emoticons are already gone when the emoticon
        # step runs -- confirm the intended order.
        cased_steps = [
            (self.remove_html_tags, 'Removing HTML Tags'),
            (self.remove_url, 'Removing URLs'),
            (self.remove_newlines, 'Removing Newlines'),
            (self.remove_dates, 'Removing Dates'),
            (self.convert_emojis_to_text, 'Converting Emojis to Text'),
            (self.convert_emoticons_to_text, 'Converting Emoticons to Text'),
        ]
        # Steps that run on the lowercased text.
        lowercase_steps = [
            (self.expand_contractions, 'Expanding Contractions'),
            (self.remove_punctuation, 'Removing Punctuation'),
            (self.remove_stopwords, 'Removing Stopwords'),
            (self.lemmatize_text, 'Lemmatization'),
        ]

        for step, label in cased_steps:
            self.df['comments'] = list(run(self.df['comments'], step, label))

        self.df['comments'] = self.df['comments'].str.lower()

        for step, label in lowercase_steps:
            self.df['comments'] = list(run(self.df['comments'], step, label))

        return self.df


In [None]:
preprocessor = TextPreprocessor(df)
preprocessed_df = preprocessor.preprocess_text()
preprocessed_df.to_csv('preprocessed_df.csv', index=False)

Language Detection: 100%|██████████| 111/111 [00:00<00:00, 142.65it/s]
Removing HTML Tags: 100%|██████████| 106/106 [00:00<00:00, 104438.86it/s]
Removing URLs: 100%|██████████| 106/106 [00:00<00:00, 160446.13it/s]
Removing Newlines: 100%|██████████| 106/106 [00:00<00:00, 353415.12it/s]
Removing Dates: 100%|██████████| 106/106 [00:00<00:00, 81323.62it/s]
Converting Emojis to Text: 100%|██████████| 106/106 [00:00<00:00, 3874.11it/s]
Converting Emoticons to Text: 100%|██████████| 106/106 [00:00<00:00, 4257.44it/s]
Expanding Contractions: 100%|██████████| 106/106 [00:00<00:00, 41132.04it/s]
Removing Punctuation: 100%|██████████| 106/106 [00:00<00:00, 98623.83it/s]
Removing Stopwords: 100%|██████████| 106/106 [00:00<00:00, 3573.72it/s]
Lemmatization: 100%|██████████| 106/106

Unnamed: 0,comments,detected_language
0,good command css thank course react knowledge ...,en
3,attest quality bob teach use original course s...,en
4,hand well react tutorial ever,en
5,ultimately boil teaching method bob use,en
6,love way scrimba platform set,en


In [None]:
df = preprocessed_df.copy()
df.head()

Unnamed: 0,comments,detected_language
0,good command css thank course react knowledge ...,en
3,attest quality bob teach use original course s...,en
4,hand well react tutorial ever,en
5,ultimately boil teaching method bob use,en
6,love way scrimba platform set,en


## LDA Modeling via Gensim

In [None]:
import gensim
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Every item is converted to ``str`` first. ``deacc=True`` asks gensim to
    strip accent marks from the tokens.

    Yields:
        One list of tokens per input sentence, in order.
    """
    tokenized = (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )
    yield from tokenized

data_words = list(sent_to_words(df['comments'].values.tolist()))

# data_words

## Trying LDA with Unigrams

In [None]:
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)], [(1, 1), (9, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1)], [(8, 1), (23, 1), (24, 1), (25, 1), (26, 1)], [(13, 1), (22, 1), (27, 1), (28, 1), (29, 1), (30, 1)], [(18, 1), (31, 1), (32, 1), (33, 1), (34, 1)], [(18, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1)], [(21, 2), (29, 1), (31, 1), (34, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1)], [(1, 1), (4, 1), (9, 1), (24, 1), (44, 1), (45, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1)], [(1, 1), (8, 1), (21, 1), (34, 1), (60, 1), (61, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1)], [(9, 1), (14, 1), (21, 1), (31, 1), (34, 1), (45, 1), (80, 1), (81, 1), 

In [None]:
from pprint import pprint
# number of topics
num_topics = 10
# Build LDA model. A fixed random_state (the same seed the bigram model uses)
# keeps the topics comparable between runs instead of changing on every
# execution.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=42)
# Print the top keywords of each topic
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]



[(0,
  '0.031*"good" + 0.031*"course" + 0.027*"tutorial" + 0.026*"react" + '
  '0.023*"amazing" + 0.018*"project" + 0.018*"thank" + 0.017*"really" + '
  '0.015*"teacher" + 0.015*"perfect"'),
 (1,
  '0.033*"course" + 0.023*"teach" + 0.022*"great" + 0.016*"complete" + '
  '0.016*"amazing" + 0.016*"love" + 0.016*"way" + 0.016*"teacher" + '
  '0.016*"wish" + 0.015*"bob"'),
 (2,
  '0.038*"course" + 0.028*"good" + 0.028*"react" + 0.021*"thank" + 0.021*"bob" '
  '+ 0.021*"one" + 0.013*"learn" + 0.012*"teach" + 0.012*"scrimba" + '
  '0.012*"finish"'),
 (3,
  '0.052*"course" + 0.049*"react" + 0.032*"good" + 0.029*"bob" + 0.016*"thank" '
  '+ 0.015*"tutorial" + 0.015*"learn" + 0.015*"work" + 0.015*"really" + '
  '0.012*"help"'),
 (4,
  '0.049*"course" + 0.037*"thank" + 0.030*"react" + 0.026*"much" + 0.020*"bob" '
  '+ 0.019*"tutorial" + 0.016*"teach" + 0.014*"want" + 0.014*"skill" + '
  '0.013*"good"'),
 (5,
  '0.040*"course" + 0.034*"react" + 0.023*"bob" + 0.017*"learn" + '
  '0.016*"really" + 

In [None]:
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric

# Formatted topic strings of the form '0.031*"good" + 0.031*"course" + ...'
lda_topics = lda_model.show_topics(num_words=10)

# Upper-case each topic string, then strip punctuation and digits so that only
# the keywords remain.
filters = [lambda x: x.upper(), strip_punctuation, strip_numeric]
topics = [preprocess_string(topic_repr, filters) for _topic_id, topic_repr in lda_topics]

# Pool the keywords of all topics, dropping duplicates
unique_topics_set = set().union(*topics)
unique_topics_list = list(unique_topics_set)

print("Total Unique Topics: ", len(unique_topics_list))
pprint(unique_topics_list)

Total Unique Topics:  38
['TUTORIAL',
 'HELP',
 'REALLY',
 'EXPLANATION',
 'VIDEO',
 'ONE',
 'GOOD',
 'REACT',
 'MANY',
 'TEACHER',
 'GREAT',
 'LOVE',
 'REDHEART',
 'WAY',
 'PERFECT',
 'TEACH',
 'AMAZING',
 'WORK',
 'THANK',
 'WANT',
 'COURSE',
 'WELL',
 'WRITE',
 'WISH',
 'LEARN',
 'EXPLAIN',
 'CODE',
 'FINISH',
 'GET',
 'MUCH',
 'ALOT',
 'BOB',
 'SKILL',
 'SCRIMBA',
 'COMPLETE',
 'PROJECT',
 'WHOLE',
 'LIKE']


## Trying LDA with Bigrams

In [None]:
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
from gensim.models import Phrases
import gensim.corpora as corpora
from pprint import pprint


# Function to generate bigrams
def generate_bigrams(texts):
    """Train a bigram phrase model on *texts* and apply it to every document.

    Args:
        texts: Tokenized documents (one list of tokens per document).

    Returns:
        A ``(phraser, bigram_texts)`` tuple: the ``Phraser`` built from the
        trained ``Phrases`` model, and the result of applying it to each
        document in *texts*.
    """
    # Build the bigram models
    phrase_model = Phrases(texts, min_count=2, threshold=2)
    phraser = gensim.models.phrases.Phraser(phrase_model)

    bigram_texts = []
    for document in texts:
        bigram_texts.append(phraser[document])
    return phraser, bigram_texts

data_words = list(sent_to_words(df['comments'].values.tolist()))

# Generate bigrams
bigram_mod, data_words_bigrams = generate_bigrams(data_words)

## Modeling
id2word = corpora.Dictionary(data_words_bigrams)
texts = data_words_bigrams
corpus = [id2word.doc2bow(text) for text in texts]

# number of topics
num_topics = 3
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=42,
                                       chunksize=25,
                                       passes=100,
                                       per_word_topics=True)

doc_lda = lda_model[corpus]
lda_topics = lda_model.show_topics(num_words=10)

# Read each topic's keywords straight from the model as (word, weight) pairs.
# Parsing the formatted topic strings with strip_punctuation split bigram
# tokens of the form 'word_word' at the underscore, and re-applying the
# phraser to the upper-cased keyword list could not restore them, so no
# bigram ever reached the output.
topics = [
    [word.upper() for word, _weight in lda_model.show_topic(topic_id, topn=10)]
    for topic_id in range(num_topics)
]

unique_topics_set = set()

# Iterate over each sublist and add unique keywords to the set
for sublist in topics:
    unique_topics_set.update(sublist)
unique_topics_list = list(unique_topics_set)

pprint(unique_topics_list)

['FREE',
 'TUTORIAL',
 'REALLY',
 'HAND',
 'ALSO',
 'REACT',
 'GOOD',
 'MANY',
 'GREAT',
 'AMAZING',
 'TEACH',
 'THANK',
 'COURSE',
 'WELL',
 'LEARN',
 'CODE',
 'BOB',
 'QUALITY',
 'BEGINNER',
 'EVER',
 'TAKE',
 'LIKE']


# Topic Modeling using [BERTopic](https://github.com/MaartenGr/BERTopic)




In [None]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m154.1/154.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m5.2/5.2 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m85.7/85.7

## **Importing Libraries**

In [None]:
import json
import pandas as pd
from bertopic import BERTopic

## Loading the Dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
SPECIFIC_VIDEO_COMMENTS_DATASET_FILE = "/content/drive/MyDrive/NLP-Powered-YouTube-Analytics/specific_video_comments.csv"

In [None]:
df = pd.read_csv(SPECIFIC_VIDEO_COMMENTS_DATASET_FILE)
print("Shape of Dataframe: ", df.shape)
df.head()

Shape of Dataframe:  (112, 1)


Unnamed: 0,comments
0,"I have a very good command of CSS, thanks to t..."
1,UNREAL!!!!
2,This is my first time of leaving a comment on ...
3,"I attest to the quality of Bob's teaching, I u..."
4,Hands down the BEST react tutorial I have ever...


In [None]:
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")


In [None]:
docs = df["comments"]
topics, probs = topic_model.fit_transform(docs)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Getting some Info on our Topics as a DataFrame

In [None]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,67,0_the_this_and_course,"[the, this, and, course, is, to, you, of, bob,...",[Am just 2hours into this course...and most ho...
1,1,45,1_react_the_to_this,"[react, the, to, this, and, is, you, course, b...",[This is the best React tutorial I have ever s...


## Get the Keywords for a Topic


In [None]:
topic_model.get_topic(topic=0)

[('the', 0.11220714566643498),
 ('this', 0.10105922871033383),
 ('and', 0.09642755217056344),
 ('course', 0.08287803359215556),
 ('is', 0.08042936950832826),
 ('to', 0.07606510578602008),
 ('you', 0.07112173888752171),
 ('of', 0.05729048611899405),
 ('bob', 0.05149897862802611),
 ('for', 0.05117536445841007)]

In [None]:
topic_model.get_representative_docs(0)


["Am just 2hours into this course...and most honestly I can say that this is one of the best course on web dev i've ever used... hats off to FCC and Bob Ziroll💓",
 "I am now a huge fan of yours both Bob and freeCodecamp team. you guys are really doing great and haven't seen anything like this before. please keep on going. this is the first time that although the video is too long but still want to fill it up with all these knowledge even though it is not my primary/mother language. thanks again Bob for such full and balanced course.",
 "I must say, this course is soooo good. Every video first forces you to think and apply what you have learned. Every concept has been broken down to simplify and make it easier to understand. The scrimba site has been built so well. Haven't seen anything like this where we can interact with the video live. I like the way we can pause and the code is ours. Thanks Bob for this wonderful course. You are a very good instructor :)"]

In [None]:
df = pd.DataFrame({'topic': topics, 'document': docs})
df.loc[df["topic"] == 1]

Unnamed: 0,topic,document
0,1,"I have a very good command of CSS, thanks to t..."
3,1,"I attest to the quality of Bob's teaching, I u..."
4,1,Hands down the BEST react tutorial I have ever...
10,1,"Brilliant way of teaching, reinforcing all the..."
13,1,Finished a 12 hour tutorial and completed all ...
15,1,Hands down the best course so far!!! It's long...
17,1,This is definitely the best React Course! Hand...
21,1,"I'm very thankful for this course, this guy is..."
23,1,I paused at 10:45:00 when the Tenzies game was...
24,1,The thing I like about Bob Ziroll's teaching i...


In [None]:
# topic_model.visualize_topics()

In [None]:
topic_model.visualize_barchart()