In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/commentdata/youtube_english_comments_Nana.csv


# **Importing Libraries**

In [12]:
import re
try:
    import contractions
except ImportError:
    !pip install contractions
    import contractions

from contractions import fix  # Ensure contractions library is installed

import string
import nltk
# !pip install emoji

# **Dataset Loading**

In [13]:
data = pd.read_csv('/kaggle/input/commentdata/youtube_english_comments_Nana.csv')  # data contains Validate Dataset
data.shape

(200, 1)

In [14]:
print(data.info())
print(data.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Comment  200 non-null    object
dtypes: object(1)
memory usage: 1.7+ KB
None
                                             Comment
0  Hope this video was helpful 😊 Do you have expe...
1  "What you understand well, you enunciate clear...
2  Wow, what a bunch of high quality, well-prepar...
3  Your content is by far the best I've found abo...
4  Clear and concise as always. My first time pok...


# **Preprocessing**

In [15]:
df= pd.DataFrame(data) # df_mix frame contains comments of multiple languages
df.shape


(200, 1)

In [16]:
df.isnull().sum()  # count the number of missing values (NaNs) in each column of a DataFrame df.


Comment    0
dtype: int64

In [17]:
if df.isnull().values.any():   # removes rows containing missing values (NaNs) from the DataFrame (As missing value present in df, we are removing here the respective rows)
    df.dropna(inplace=True)

In [18]:
print("Shape: ",df.shape)   # shape of df after removing missing values rows
print("Unique: ",df.nunique())  #used to count the number of unique values in each column of a DataFrame df.
print("Info: ",df.info())  #used to get a concise summary of a DataFrame


Shape:  (200, 1)
Unique:  Comment    99
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Comment  200 non-null    object
dtypes: object(1)
memory usage: 1.7+ KB
Info:  None


**Removing HTML Tags**

In [19]:
# Function to remove HTML tags
df2= pd.DataFrame(df)
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

df2['Comment'] = df2['Comment'].apply(remove_html_tags)

**Removing URL's**

In [21]:
# Function to remove URLs
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)
df2['Comment'] = df2['Comment'].apply(remove_url)

**Removing New Lines**

In [22]:
# Function to remove newlines
def remove_newlines(text):
    return text.replace('\n', ' ')

df2['Comment'] = df2['Comment'].apply(remove_newlines)

**Handling Emojis**

In [23]:
!pip install emoji



In [24]:
import emoji
import re
def convert_emojis_to_text(text):
  text = emoji.demojize(text).split(":")
  text = " ".join(text)
  text = re.sub(r'\s+', ' ', text)
  return text

df2['Comment'] = df2['Comment'].apply(convert_emojis_to_text)

**Handling Emoticons**

In [25]:
EMOTICONS = {
    u":‑\)": "Happy face smiley",
    u":\)": "Happy face smiley",
    u":-\]": "Happy face smiley",
    u":\]": "Happy face smiley",
    u":-3": "Happy face smiley",
    u":3": "Happy face smiley",
    u":->": "Happy face smiley",
    u":>": "Happy face smiley",
    u"8-\)": "Happy face smiley",
    u":o\)": "Happy face smiley",
    u":-\}": "Happy face smiley",
    u":\}": "Happy face smiley",
    u":-\)": "Happy face smiley",
    u":c\)": "Happy face smiley",
    u":\^\)": "Happy face smiley",
    u"=\]": "Happy face smiley",
    u"=\)": "Happy face smiley",
    u":‑D": "Laughing, big grin or laugh with glasses",
    u":D": "Laughing, big grin or laugh with glasses",
    u"8‑D": "Laughing, big grin or laugh with glasses",
    u"8D": "Laughing, big grin or laugh with glasses",
    u"X‑D": "Laughing, big grin or laugh with glasses",
    u"XD": "Laughing, big grin or laugh with glasses",
    u"=D": "Laughing, big grin or laugh with glasses",
    u"=3": "Laughing, big grin or laugh with glasses",
    u"B\^D": "Laughing, big grin or laugh with glasses",
    u":-\)\)": "Very happy",
    u":‑\(": "Frown, sad, andry or pouting",
    u":-\(": "Frown, sad, andry or pouting",
    u":\(": "Frown, sad, andry or pouting",
    u":‑c": "Frown, sad, andry or pouting",
    u":c": "Frown, sad, andry or pouting",
    u":‑<": "Frown, sad, andry or pouting",
    u":<": "Frown, sad, andry or pouting",
    u":‑\[": "Frown, sad, andry or pouting",
    u":\[": "Frown, sad, andry or pouting",
    u":-\|\|": "Frown, sad, andry or pouting",
    u">:\[": "Frown, sad, andry or pouting",
    u":\{": "Frown, sad, andry or pouting",
    u":@": "Frown, sad, andry or pouting",
    u">:\(": "Frown, sad, andry or pouting",
    u":'‑\(": "Crying",
    u":'\(": "Crying",
    u":'‑\)": "Tears of happiness",
    u":'\)": "Tears of happiness",
    u"D‑':": "Horror",
    u"D:<": "Disgust",
    u"D:": "Sadness",
    u"D8": "Great dismay",
    u"D;": "Great dismay",
    u"D=": "Great dismay",
    u"DX": "Great dismay",
    u":‑O": "Surprise",
    u":O": "Surprise",
    u":‑o": "Surprise",
    u":o": "Surprise",
    u":-0": "Shock",
    u"8‑0": "Yawn",
    u">:O": "Yawn",
    u":-\*": "Kiss",
    u":\*": "Kiss",
    u":X": "Kiss",
    u";‑\)": "Wink or smirk",
    u";\)": "Wink or smirk",
    u"\*-\)": "Wink or smirk",
    u"\*\)": "Wink or smirk",
    u";‑\]": "Wink or smirk",
    u";\]": "Wink or smirk",
    u";\^\)": "Wink or smirk",
    u":‑,": "Wink or smirk",
    u";D": "Wink or smirk",
    u":‑P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"X‑P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"XP": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑Þ": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":Þ": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":b": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"d:": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"=p": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u">:P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑/": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u":/": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u":-[.]": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:[(\\\)]": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:/": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u":[(\\\)]": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=/": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=[(\\\)]": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u":L": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=L": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u":S": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u":‑\|": "Straight face",
    u":\|": "Straight face",
    u":$": "Embarrassed or blushing",
    u":‑x": "Sealed lips or wearing braces or tongue-tied",
    u":x": "Sealed lips or wearing braces or tongue-tied",
    u":‑#": "Sealed lips or wearing braces or tongue-tied",
    u":#": "Sealed lips or wearing braces or tongue-tied",
    u":‑&": "Sealed lips or wearing braces or tongue-tied",
    u":&": "Sealed lips or wearing braces or tongue-tied",
    u"O:‑\)": "Angel, saint or innocent",
    u"O:\)": "Angel, saint or innocent",
    u"0:‑3": "Angel, saint or innocent",
    u"0:3": "Angel, saint or innocent",
    u"0:‑\)": "Angel, saint or innocent",
    u"0:\)": "Angel, saint or innocent",
    u":‑b": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"0;\^\)": "Angel, saint or innocent",
    u">:‑\)": "Evil or devilish",
    u">:\)": "Evil or devilish",
    u"\}:‑\)": "Evil or devilish",
    u"\}:\)": "Evil or devilish",
    u"3:‑\)": "Evil or devilish",
    u"3:\)": "Evil or devilish",
    u">;\)": "Evil or devilish",
    u"\|;‑\)": "Cool",
    u"\|‑O": "Bored",
    u":‑J": "Tongue-in-cheek",
    u"#‑\)": "Party all night",
    u"%‑\)": "Drunk or confused",
    u"%\)": "Drunk or confused",
    u":-###..": "Being sick",
    u":###..": "Being sick",
    u"<:‑\|": "Dump",
    u"\(>_<\)": "Troubled",
    u"\(>_<\)>": "Troubled",
    u"\(';'\)": "Baby",
    u"\(\^\^>``": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(\^_\^;\)": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-;\)": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(~_~;\) \(・\.・;\)": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-\)zzz": "Sleeping",
    u"\(\^_-\)": "Wink",
    u"\(\(\+_\+\)\)": "Confused",
    u"\(\+o\+\)": "Confused",
    u"\(o\|o\)": "Ultraman",
    u"\^_\^": "Joyful",
    u"\(\^_\^\)/": "Joyful",
    u"\(\^O\^\)／": "Joyful",
    u"\(\^o\^\)／": "Joyful",
    u"\(__\)": "Kowtow as a sign of respect, or dogeza for apology",
    u"_\(\._\.\)_": "Kowtow as a sign of respect, or dogeza for apology",
    u"<\(_ _\)>": "Kowtow as a sign of respect, or dogeza for apology",
    u"<m\(__\)m>": "Kowtow as a sign of respect, or dogeza for apology",
    u"m\(__\)m": "Kowtow as a sign of respect, or dogeza for apology",
    u"m\(_ _\)m": "Kowtow as a sign of respect, or dogeza for apology",
    u"\('_'\)": "Sad or Crying",
    u"\(/_;\)": "Sad or Crying",
    u"\(T_T\) \(;_;\)": "Sad or Crying",
    u"\(;_;": "Sad of Crying",
    u"\(;_:\)": "Sad or Crying",
    u"\(;O;\)": "Sad or Crying",
    u"\(:_;\)": "Sad or Crying",
    u"\(ToT\)": "Sad or Crying",
    u";_;": "Sad or Crying",
    u";-;": "Sad or Crying",
    u";n;": "Sad or Crying",
    u";;": "Sad or Crying",
    u"Q\.Q": "Sad or Crying",
    u"T\.T": "Sad or Crying",
    u"QQ": "Sad or Crying",
    u"Q_Q": "Sad or Crying",
    u"\(-\.-\)": "Shame",
    u"\(-_-\)": "Shame",
    u"\(一一\)": "Shame",
    u"\(；一_一\)": "Shame",
    u"\(=_=\)": "Tired",
    u"\(=\^\·\^=\)": "cat",
    u"\(=\^\·\·\^=\)": "cat",
    u"=_\^=	": "cat",
    u"\(\.\.\)": "Looking down",
    u"\(\._\.\)": "Looking down",
    u"\^m\^": "Giggling with hand covering mouth",
    u"\(\・\・?": "Confusion",
    u"\(?_?\)": "Confusion",
    u">\^_\^<": "Normal Laugh",
    u"<\^!\^>": "Normal Laugh",
    u"\^/\^": "Normal Laugh",
    u"\（\*\^_\^\*）": "Normal Laugh",
    u"\(\^<\^\) \(\^\.\^\)": "Normal Laugh",
    u"\(^\^\)": "Normal Laugh",
    u"\(\^\.\^\)": "Normal Laugh",
    u"\(\^_\^\.\)": "Normal Laugh",
    u"\(\^_\^\)": "Normal Laugh",
    u"\(\^\^\)": "Normal Laugh",
    u"\(\^J\^\)": "Normal Laugh",
    u"\(\*\^\.\^\*\)": "Normal Laugh",
    u"\(\^—\^\）": "Normal Laugh",
    u"\(#\^\.\^#\)": "Normal Laugh",
    u"\（\^—\^\）": "Waving",
    u"\(;_;\)/~~~": "Waving",
    u"\(\^\.\^\)/~~~": "Waving",
    u"\(-_-\)/~~~ \($\·\·\)/~~~": "Waving",
    u"\(T_T\)/~~~": "Waving",
    u"\(ToT\)/~~~": "Waving",
    u"\(\*\^0\^\*\)": "Excited",
    u"\(\*_\*\)": "Amazed",
    u"\(\*_\*;": "Amazed",
    u"\(\+_\+\) \(@_@\)": "Amazed",
    u"\(\*\^\^\)v": "Laughing,Cheerful",
    u"\(\^_\^\)v": "Laughing,Cheerful",
    u"\(\(d[-_-]b\)\)": "Headphones,Listening to music",
    u'\(-"-\)': "Worried",
    u"\(ーー;\)": "Worried",
    u"\(\^0_0\^\)": "Eyeglasses",
    u"\(\＾ｖ\＾\)": "Happy",
    u"\(\＾ｕ\＾\)": "Happy",
    u"\(\^\)o\(\^\)": "Happy",
    u"\(\^O\^\)": "Happy",
    u"\(\^o\^\)": "Happy",
    u"\)\^o\^\(": "Happy",
    u":O o_O": "Surprised",
    u"o_0": "Surprised",
    u"o\.O": "Surpised",
    u"\(o\.o\)": "Surprised",
    u"oO": "Surprised",
    u"\(\*￣m￣\)": "Dissatisfied",
    u"\(‘A`\)": "Snubbed or Deflated"
}

In [26]:
def convert_emoticons_to_text(text):
    for emoticon, text_rep in EMOTICONS.items():
        text = re.sub(emoticon, text_rep, text)
    return text

df2['Comment'] = df2['Comment'].apply(convert_emoticons_to_text)

**Text Lowercasing**

In [27]:
df2['Comment'] = df2['Comment'].str.lower()


**Expanding Contractions**

In [28]:
# Function to expand contractions
def expand_contractions(text):
    return fix(text)
df2['Comment'] = df2['Comment'].apply(expand_contractions)
df2.head(5)

Unnamed: 0,Comment
0,hope this video was helpful smiling_face_with_...
1,"""what you understand well, you enunciate clear..."
2,"wow, what a bunch of high quality, well-prepar..."
3,your content is by far the best i have found a...
4,clear and concise as always. my first time pok...


**Removing Punctuations**

In [29]:
# Function to remove punctuationdf2.head(15)
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df2['Comment'] = df2['Comment'].apply(remove_punctuation)

In [30]:
print(df2.head(5))
print(df2.shape)
print(df2.info())

                                             Comment
0  hope this video was helpful smilingfacewithsmi...
1  what you understand well you enunciate clearly...
2  wow what a bunch of high quality wellprepared ...
3  your content is by far the best i have found a...
4  clear and concise as always my first time poki...
(200, 1)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Comment  200 non-null    object
dtypes: object(1)
memory usage: 1.7+ KB
None


# **Using Topic Modelling**

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def cluster_comments_lda(comments, num_topics):
    # Preprocess comments
    vectorizer = CountVectorizer(max_features=1000, stop_words='english')
    X = vectorizer.fit_transform(comments)

    # Apply LDA
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(X)

    # Get topic distributions for comments
    topic_distributions = lda.transform(X)

    # Assign each comment to the topic with the highest probability
    comment_clusters = np.argmax(topic_distributions, axis=1)

    return comment_clusters

# Example usage
comments = df2['Comment'].tolist()
num_topics = 4  # Number of topics
comment_clusters = cluster_comments_lda(comments, num_topics)
unique_labels = np.unique(comment_clusters)
print("Distinct Labels Using Topic Modelling: ", unique_labels)


Distinct Labels Using Topic Modelling:  [0 1 2 3]


In [32]:
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity

def calculate_silhouette_score(comments, comment_clusters):
    # Convert comments to topic distributions
    vectorizer = CountVectorizer(max_features=1000, stop_words='english')
    X = vectorizer.fit_transform(comments)

    # Calculate pairwise cosine similarity between topic distributions of comments
    topic_distributions = LatentDirichletAllocation(n_components=num_topics, random_state=42).fit_transform(X)
    pairwise_similarity = cosine_similarity(topic_distributions)

    # Calculate Silhouette score
    silhouette_avg = silhouette_score(pairwise_similarity, comment_clusters)
    print(f"Silhouette Score for Topic Modelling based Clustering: {silhouette_avg}")

    return silhouette_avg

# Example usage
silhouette_avg = calculate_silhouette_score(comments, comment_clusters)

Silhouette Score for Topic Modelling based Clustering: 0.9243767119440074


In [33]:
def print_comments_in_clusters(df, clusters):
    # Iterate over unique cluster labels
    for cluster_label in np.unique(clusters):
        # If cluster_label is -1, it represents noise points
        if cluster_label == -1:
            print("Noise Points:")
            noise_indices = np.where(clusters == cluster_label)[0]
            for index in noise_indices:
                print(df['Comment'][index])
        else:
            print(f"Cluster {cluster_label}:")
            cluster_indices = np.where(clusters == cluster_label)[0]
            for index in cluster_indices:
                print(">> ",df['Comment'][index])
        print("\n")

# Example usage
print_comments_in_clusters(df2, comment_clusters)

Cluster 0:
>>  wow what a bunch of high quality wellprepared education material thank you so much for your effort
>>  outstanding overview of azure devops i learned more in this short concise and wellexplained video that i could have learned by doing a whole bunch of research myself many thanks
>>  this is the first video of yours i have ever seen i subscribed thank you for the high quality content
>>  hello nana  am a very big fan of you because of your teaching style  it was the video i was looking for  because now everyone is asking for azure devopsaws devops  hope you will do azure  aws devops in near future  i have watched many tutorials but none of them have quality content like yours thank you for your time and passion to teach us 
>>  amazing video i learned a lot in a very short amount of time very clear explanation thank you nana you have helped me a lot today smilingfacewithsmilingeyes 
>>  great video a lot of useful information condensed in half an hour
>>  appreciate you 

# **Summarization Using BART Model**

In [42]:
import torch


In [43]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [47]:
import numpy as np
from transformers import BartForConditionalGeneration, BartTokenizer

# Function to generate a summary for a given text using BART
def generate_bart_summary(text):
    # Load tokenizer
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

    # Load model
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
    # Move model to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True, padding="longest")
    inputs.to(device)

    try:
       # Estimate token length of the transcript
        token_length = inputs.input_ids.size(1)
        max_summary_length = token_length // 2  # Set summary length to half of the estimated token length
        # print("MAX_S_L: ",max_summary_length)

        summary_ids = model.generate(inputs.input_ids, max_length=max_summary_length, num_beams=4, early_stopping=True, min_length=10)

        # Decode and return the summary
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error processing text: {e}")
        summary = ""

    return summary

# Function to break down text into smaller chunks
def chunk_text(text, chunk_size=2048):
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
    return chunks

# Function to generate summaries for each cluster and concatenate them
def summarize_clusters(df, clusters):
    # Initialize an empty list to store summaries of each cluster
    cluster_summaries = []

    # Iterate over unique cluster labels
    for cluster_label in np.unique(clusters):
        # Get comments belonging to the current cluster
        if cluster_label == -1:
            cluster_comments = df.loc[clusters == cluster_label, 'Comment'].tolist()
        else:
            cluster_comments = df.loc[clusters == cluster_label, 'Comment'].tolist()

        # Concatenate comments into a single string
        cluster_text = ' '.join(cluster_comments)

        # Chunk the cluster text into smaller parts
        text_chunks = chunk_text(cluster_text)

        # Generate summary for each chunk and concatenate them
        chunk_summaries = [generate_bart_summary(chunk) for chunk in text_chunks]
        cluster_summary = ' '.join(chunk_summaries)

        # Append the summary to the list of summaries
        cluster_summaries.append(cluster_summary)

    # Concatenate summaries of all clusters
    all_clusters_summary = ' '.join(cluster_summaries)

    return all_clusters_summary

# Example usage
summary = summarize_clusters(df2, comment_clusters)
print(summary)


wow what a bunch of high quality wellprepared education material thank you so much for your effort outstanding overview of azure devops. i learned more in this short concise and wellexplained video that i could have learned by doing a whole bunch of research myself. this is the first video of yours i have ever seen i subscribed thank you for the high quality content. "This is exactly what i needed kudos being excited for this one cyclone  excellent overview and slide material great video cleared a lot of confusions everything is clear" "I learned more in this short concise and wellexplained video that i could have learned by doing a whole bunch of research myself many thanks this is the first video of yours i have ever seen i subscribed" resentation for the company and i am doing poc with some projects and this helps me a lot great video with a very clear breakdown for every part thanks pure gold whata great didactics skills.  azure devops in a nutshell very well teaching i have not an