In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Importing Libraries**

In [None]:
import re
try:
    import contractions
except ImportError:
    !pip install contractions
    import contractions

from contractions import fix  # Ensure contractions library is installed

import string
import nltk
# !pip install emoji

# **Loading Dataset**

In [None]:
# Read the YouTube comments CSV from the Kaggle input directory into `data`.
data = pd.read_csv('/kaggle/input/yt-2k-comments/youtube_comments_2k.csv')  # raw, unprocessed table
data.shape  # last expression of the cell, so the (rows, columns) tuple is displayed

In [None]:
data.info()


In [None]:
data.head(5)


# **Preprocessing**

In [None]:
# Working frame used by the preprocessing cells below.
# NOTE(review): pd.DataFrame(<DataFrame>) does not deep-copy its input, so `df` may share
# its underlying values with `data`.
df= pd.DataFrame(data)


In [None]:
df.shape


In [None]:
df.isnull().sum()  # count the number of missing values (NaNs) in each column of a DataFrame df.


In [None]:
# Drop every row that has a missing value in any column. dropna(inplace=True) mutates `df`
# itself and keeps the surviving rows' original index labels, so the index can have gaps afterwards.
if df.isnull().values.any():   # only act when at least one NaN is present
    df.dropna(inplace=True)

In [None]:
df.shape   # shape of df after removing missing values rows


In [None]:
df.nunique()  #used to count the number of unique values in each column of a DataFrame df.


In [None]:
df.info()  #used to get a concise summary of a DataFrame


# **Standard NLP Techniques to Preprocess**

**Removing HTML Tags**

In [None]:
# Function to remove HTML tags
# Independent copy for text cleaning: the cells below overwrite df2['Comment'] step by step,
# and pd.DataFrame(df) would not guarantee that `df` stays untouched.
df2 = df.copy()
def remove_html_tags(text):
    """Return `text` with every `<...>` span (shortest possible match) deleted."""
    return re.sub('<.*?>', r'', text)

# Strip HTML tags from every comment; the cleaned text overwrites the 'Comment' column.
df2['Comment'] = df2['Comment'].apply(remove_html_tags)

**Removing URLs**

In [None]:
# Function to remove URLs
def remove_url(text):
    """Return `text` without `http://` / `https://` links and `www.` addresses.

    Each match runs up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Remove links from every comment; the cleaned text overwrites the 'Comment' column.
df2['Comment'] = df2['Comment'].apply(remove_url)

**Removing New Lines**

In [None]:
# Function to remove newlines
def remove_newlines(text):
    """Return `text` with each newline character turned into a single space."""
    return ' '.join(text.split('\n'))

# Flatten multi-line comments onto one line; the result overwrites the 'Comment' column.
df2['Comment'] = df2['Comment'].apply(remove_newlines)

**Handling Emojis**

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install emoji

In [None]:
import emoji
import re
def convert_emojis_to_text(text):
  """Replace each emoji in `text` with its textual name and collapse whitespace.

  Args:
      text: Comment text that may contain emoji characters.

  Returns:
      The text with every emoji replaced by its name surrounded by spaces
      (e.g. " thumbs_up "), and every run of whitespace reduced to one space.
  """
  # Ask demojize for space delimiters instead of splitting its ":name:" output on ":".
  # Splitting removed every colon in the comment, not only the emoji delimiters, which
  # also destroyed text emoticons such as ":)" before the emoticon step could see them.
  text = emoji.demojize(text, delimiters=(" ", " "))
  return re.sub(r'\s+', ' ', text)

# Turn emoji into words for every comment; this runs before the emoticon (":)", "XD", ...) step.
df2['Comment'] = df2['Comment'].apply(convert_emojis_to_text)

**Handling Emoticons**

In [None]:
# Emoticon -> description table consumed by convert_emoticons_to_text, which applies the
# keys one after another with re.sub in dict order.
#
# * Keys are regular-expression patterns written as raw strings; every regex
#   metacharacter that is meant literally is escaped.
# * "[-\u2011]" matches both the ASCII hyphen and the non-breaking hyphen (U+2011), so
#   emoticons typed as ":-)" and as ":‑)" are both recognised.
# * Each key describes exactly one emoticon.
EMOTICONS = {
    r":[-\u2011]\)": "Happy face smiley",
    r":\)": "Happy face smiley",
    r":[-\u2011]\]": "Happy face smiley",
    r":\]": "Happy face smiley",
    r":[-\u2011]3": "Happy face smiley",
    r":3": "Happy face smiley",
    r":[-\u2011]>": "Happy face smiley",
    r":>": "Happy face smiley",
    r"8[-\u2011]\)": "Happy face smiley",
    r":o\)": "Happy face smiley",
    r":[-\u2011]\}": "Happy face smiley",
    r":\}": "Happy face smiley",
    r":c\)": "Happy face smiley",
    r":\^\)": "Happy face smiley",
    r"=\]": "Happy face smiley",
    r"=\)": "Happy face smiley",
    r":[-\u2011]D": "Laughing, big grin or laugh with glasses",
    r":D": "Laughing, big grin or laugh with glasses",
    r"8[-\u2011]D": "Laughing, big grin or laugh with glasses",
    r"8D": "Laughing, big grin or laugh with glasses",
    r"X[-\u2011]D": "Laughing, big grin or laugh with glasses",
    r"XD": "Laughing, big grin or laugh with glasses",
    r"=D": "Laughing, big grin or laugh with glasses",
    r"=3": "Laughing, big grin or laugh with glasses",
    r"B\^D": "Laughing, big grin or laugh with glasses",
    r":[-\u2011]\)\)": "Very happy",
    r":[-\u2011]\(": "Frown, sad, angry or pouting",
    r":\(": "Frown, sad, angry or pouting",
    r":[-\u2011]c": "Frown, sad, angry or pouting",
    r":c": "Frown, sad, angry or pouting",
    r":[-\u2011]<": "Frown, sad, angry or pouting",
    r":<": "Frown, sad, angry or pouting",
    r":[-\u2011]\[": "Frown, sad, angry or pouting",
    r":\[": "Frown, sad, angry or pouting",
    r":[-\u2011]\|\|": "Frown, sad, angry or pouting",
    r">:\[": "Frown, sad, angry or pouting",
    r":\{": "Frown, sad, angry or pouting",
    r":@": "Frown, sad, angry or pouting",
    r">:\(": "Frown, sad, angry or pouting",
    r":'[-\u2011]\(": "Crying",
    r":'\(": "Crying",
    r":'[-\u2011]\)": "Tears of happiness",
    r":'\)": "Tears of happiness",
    r"D[-\u2011]':": "Horror",
    r"D:<": "Disgust",
    r"D:": "Sadness",
    r"D8": "Great dismay",
    r"D;": "Great dismay",
    r"D=": "Great dismay",
    r"DX": "Great dismay",
    r":[-\u2011]O": "Surprise",
    r":O": "Surprise",
    r":[-\u2011]o": "Surprise",
    r":o": "Surprise",
    r":[-\u2011]0": "Shock",
    r"8[-\u2011]0": "Yawn",
    r">:O": "Yawn",
    r":[-\u2011]\*": "Kiss",
    r":\*": "Kiss",
    r":X": "Kiss",
    r";[-\u2011]\)": "Wink or smirk",
    r";\)": "Wink or smirk",
    r"\*[-\u2011]\)": "Wink or smirk",
    r"\*\)": "Wink or smirk",
    r";[-\u2011]\]": "Wink or smirk",
    r";\]": "Wink or smirk",
    r";\^\)": "Wink or smirk",
    r":[-\u2011],": "Wink or smirk",
    r";D": "Wink or smirk",
    r":[-\u2011]P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r":P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r"X[-\u2011]P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r"XP": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r":[-\u2011]Þ": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r":Þ": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r":b": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r"d:": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r"=p": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r">:P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r":[-\u2011]/": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r":/": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r":[-\u2011]\.": "Skeptical, annoyed, undecided, uneasy or hesitant",
    # The three backslash emoticons (">:\", ":\", "=\") are matched literally; a character
    # class here would also swallow ":(" and ":)".
    r">:\\": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r">:/": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r":\\": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r"=/": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r"=\\": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r":L": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r"=L": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r":S": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r":[-\u2011]\|": "Straight face",
    r":\|": "Straight face",
    r":\$": "Embarrassed or blushing",  # "$" escaped: unescaped it is the end-of-string anchor
    r":[-\u2011]x": "Sealed lips or wearing braces or tongue-tied",
    r":x": "Sealed lips or wearing braces or tongue-tied",
    r":[-\u2011]#": "Sealed lips or wearing braces or tongue-tied",
    r":#": "Sealed lips or wearing braces or tongue-tied",
    r":[-\u2011]&": "Sealed lips or wearing braces or tongue-tied",
    r":&": "Sealed lips or wearing braces or tongue-tied",
    r"O:[-\u2011]\)": "Angel, saint or innocent",
    r"O:\)": "Angel, saint or innocent",
    r"0:[-\u2011]3": "Angel, saint or innocent",
    r"0:3": "Angel, saint or innocent",
    r"0:[-\u2011]\)": "Angel, saint or innocent",
    r"0:\)": "Angel, saint or innocent",
    r":[-\u2011]b": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r"0;\^\)": "Angel, saint or innocent",
    r">:[-\u2011]\)": "Evil or devilish",
    r">:\)": "Evil or devilish",
    r"\}:[-\u2011]\)": "Evil or devilish",
    r"\}:\)": "Evil or devilish",
    r"3:[-\u2011]\)": "Evil or devilish",
    r"3:\)": "Evil or devilish",
    r">;\)": "Evil or devilish",
    r"\|;[-\u2011]\)": "Cool",
    r"\|[-\u2011]O": "Bored",
    r":[-\u2011]J": "Tongue-in-cheek",
    r"#[-\u2011]\)": "Party all night",
    r"%[-\u2011]\)": "Drunk or confused",
    r"%\)": "Drunk or confused",
    r":[-\u2011]###\.\.": "Being sick",
    r":###\.\.": "Being sick",
    r"<:[-\u2011]\|": "Dump",
    r"\(>_<\)": "Troubled",
    r"\(>_<\)>": "Troubled",
    r"\(';'\)": "Baby",
    r"\(\^\^>``": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    r"\(\^_\^;\)": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    r"\(-_-;\)": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    r"\(~_~;\)": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    r"\(・\.・;\)": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    r"\(-_-\)zzz": "Sleeping",
    r"\(\^_-\)": "Wink",
    r"\(\(\+_\+\)\)": "Confused",
    r"\(\+o\+\)": "Confused",
    r"\(o\|o\)": "Ultraman",
    r"\^_\^": "Joyful",
    r"\(\^_\^\)/": "Joyful",
    r"\(\^O\^\)／": "Joyful",
    r"\(\^o\^\)／": "Joyful",
    r"\(__\)": "Kowtow as a sign of respect, or dogeza for apology",
    r"_\(\._\.\)_": "Kowtow as a sign of respect, or dogeza for apology",
    r"<\(_ _\)>": "Kowtow as a sign of respect, or dogeza for apology",
    r"<m\(__\)m>": "Kowtow as a sign of respect, or dogeza for apology",
    r"m\(__\)m": "Kowtow as a sign of respect, or dogeza for apology",
    r"m\(_ _\)m": "Kowtow as a sign of respect, or dogeza for apology",
    r"\('_'\)": "Sad or Crying",
    r"\(/_;\)": "Sad or Crying",
    r"\(T_T\)": "Sad or Crying",
    r"\(;_;\)": "Sad or Crying",
    r"\(;_;": "Sad or Crying",
    r"\(;_:\)": "Sad or Crying",
    r"\(;O;\)": "Sad or Crying",
    r"\(:_;\)": "Sad or Crying",
    r"\(ToT\)": "Sad or Crying",
    r";_;": "Sad or Crying",
    r";-;": "Sad or Crying",
    r";n;": "Sad or Crying",
    r";;": "Sad or Crying",
    r"Q\.Q": "Sad or Crying",
    r"T\.T": "Sad or Crying",
    r"QQ": "Sad or Crying",
    r"Q_Q": "Sad or Crying",
    r"\(-\.-\)": "Shame",
    r"\(-_-\)": "Shame",
    r"\(一一\)": "Shame",
    r"\(；一_一\)": "Shame",
    r"\(=_=\)": "Tired",
    r"\(=\^\·\^=\)": "cat",
    r"\(=\^\·\·\^=\)": "cat",
    r"=_\^=": "cat",
    r"\(\.\.\)": "Looking down",
    r"\(\._\.\)": "Looking down",
    r"\^m\^": "Giggling with hand covering mouth",
    r"\(\・\・\?": "Confusion",
    # "?" escaped: as a quantifier it made "(" and "_" optional, so the pattern matched
    # every bare ")" in a comment.
    r"\(\?_\?\)": "Confusion",
    r">\^_\^<": "Normal Laugh",
    r"<\^!\^>": "Normal Laugh",
    r"\^/\^": "Normal Laugh",
    r"\（\*\^_\^\*）": "Normal Laugh",
    r"\(\^<\^\)": "Normal Laugh",
    r"\(\^\.\^\)": "Normal Laugh",
    r"\(\^_\^\.\)": "Normal Laugh",
    r"\(\^_\^\)": "Normal Laugh",
    r"\(\^\^\)": "Normal Laugh",
    r"\(\^J\^\)": "Normal Laugh",
    r"\(\*\^\.\^\*\)": "Normal Laugh",
    r"\(\^—\^\）": "Normal Laugh",
    r"\(#\^\.\^#\)": "Normal Laugh",
    r"\（\^—\^\）": "Waving",
    r"\(;_;\)/~~~": "Waving",
    r"\(\^\.\^\)/~~~": "Waving",
    r"\(-_-\)/~~~": "Waving",
    r"\(\$\·\·\)/~~~": "Waving",
    r"\(T_T\)/~~~": "Waving",
    r"\(ToT\)/~~~": "Waving",
    r"\(\*\^0\^\*\)": "Excited",
    r"\(\*_\*\)": "Amazed",
    r"\(\*_\*;": "Amazed",
    r"\(\+_\+\)": "Amazed",
    r"\(@_@\)": "Amazed",
    r"\(\*\^\^\)v": "Laughing,Cheerful",
    r"\(\^_\^\)v": "Laughing,Cheerful",
    r"\(\(d\[-_-\]b\)\)": "Headphones,Listening to music",
    r'\(-"-\)': "Worried",
    r"\(ーー;\)": "Worried",
    r"\(\^0_0\^\)": "Eyeglasses",
    r"\(\＾ｖ\＾\)": "Happy",
    r"\(\＾ｕ\＾\)": "Happy",
    r"\(\^\)o\(\^\)": "Happy",
    r"\(\^O\^\)": "Happy",
    r"\(\^o\^\)": "Happy",
    r"\)\^o\^\(": "Happy",
    r"o_O": "Surprised",
    r"o_0": "Surprised",
    r"o\.O": "Surprised",
    r"\(o\.o\)": "Surprised",
    r"oO": "Surprised",
    r"\(\*￣m￣\)": "Dissatisfied",
    r"\(‘A`\)": "Snubbed or Deflated"
}


def _emoticon_width(pattern):
    """Approximate how many characters `pattern` matches.

    Escape sequences and character classes each count as one character.
    """
    return len(re.sub(r"\\u[0-9A-Fa-f]{4}|\\.|\[(?:\\.|[^\]\\])*\]", "x", pattern))


def _guard_word_edges(pattern):
    """Keep an emoticon that starts/ends with a letter or digit from firing inside a word or number.

    Without the guards "XP" matches inside "EXPLAIN" and ":3" inside the timestamp "5:30".
    """
    prefix = r"(?<![A-Za-z0-9])" if pattern[0].isalnum() else ""
    suffix = r"(?![A-Za-z0-9])" if pattern[-1].isalnum() else ""
    return prefix + pattern + suffix


# Patterns are applied sequentially, so widest-first ordering is required: otherwise ":-)"
# consumes the start of ":-))" and "^_^" the middle of "(^_^)/". sorted() is stable, so
# equally wide patterns keep the order in which they are written above.
EMOTICONS = {
    _guard_word_edges(pattern): meaning
    for pattern, meaning in sorted(
        EMOTICONS.items(), key=lambda item: _emoticon_width(item[0]), reverse=True
    )
}

In [None]:
def convert_emoticons_to_text(text, emoticons=None):
    """Replace text emoticons in `text` with their descriptions.

    Args:
        text: Comment text to process.
        emoticons: Optional mapping of regular-expression pattern -> description.
            Defaults to the notebook-level EMOTICONS table.

    Returns:
        The text after applying every pattern, one after another, in mapping order.
    """
    table = EMOTICONS if emoticons is None else emoticons
    for emoticon, text_rep in table.items():
        # A callable replacement inserts the description verbatim; passed as a plain string
        # it would be parsed as a regex template (backslashes, \g<...> references).
        text = re.sub(emoticon, lambda _match, rep=text_rep: rep, text)
    return text

# Replace text emoticons in every comment; runs before lowercasing, so the
# case-sensitive patterns (e.g. "XD", ":P") still see the original casing.
df2['Comment'] = df2['Comment'].apply(convert_emoticons_to_text)

**Text Lowercasing**


In [None]:
# Lowercase every comment with the vectorised string accessor; overwrites the 'Comment' column.
df2['Comment'] = df2['Comment'].str.lower()


**Expanding Contractions**


In [None]:
# Function to expand contractions
def expand_contractions(text):
    """Return `text` after passing it through `contractions.fix`."""
    expanded = fix(text)
    return expanded
# Expand contractions in every comment; runs before punctuation removal strips the apostrophes.
df2['Comment'] = df2['Comment'].apply(expand_contractions)

In [None]:
df2.head(5)

**Removing Punctuations**


In [None]:
# Function to remove punctuation
def remove_punctuation(text):
    """Return `text` with every character listed in `string.punctuation` deleted."""
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

# Strip ASCII punctuation from every comment; a comment made only of punctuation becomes "".
df2['Comment'] = df2['Comment'].apply(remove_punctuation)

In [None]:
# Quick look at the cleaned frame: first rows, shape, and column summary.
print(df2.head(5))
print(df2.shape)
# info() writes its report itself and returns None, so wrapping it in print() only
# appended a stray "None" line.
df2.info()

# **Comment Clustering**

**1. Using TF-IDF**

**Using TF-IDF with predefined maximum cluster size(K-Means)**



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
import matplotlib.pyplot as plt

def find_optimal_cluster_size(df, max_clusters):
    """Scan K-Means cluster counts on TF-IDF features and pick the best by silhouette score.

    Fits K-Means for every k in 2..max_clusters, plots inertia (elbow) and silhouette
    curves, prints both recommendations, and returns the silhouette-based one.

    Args:
        df: DataFrame with a 'Comment' column of text.
        max_clusters: Largest number of clusters to try (inclusive); must be >= 2.

    Returns:
        The number of clusters with the highest silhouette score.

    Raises:
        ValueError: If `max_clusters` is smaller than 2.
    """
    if max_clusters < 2:
        raise ValueError("max_clusters must be at least 2")

    comments = df['Comment'].tolist()
    tfidf_matrix = TfidfVectorizer().fit_transform(comments)

    # k_values[i] is the cluster count behind inertias[i] and silhouette_scores[i].
    k_values = list(range(2, max_clusters + 1))
    inertias = []
    silhouette_scores = []

    for num_clusters in k_values:
        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
        kmeans.fit(tfidf_matrix)
        inertias.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(tfidf_matrix, kmeans.labels_))

    # Plot Elbow Method
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.plot(k_values, inertias, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')
    plt.title('Elbow Method')

    # Plot Silhouette Method
    plt.subplot(1, 2, 2)
    plt.plot(k_values, silhouette_scores, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Method')

    plt.tight_layout()
    plt.show()

    # Optimal cluster size = k with the highest silhouette score.
    best_index = int(np.argmax(silhouette_scores))
    optimal_cluster_size = k_values[best_index]
    print("Optimal number of clusters based on Silhouette Method:", optimal_cluster_size)
    print("Silhouette Score TF-IDF(KMEANS Clustering):", silhouette_scores[best_index])

    # Elbow estimate (for comparison only; the elbow is subjective and may not be clear).
    # diff[j] is the inertia change when going from k_values[j] to k_values[j + 1];
    # diff_r[j] = diff[j + 1] / diff[j] is smallest where the improvement collapses right
    # after k_values[j + 1], so the elbow sits at index argmin + 1 of k_values / inertias.
    if len(inertias) >= 3:
        diff = np.diff(inertias)
        diff_r = diff[1:] / diff[:-1]
        elbow_index = int(np.argmin(diff_r)) + 1
        print("Optimal number of clusters based on Elbow Method:", k_values[elbow_index])
        print("Inertia at Optimal Elbow Point:", inertias[elbow_index])
    else:
        # Fewer than three cluster sizes leave no ratio of successive drops to compare.
        print("Optimal number of clusters based on Elbow Method: not enough cluster sizes to estimate")

    return optimal_cluster_size

# Example usage: scan k = 2..max_clusters on the cleaned comments. One K-Means fit and one
# silhouette computation run per k, so this cell's cost grows with max_clusters.
max_clusters = 20  # Maximum number of clusters to consider
optimal_cluster_size = find_optimal_cluster_size(df2, max_clusters)
print("optimal_cluster_size for TF-IDF(KMEANS Clustering): ",optimal_cluster_size)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

def cluster_comments(df, num_clusters):
    """Cluster the 'Comment' column with K-Means on TF-IDF features.

    Prints the silhouette score, writes the labels into a 'Cluster' column of `df`
    (the frame passed in is modified), and returns that same frame.
    """
    # TF-IDF features for every comment.
    tfidf_matrix = TfidfVectorizer().fit_transform(df['Comment'].tolist())

    # K-Means with a fixed seed so repeated runs give the same labels.
    kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(tfidf_matrix)
    labels = kmeans.labels_

    # Silhouette score as a quality indicator.
    print(f"Silhouette Score TF-IDF(KMeans Clustering): {silhouette_score(tfidf_matrix, labels)}")

    df['Cluster'] = labels
    return df

# Example usage
def analyze_optimal_cluster(df, optimal_cluster_size):
    """Cluster `df` into `optimal_cluster_size` groups and print the size of each cluster."""
    clustered = cluster_comments(df, optimal_cluster_size)
    print("Cluster Analysis:")
    cluster_sizes = clustered.groupby('Cluster').size()
    print(cluster_sizes)

# Example usage: `optimal_cluster_size` comes from the silhouette scan in the earlier cell.
# The call adds a 'Cluster' column to df2 as a side effect of cluster_comments.
print("Optimum_cluster_size: ",optimal_cluster_size)
analyze_optimal_cluster(df2, optimal_cluster_size)


In [None]:
#printing values in each cluster

def analyze_optimal_cluster(df, optimal_cluster_size):
    """Print the comments of every cluster id in 0..optimal_cluster_size-1.

    Expects `df` to already carry a 'Cluster' column; ids without members print only
    their header.
    """
    print("\nComments in Each Cluster:")
    for cluster_id in range(optimal_cluster_size):
        members = df.loc[df['Cluster'] == cluster_id, 'Comment']
        print(f"\nCluster {cluster_id}:")
        for text in members:
            print(">>", text)

# Example usage: relies on the 'Cluster' column that the previous cell added to df2.
# Prints every comment, so the output is as long as the dataset.
analyze_optimal_cluster(df2, optimal_cluster_size)


**Using TF-IDF Without Predefined Cluster Size(DBSCAN clustering)**



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

def cluster_comments(df, eps=0.5, min_samples=5):
    """Cluster the 'Comment' column with DBSCAN on TF-IDF features.

    Note: this reuses the name of the K-Means `cluster_comments` defined in an earlier
    cell and replaces it once this cell has run.

    Args:
        df: DataFrame with a 'Comment' column of text.
        eps: DBSCAN neighbourhood radius.
        min_samples: DBSCAN minimum neighbourhood size for a core point.

    Returns:
        Array with one label per comment; -1 marks noise points.
    """
    # Extract comments from DataFrame
    comments = df['Comment'].tolist()

    # Vectorize comments, keeping only terms that appear in 20%-80% of the comments.
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=0.2, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(comments)

    # Apply DBSCAN clustering
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    clusters = dbscan.fit_predict(tfidf_matrix)

    # silhouette_score needs between 2 and n_samples - 1 distinct labels. DBSCAN picks the
    # number of clusters itself and can return a single label, so check before scoring.
    n_labels = len(set(clusters))
    if 2 <= n_labels <= len(comments) - 1:
        silhouette_avg = silhouette_score(tfidf_matrix, clusters)
        print(f"Silhouette Score for TF-IDF(DBSCAN Clustering): {silhouette_avg}")
    else:
        print(
            "Silhouette Score for TF-IDF(DBSCAN Clustering): undefined "
            f"({n_labels} distinct label(s); between 2 and n_samples - 1 are required)"
        )

    return clusters

# Example usage: DBSCAN with the default eps / min_samples; label -1 denotes noise points.
comment_clusters = cluster_comments(df2)

# List the distinct labels instead of printing the whole label array.
unique_labels = np.unique(comment_clusters)
print("Distinct Comment Cluster Labels By TF-IDF(DBSCAN Clustering): ", unique_labels)


In [None]:
def print_comments_in_clusters(df, clusters):
    """Print the comments of each cluster, with label -1 listed as noise points.

    Args:
        df: DataFrame with a 'Comment' column.
        clusters: Sequence of cluster labels, one per row of `df`, in row order.
    """
    labels = np.asarray(clusters)
    comments = df['Comment']
    # Iterate over unique cluster labels
    for cluster_label in np.unique(labels):
        # np.where yields row positions, so look rows up with .iloc: label-based lookup
        # breaks once rows were dropped and the index no longer runs 0..n-1.
        positions = np.where(labels == cluster_label)[0]
        if cluster_label == -1:
            # -1 is the label DBSCAN gives to noise points
            print("Noise Points:")
            for position in positions:
                print(comments.iloc[position])
        else:
            print(f"Cluster {cluster_label}:")
            for position in positions:
                print(">> ", comments.iloc[position])
        print("\n")

# Example usage: `comment_clusters` holds the DBSCAN labels computed in the previous cell.
print_comments_in_clusters(df2, comment_clusters)

**2**. **GloVe's Embeddings.**

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import gensim.downloader as api

def download_glove_embeddings(save_path):
    """Fetch "glove-wiki-gigaword-300" through gensim's downloader and write it to
    `save_path` in binary word2vec format."""
    api.load("glove-wiki-gigaword-300").save_word2vec_format(save_path, binary=True)

def load_word_embeddings(embeddings_file):
    """Load the GloVe word vectors.

    Args:
        embeddings_file: Path to a binary word2vec-format file, as written by
            download_glove_embeddings. When the file does not exist the vectors are
            fetched through gensim's downloader instead.

    Returns:
        The loaded gensim word-vector object.
    """
    # Use the file the caller points at; previously the argument was ignored and the
    # vectors were always requested from the downloader again.
    if embeddings_file and os.path.exists(embeddings_file):
        from gensim.models import KeyedVectors
        return KeyedVectors.load_word2vec_format(embeddings_file, binary=True)
    return api.load("glove-wiki-gigaword-300")

def comment_to_embedding(comment, word_vectors):
    """Average the vectors of the whitespace-separated words of `comment` found in `word_vectors`.

    Returns None when none of the words is in the vocabulary.
    """
    known_vectors = [word_vectors[token] for token in comment.split() if token in word_vectors]
    if not known_vectors:
        return None
    return np.mean(known_vectors, axis=0)

def cluster_comments_with_word_embeddings(comments, word_vectors, num_clusters):
    """Cluster comments with K-Means on PCA-reduced averaged word vectors.

    Args:
        comments: Iterable of comment strings.
        word_vectors: Word -> vector lookup supporting `in` and indexing.
        num_clusters: Number of K-Means clusters.

    Returns:
        Integer array with one label per input comment, in input order. Comments without
        any in-vocabulary word cannot be embedded and get the label -1.

    Raises:
        ValueError: If no comment contains an in-vocabulary word.
    """
    # Convert comments to embeddings (None where no word is in the vocabulary).
    comment_embeddings = [comment_to_embedding(comment, word_vectors) for comment in comments]
    has_embedding = np.array([embedding is not None for embedding in comment_embeddings], dtype=bool)
    if not has_embedding.any():
        raise ValueError("None of the comments contains a word from the embedding vocabulary.")

    X = np.array([embedding for embedding in comment_embeddings if embedding is not None])

    # PCA cannot keep more components than there are samples or features, so cap the
    # target of 50 for small inputs.
    pca = PCA(n_components=min(50, X.shape[0], X.shape[1]))
    X_pca = pca.fit_transform(X)

    # Apply K-means clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(X_pca)

    # Calculate silhouette score to evaluate clustering quality
    silhouette_avg = silhouette_score(X_pca, kmeans.labels_)
    print(f"Silhouette Score for GloVe's Word2Vec: {silhouette_avg}")

    # Scatter the labels back to the positions of the comments they belong to, so the
    # result lines up with `comments` even when some comments had to be skipped.
    clusters = np.full(len(comment_embeddings), -1, dtype=int)
    clusters[has_embedding] = kmeans.labels_
    return clusters

# Example usage
# Download the GloVe vectors and store them in the Kaggle working directory.
save_path = "/kaggle/working/glove-wiki-gigaword-300.bin"
download_glove_embeddings(save_path)

# Obtain the word vectors used for embedding the comments.
word_vectors = load_word_embeddings(save_path)

# Cluster the cleaned comments into a fixed number of groups.
num_clusters = 4  # Number of clusters
comment_clusters = cluster_comments_with_word_embeddings(df2['Comment'], word_vectors, num_clusters)


In [None]:
# Distinct labels produced by the GloVe-based clustering in the previous cell.
unique_labels = np.unique(comment_clusters)
print("Distinct Comment Cluster Labels By Glove's Word2Vec: ",unique_labels)

**3.** **BERT For Word Embedding**



In [None]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained BERT checkpoint and its matching tokenizer.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
model.to(device)  # place the model on the selected device

def tokenize_and_embed_comments(comments, tokenizer, model, device):
    """Embed each comment as the mean of its last-layer token embeddings.

    Args:
        comments: Iterable of comment strings.
        tokenizer: Tokenizer providing `encode` and `model_max_length`.
        model: Model whose output exposes `last_hidden_state`; `model.config.hidden_size`
            gives the embedding width used for comments that contain no tokens.
        device: torch device the token ids are moved to.

    Returns:
        Array with one embedding row per comment, in input order.
    """
    word_embeddings = []
    for comment in comments:
        # Let the tokenizer truncate in tokens. model_max_length is a token count, so
        # slicing the string to that many characters cut comments much earlier than needed.
        tokens = tokenizer.encode(
            comment,
            add_special_tokens=False,
            truncation=True,
            max_length=tokenizer.model_max_length,
        )
        if not tokens:
            # A comment that is empty after cleaning has no tokens; feeding an empty
            # sequence to the model fails (or averages to NaN), so use a zero vector and
            # keep the row so the output stays aligned with `comments`.
            word_embeddings.append(np.zeros(model.config.hidden_size, dtype=np.float32))
            continue
        # Convert token IDs to tensor and move to the target device
        tokens_tensor = torch.tensor([tokens]).to(device)
        # Obtain the embeddings for the tokens without tracking gradients
        with torch.no_grad():
            outputs = model(tokens_tensor)
        embeddings = outputs.last_hidden_state.squeeze(0)  # (tokens, hidden) for this comment
        # Average the token embeddings to get the comment embedding
        word_embeddings.append(torch.mean(embeddings, dim=0).cpu().numpy())
    return np.array(word_embeddings)


def cluster_comments_with_bert_embeddings(comments, num_clusters):
    """K-Means over mean-pooled BERT token embeddings; prints the silhouette score.

    Uses the notebook-level `tokenizer`, `model` and `device`. Returns one label per comment.
    """
    # One embedding row per comment.
    features = tokenize_and_embed_comments(comments, tokenizer, model, device)
    # Fixed seed so repeated runs give the same labels.
    kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(features)
    labels = kmeans.labels_
    print(f"Silhouette Score for BERT Word Embeddings: {silhouette_score(features, labels)}")
    return labels

# Example usage: embed every cleaned comment with BERT and cluster into a fixed number of groups.
comments = df2['Comment'].tolist()
num_clusters = 7  # Number of clusters
comment_clusters = cluster_comments_with_bert_embeddings(comments, num_clusters)


In [None]:
# Load the pretrained BERT checkpoint and tokenizer. This repeats the load done in the
# previous cell and rebinds the notebook-level `tokenizer` and `model`.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
model.to(device)  # place the model on the selected device

def generate_sentence_embedding(sentence, tokenizer, model, device, max_length=512):
    """Return the last-layer embedding of the first token ([CLS]) of `sentence`.

    The sentence is encoded with special tokens and truncated to `max_length` tokens.
    The result is a NumPy array with a leading batch dimension of 1.
    """
    token_ids = tokenizer.encode(sentence, add_special_tokens=True, max_length=max_length, truncation=True)
    batch = torch.tensor([token_ids]).to(device)
    # Inference only: no gradients needed.
    with torch.no_grad():
        hidden = model(batch).last_hidden_state
    # Position 0 holds the [CLS] token; bring it back to the CPU as NumPy.
    return hidden[:, 0, :].cpu().numpy()

def cluster_comments_with_bert_sentence_embeddings(comments, num_clusters):
    """K-Means over per-comment [CLS] embeddings; prints the silhouette score.

    Uses the notebook-level `tokenizer`, `model` and `device`. Returns one label per comment.
    """
    # One (1, hidden) embedding per comment, flattened to a 2-D feature matrix.
    per_comment = [
        generate_sentence_embedding(comment, tokenizer, model, device)
        for comment in comments
    ]
    features = np.array(per_comment).reshape(len(comments), -1)

    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(features)
    clusters = kmeans.labels_

    silhouette_avg = silhouette_score(features, clusters)
    print(f"Silhouette Score for BERT Sentense Embeddings: {silhouette_avg}")
    return clusters

# Example usage: cluster the cleaned comments on their [CLS] embeddings.
comments = df2['Comment'].tolist()
num_clusters = 4  # Number of clusters
comment_clusters = cluster_comments_with_bert_sentence_embeddings(comments, num_clusters)

**4. BigBird**

In [None]:
# Import BigBird model and a tokenizer loader
from transformers import AutoTokenizer, BigBirdModel

# Load the pretrained BigBird model together with the tokenizer of the same checkpoint.
# Without this, `tokenizer` would still be the BERT tokenizer loaded earlier, whose token
# ids do not correspond to BigBird's vocabulary.
BIGBIRD_CHECKPOINT = "google/bigbird-roberta-large"
model = BigBirdModel.from_pretrained(BIGBIRD_CHECKPOINT, attention_type="original_full")
tokenizer = AutoTokenizer.from_pretrained(BIGBIRD_CHECKPOINT)
model.to(device)  # Move model to GPU if available

# Update generate_sentence_embedding function to use BigBird
def generate_sentence_embedding(sentence, tokenizer, model, device):
    """Return the last-layer embedding of the first token of `sentence`.

    Replaces the earlier definition of the same name: here the tokenizer is called
    directly and its keyword outputs are passed to the model. The result is a NumPy
    array with a leading batch dimension of 1.
    """
    encoded = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    # The return value is not used, so this relies on `.to` updating the encoding in place.
    encoded.to(device)
    # Inference only: no gradients needed.
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state
    first_token = hidden[:, 0, :]
    return first_token.cpu().numpy()

# The rest of your code remains the same
def cluster_comments_with_bigbird_sentence_embeddings(comments, num_clusters):
    """K-Means over per-comment first-token embeddings; prints the silhouette score.

    Uses the notebook-level `tokenizer`, `model` and `device`. Returns one label per comment.
    """
    # One (1, hidden) embedding per comment, flattened to a 2-D feature matrix.
    per_comment = [
        generate_sentence_embedding(comment, tokenizer, model, device)
        for comment in comments
    ]
    features = np.array(per_comment).reshape(len(comments), -1)

    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(features)
    clusters = kmeans.labels_

    silhouette_avg = silhouette_score(features, clusters)
    print(f"Silhouette Score for BigBird: {silhouette_avg}")
    return clusters

# Example usage: cluster the cleaned comments on their BigBird first-token embeddings.
comments = df2['Comment'].tolist()
num_clusters = 4  # Number of clusters
comment_clusters = cluster_comments_with_bigbird_sentence_embeddings(comments, num_clusters)

In [None]:
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Assuming 'device' is defined earlier as the GPU device, e.g., device = torch.device("cuda")

# Load pre-trained BERT model and tokenizer; this rebinds the notebook-level `tokenizer`
# and `model` that the BigBird cell above had replaced.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.to(device)
model.eval()  # inference mode

def encode_comments_bert(df):
    """Encode every comment of `df` as the mean of its last-layer BERT token embeddings.

    Uses the notebook-level `tokenizer`, `model` and `device`.

    Returns:
        Array with one embedding row per comment.

    Raises:
        ValueError: If `df` has no 'Comment' column.
    """
    if 'Comment' not in df.columns:
        raise ValueError("DataFrame does not contain 'Comment' column.")

    pooled = []
    for text in df['Comment'].tolist():
        batch = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        # Move every input tensor to the device the model lives on.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        with torch.no_grad():
            hidden = model(**batch).last_hidden_state

        # Average over the token axis, then hand the result back to the CPU as NumPy.
        pooled.append(torch.mean(hidden, dim=1).cpu().numpy())
    return np.concatenate(pooled, axis=0)

def cluster_comments_bert(df, num_clusters):
    """K-Means (10 initialisations, fixed seed) over mean-pooled BERT embeddings of `df`.

    Prints the silhouette score and returns one label per comment.
    """
    features = encode_comments_bert(df)

    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42).fit(features)
    labels = kmeans.labels_

    print(f"Silhouette Score: {silhouette_score(features, labels)}")

    return labels

# Example usage: cluster the cleaned comments and list the distinct labels found.
num_clusters = 7  # Number of clusters
comment_clusters = cluster_comments_bert(df2, num_clusters)
print(np.unique(comment_clusters))


# **Using Topic Modelling**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def cluster_comments_lda(comments, num_topics):
    """Assign each comment to its most probable LDA topic.

    Comments are turned into word counts (at most 1000 terms, English stop words removed),
    an LDA model with `num_topics` topics is fitted, and the index of the highest-probability
    topic is returned for every comment.
    """
    bag_of_words = CountVectorizer(max_features=1000, stop_words='english').fit_transform(comments)

    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(bag_of_words)

    # Rows: comments, columns: topic probabilities.
    doc_topic = lda.transform(bag_of_words)

    return np.argmax(doc_topic, axis=1)

# Example usage: topic-model the cleaned comments and list the distinct topic labels.
# `num_topics` is also read as a global by calculate_silhouette_score in the next cell.
comments = df2['Comment'].tolist()
num_topics = 4  # Number of topics
comment_clusters = cluster_comments_lda(comments, num_topics)
unique_labels = np.unique(comment_clusters)
print("Distinct Labels Using Topic Modelling: ", unique_labels)


In [None]:
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity

def calculate_silhouette_score(comments, comment_clusters, n_topics=None):
    """Silhouette score of a topic-model clustering, measured with cosine distance.

    Args:
        comments: List of comment strings.
        comment_clusters: One cluster/topic label per comment.
        n_topics: Number of LDA topics used to rebuild the topic distributions.
            Defaults to the notebook-level `num_topics`.

    Returns:
        The mean silhouette score.
    """
    if n_topics is None:
        n_topics = num_topics  # notebook-level setting used when the labels were produced

    # Rebuild the per-comment topic distributions (same vectorizer settings and seed as
    # cluster_comments_lda).
    X = CountVectorizer(max_features=1000, stop_words='english').fit_transform(comments)
    topic_distributions = LatentDirichletAllocation(n_components=n_topics, random_state=42).fit_transform(X)

    # Score the topic distributions directly with cosine distance. Passing a cosine
    # *similarity* matrix as the data made silhouette_score treat each row of similarities
    # as a feature vector and compare rows with Euclidean distance.
    silhouette_avg = silhouette_score(topic_distributions, comment_clusters, metric="cosine")
    print(f"Silhouette Score for Topic Modelling based Clustering: {silhouette_avg}")

    return silhouette_avg

# Example usage: score the LDA topic assignment computed in the previous cell.
silhouette_avg = calculate_silhouette_score(comments, comment_clusters)

In [None]:
def print_comments_in_clusters(df, clusters):
    """Print the comments of each cluster, with label -1 listed as noise points.

    Args:
        df: DataFrame with a 'Comment' column.
        clusters: Sequence of cluster labels, one per row of `df`, in row order.
    """
    labels = np.asarray(clusters)
    comments = df['Comment']
    # Iterate over unique cluster labels
    for cluster_label in np.unique(labels):
        # np.where yields row positions, so look rows up with .iloc: label-based lookup
        # breaks once rows were dropped and the index no longer runs 0..n-1.
        positions = np.where(labels == cluster_label)[0]
        if cluster_label == -1:
            # -1 is treated as the noise label
            print("Noise Points:")
            for position in positions:
                print(comments.iloc[position])
        else:
            print(f"Cluster {cluster_label}:")
            for position in positions:
                print(">> ", comments.iloc[position])
        print("\n")

# Example usage: `comment_clusters` holds the LDA topic assignment computed above.
print_comments_in_clusters(df2, comment_clusters)

# **Using BERTopic**

In [None]:
# Explicit %pip magic: installs into the environment of the running kernel.
%pip install bertopic


In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Load comments from DataFrame
comments = df2['Comment'].astype(str).tolist()  # Ensure comments are converted to strings

# Example function to generate BERT embeddings for comments
def generate_bert_embeddings(comments):
    """Encode `comments` with the 'bert-base-nli-mean-tokens' SentenceTransformer.

    The encoder is moved to the notebook-level `device`; the embeddings are returned as
    a NumPy array.
    """
    encoder = SentenceTransformer('bert-base-nli-mean-tokens')
    encoder.to(device)  # Move model to GPU if available
    return np.array(encoder.encode(comments))

# Generate BERT embeddings for comments
comment_embeddings = generate_bert_embeddings(comments)

# Apply BERTopic for clustering, handing it the embeddings computed above. Otherwise
# BERTopic embeds the comments again with its own default model, and the silhouette score
# below would be measured in a space the clustering never used.
bertopic_model = BERTopic()
topics, probs = bertopic_model.fit_transform(comments, embeddings=comment_embeddings)

# Get the cluster labels for each comment
cluster_labels = topics

unique_labels = np.unique(cluster_labels)
print("Distinct Labels: ", unique_labels)

# Calculate Silhouette score on the same embeddings the topics were derived from
silhouette_avg = silhouette_score(comment_embeddings, cluster_labels)
print(f"Silhouette Score for BERTopic: {silhouette_avg}")
