### Import Necessary Libraries 

In [1]:
#!pip install spacy
#!pip install TextBlob
#!pip install tqdm
#!python -m spacy download en_core_web_sm
#!pip install yake
#!pip install transformers torch
#!python -m spacy download en_core_web_lg
#!pip install bertopic --user
#!pip install hdbscan
#!pip install contextualized-topic-models

In [2]:
import re
import string
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from textblob import download_corpora
download_corpora.main()

from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm.notebook import tqdm
import yake
from transformers import BertTokenizer, BertModel
import torch

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

from sklearn.decomposition import NMF
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

import gensim
from gensim import corpora
from gensim.models.phrases import Phrases, Phraser
from tqdm.auto import tqdm
import logging

from gensim.models.coherencemodel import CoherenceModel
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.models.ctm import CTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.datasets.dataset import CTMDataset

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics.pairwise import cosine_similarity

from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from transformers import pipeline

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ashly\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashly\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashly\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashly\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\ashly\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashly\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashl

Finished.


### TedTalk Data Loading 

In [3]:
# Function to load the data from the csv file to a dataframe and print the shape.
def load_data_and_print_shape(file_path):
    df = pd.read_csv(file_path)
    print(f'Shape: {df.shape}')
    return df

tt_df = load_data_and_print_shape('ted_talks.csv')

Shape: (4005, 19)


In [4]:
# Display the columns present in the dataframe.
tt_df.columns

Index(['talk_id', 'title', 'speaker_1', 'all_speakers', 'occupations',
       'about_speakers', 'views', 'recorded_date', 'published_date', 'event',
       'native_lang', 'available_lang', 'comments', 'duration', 'topics',
       'related_talks', 'url', 'description', 'transcript'],
      dtype='object')

In [5]:
# Display the first five rows  of the dataframe.
tt_df.head()

Unnamed: 0,talk_id,title,speaker_1,all_speakers,occupations,about_speakers,views,recorded_date,published_date,event,native_lang,available_lang,comments,duration,topics,related_talks,url,description,transcript
0,1,Averting the climate crisis,Al Gore,{0: 'Al Gore'},{0: ['climate advocate']},{0: 'Nobel Laureate Al Gore focused the world’...,3523392,2006-02-25,2006-06-27,TED2006,en,"['ar', 'bg', 'cs', 'de', 'el', 'en', 'es', 'fa...",272.0,977,"['alternative energy', 'cars', 'climate change...","{243: 'New thinking on the climate crisis', 54...",https://www.ted.com/talks/al_gore_averting_the...,With the same humor and humanity he exuded in ...,"Thank you so much, Chris. And it's truly a gre..."
1,92,The best stats you've ever seen,Hans Rosling,{0: 'Hans Rosling'},{0: ['global health expert; data visionary']},"{0: 'In Hans Rosling’s hands, data sings. Glob...",14501685,2006-02-22,2006-06-27,TED2006,en,"['ar', 'az', 'bg', 'bn', 'bs', 'cs', 'da', 'de...",628.0,1190,"['Africa', 'Asia', 'Google', 'demo', 'economic...","{2056: ""Own your body's data"", 2296: 'A visual...",https://www.ted.com/talks/hans_rosling_the_bes...,You've never seen data presented like this. Wi...,"About 10 years ago, I took on the task to teac..."
2,7,Simplicity sells,David Pogue,{0: 'David Pogue'},{0: ['technology columnist']},{0: 'David Pogue is the personal technology co...,1920832,2006-02-24,2006-06-27,TED2006,en,"['ar', 'bg', 'de', 'el', 'en', 'es', 'fa', 'fr...",124.0,1286,"['computers', 'entertainment', 'interface desi...","{1725: '10 top time-saving tech tips', 2274: '...",https://www.ted.com/talks/david_pogue_simplici...,New York Times columnist David Pogue takes aim...,"(Music: ""The Sound of Silence,"" Simon & Garfun..."
3,53,Greening the ghetto,Majora Carter,{0: 'Majora Carter'},{0: ['activist for environmental justice']},{0: 'Majora Carter redefined the field of envi...,2664069,2006-02-26,2006-06-27,TED2006,en,"['ar', 'bg', 'bn', 'ca', 'cs', 'de', 'en', 'es...",219.0,1116,"['MacArthur grant', 'activism', 'business', 'c...",{1041: '3 stories of local eco-entrepreneurshi...,https://www.ted.com/talks/majora_carter_greeni...,"In an emotionally charged talk, MacArthur-winn...",If you're here today — and I'm very happy that...
4,66,Do schools kill creativity?,Sir Ken Robinson,{0: 'Sir Ken Robinson'},"{0: ['author', 'educator']}","{0: ""Creativity expert Sir Ken Robinson challe...",65051954,2006-02-25,2006-06-27,TED2006,en,"['af', 'ar', 'az', 'be', 'bg', 'bn', 'ca', 'cs...",4931.0,1164,"['children', 'creativity', 'culture', 'dance',...","{865: 'Bring on the learning revolution!', 173...",https://www.ted.com/talks/sir_ken_robinson_do_...,Sir Ken Robinson makes an entertaining and pro...,Good morning. How are you? (Audience) Good. It...


### User Data Loading

In [6]:
# Function to load the data from the csv file to a dataframe and print the shape.
def load_user_data(file_path):
    df = pd.read_csv(file_path)
    print(f'Shape: {df.shape}')
    return df

user_df = load_user_data('user_data.csv')

Shape: (1656, 5)


In [7]:
# Display the columns present in the dataframe.
user_df.columns

Index(['User ID', 'Talk ID', 'Rating', 'View Count', 'Comments'], dtype='object')

In [8]:
# Display the first five rows of the dataframe.
user_df.head()

Unnamed: 0,User ID,Talk ID,Rating,View Count,Comments
0,1,1840,4,5,Moved to tears by the emotional rollercoaster ...
1,1,2282,5,8,"""A symphony of thought-provoking ideas. Superb!"""
2,1,2370,2,2,"""Somewhat confusing and hard to follow, to be ..."
3,2,8227,4,6,"""A true gem! This talk was both informative an..."
4,2,1556,2,1,"""The talk's transitions between topics were a ..."


### TedTalk Data Pre-processing 

In [9]:
# Function to extract only the columns required for analysis.
def extract_required_columns(df):
    required_columns = ['talk_id', 'title', 'speaker_1', 'occupations', 'event', 
                        'published_date', 'duration', 'topics', 'description', 
                        'views', 'comments', 'transcript']
    return df[required_columns]
tt_df_sub = extract_required_columns(tt_df)

In [10]:
# Function to check for null values and duplicates in the dataframe.
def check_data_quality(df):
    # Check for null values in the DataFrame.
    null_values = df.isnull().sum()
    print("Null values per column:")
    print(null_values)
    print("\n")

    # Check for duplicate values.
    duplicate_rows = df[df.duplicated()]

    # To see the number of duplicate rows.
    num_duplicate_rows = duplicate_rows.shape[0]
    print(f"Number of duplicate rows: {num_duplicate_rows}\n")

    # View the duplicated rows if any.
    if num_duplicate_rows > 0:
        print("Duplicated rows:")
        print(duplicate_rows)
    else:
        print("No duplicated rows found.")

# Use the function:
check_data_quality(tt_df_sub)

Null values per column:
talk_id             0
title               0
speaker_1           0
occupations       522
event               0
published_date      0
duration            0
topics              0
description         0
views               0
comments          655
transcript          0
dtype: int64


Number of duplicate rows: 0

No duplicated rows found.


In [11]:
# Check for data types of columns.
print(tt_df_sub.dtypes)

talk_id             int64
title              object
speaker_1          object
occupations        object
event              object
published_date     object
duration            int64
topics             object
description        object
views               int64
comments          float64
transcript         object
dtype: object


#### Basic Pre-Processing 

In [12]:
# Function for basic data pre-processing.

def preprocess_dataframe(df):
    def fill_missing_values(df):
        """Replace null values with default values."""
        df['occupations'].fillna('Unknown', inplace=True)
        df['comments'].fillna(0, inplace=True)
        return df
    
    def clean_strings(df, columns, chars_to_remove):
        """Remove specified characters from given columns."""
        remove_dict = {ord(char): None for char in chars_to_remove}
        for col in columns:
            df[col] = df[col].str.translate(remove_dict)
        return df

    def remove_numbers_from_column(df, column):
        """Remove numbers from specified column."""
        df[column] = df[column].str.replace(r'\d+', '', regex=True)
        return df

    def convert_duration_to_minutes(df, column):
        """Convert duration from seconds to minutes."""
        df[column] = (df[column] / 60).round(2)
        return df

    def convert_to_datetime(df, column):
        """Convert specified column to datetime."""
        df[column] = pd.to_datetime(df[column])
        return df

    def extract_year(df, source_column, new_column):
        """Extract year from source column and create new column."""
        df[new_column] = pd.to_datetime(df[source_column]).dt.year
        return df

    def convert_column_to_int64(df, column):
        """Convert specified column to int64."""
        df[column] = df[column].astype('int64')
        return df

    def rearrange_columns(df, columns_order):
        """Rearrange dataframe columns based on given order."""
        return df[columns_order]

    def rename_columns(df, mapping):
        """Rename dataframe columns based on given mapping."""
        return df.rename(columns=mapping)

    # Make a copy to avoid SettingWithCopyWarning.
    df = df.copy()

    df = fill_missing_values(df)
    df = clean_strings(df, ['occupations', 'topics'], ['{', '#', ':', '[', '\'', ';', ',', ']', '}'])
    df = remove_numbers_from_column(df, 'occupations')
    df = convert_duration_to_minutes(df, 'duration')
    df = convert_to_datetime(df, 'published_date')
    df = extract_year(df, 'published_date', 'year')
    df = convert_column_to_int64(df, 'comments')
    df = rearrange_columns(df, [df.columns[-1]] + df.columns[:-1].tolist())
    df = rename_columns(df, {'speaker_1': 'speaker'})

    return df

# Main preprocessing function.
tt_df_sub = preprocess_dataframe(tt_df_sub)

In [13]:
# Verify if the null values are handled.
null_values = tt_df_sub.isnull().sum()
print(null_values)

year              0
talk_id           0
title             0
speaker           0
occupations       0
event             0
published_date    0
duration          0
topics            0
description       0
views             0
comments          0
transcript        0
dtype: int64


In [14]:
# Display first five rows of the cleaned dataframe.
tt_df_sub.head()

Unnamed: 0,year,talk_id,title,speaker,occupations,event,published_date,duration,topics,description,views,comments,transcript
0,2006,1,Averting the climate crisis,Al Gore,climate advocate,TED2006,2006-06-27,16.28,alternative energy cars climate change culture...,With the same humor and humanity he exuded in ...,3523392,272,"Thank you so much, Chris. And it's truly a gre..."
1,2006,92,The best stats you've ever seen,Hans Rosling,global health expert data visionary,TED2006,2006-06-27,19.83,Africa Asia Google demo economics global issue...,You've never seen data presented like this. Wi...,14501685,628,"About 10 years ago, I took on the task to teac..."
2,2006,7,Simplicity sells,David Pogue,technology columnist,TED2006,2006-06-27,21.43,computers entertainment interface design media...,New York Times columnist David Pogue takes aim...,1920832,124,"(Music: ""The Sound of Silence,"" Simon & Garfun..."
3,2006,53,Greening the ghetto,Majora Carter,activist for environmental justice,TED2006,2006-06-27,18.6,MacArthur grant activism business cities envir...,"In an emotionally charged talk, MacArthur-winn...",2664069,219,If you're here today — and I'm very happy that...
4,2006,66,Do schools kill creativity?,Sir Ken Robinson,author educator,TED2006,2006-06-27,19.4,children creativity culture dance education pa...,Sir Ken Robinson makes an entertaining and pro...,65051954,4931,Good morning. How are you? (Audience) Good. It...


#### Advanced Text Pre-processing

In [15]:
# More Pre-processing.

def preprocess_text(text):
    # Convert text to lowercase.
    text = text.lower()
    
    # Remove text in square brackets & parenthesis.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)

    # Remove punctuation.
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize.
    tokens = word_tokenize(text)

    # Remove stopwords.
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize.
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

# Apply preprocessing to the dataframe columns.
tt_df_sub['description'] = tt_df_sub['description'].apply(preprocess_text)
tt_df_sub['transcript'] = tt_df_sub['transcript'].apply(preprocess_text)

### User Data Pre-processing

In [16]:
# Function to pre-process the comments column.

def preprocess_comments(comments):
    # Convert to lowercase.
    comments = comments.lower()
    
    # Remove punctuation and numbers.
    comments = comments.translate(str.maketrans('', '', string.punctuation + string.digits))
    
    # Tokenization.
    tokens = word_tokenize(comments)
    
    # Stop words removal.
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization.
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    
    return ' '.join(lemmatized_tokens)

# Apply the function to each comment.
user_df['Comments'] = user_df['Comments'].apply(preprocess_comments)

### Feature Engineering 

#### Word Count

In [17]:
# Function to determine the word count of description and transcript columns.
def compute_word_counts(df, column_name):
    return df[column_name].apply(lambda x: len(x.split()))

def add_word_counts(df):
    df['description_word_count'] = compute_word_counts(df, 'description')
    df['transcript_word_count'] = compute_word_counts(df, 'transcript')
    return df

tt_df_sub = add_word_counts(tt_df_sub)

#### TF-IDF

In [18]:
# Define a function to compute TF-IDF with sublinear TF scaling.
def compute_tfidf_with_sublinear_scaling(df, column_name, max_features=5000):
    vectorizer = TfidfVectorizer(
        max_df=0.30,
        max_features=max_features,
        stop_words='english',
        sublinear_tf=True 
    )
    tfidf_matrix = vectorizer.fit_transform(df[column_name])
    return vectorizer, tfidf_matrix

# Compute TF-IDF with sublinear TF scaling for description and transcript columns.
description_vectorizer, description_tfidf_matrix = compute_tfidf_with_sublinear_scaling(tt_df_sub, 'description')
transcript_vectorizer, transcript_tfidf_matrix = compute_tfidf_with_sublinear_scaling(tt_df_sub, 'transcript') 

#### N-Grams 

In [19]:
# Function to compute n-grams.
def compute_tfidf_with_ngrams(df, column_name, max_features=5000, ngram_range=(1,2)):
    vectorizer = TfidfVectorizer(max_df=0.85, max_features=max_features, stop_words='english', ngram_range=ngram_range)
    tfidf_matrix = vectorizer.fit_transform(df[column_name])
    return vectorizer, tfidf_matrix

vectorizer, tfidf_matrix = compute_tfidf_with_ngrams(tt_df_sub, 'transcript')

#### Named Entity Recognition 

In [None]:
'''
#Function to perform Named Entity Recognition.

# Load the large English model.
nlp = spacy.load("en_core_web_lg")

def extract_named_entities(text):
    # Defining the entity labels we are interested in
    relevant_entities = ['PERSON', 'ORG', 'GPE', 'DATE', 'EVENT', 'PRODUCT', 'WORK_OF_ART', 'LAW']
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ in relevant_entities]

def clean_entities(entities):
    return [ent for ent in entities if len(ent) > 2 and ent.lower() not in STOP_WORDS]

def add_named_entities(df, column_name):
    new_column = f"{column_name}_named_entities"
    tqdm.pandas(desc=f"Processing {column_name}")
    df[new_column] = df[column_name].progress_apply(extract_named_entities)
    df[new_column] = df[new_column].apply(clean_entities)
    return df

tt_df_sub = add_named_entities(tt_df_sub, 'description')
tt_df_sub = add_named_entities(tt_df_sub, 'transcript') '''

#### Keyword Extraction 

In [None]:
'''
# Function to perform keyword extraction.
def extract_keywords(text):
    kw_extractor = yake.KeywordExtractor()
    keywords = kw_extractor.extract_keywords(text)
    return [kw[0] for kw in keywords]

# Apply the keyword extraction function to the 'transcript' column with tqdm progress.
tt_df_sub['transcript_keywords'] = tqdm(tt_df_sub['transcript'].apply(extract_keywords), desc='Extracting Keywords')'''

#### Document Embedding using BERT 

In [None]:
'''
# Load pre-trained BERT model and tokenizer.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)'''

In [None]:
'''
# Function to perform BERT embedding.
def get_bert_embedding(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    
    # Get BERT embeddings.
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Use the embedding of the [CLS] token as representation.
    embedding = outputs.last_hidden_state[:, 0, :].numpy()

    return embedding

# Apply the function and see progress with tqdm.
tt_df_sub['bert_embedding'] = [get_bert_embedding(x) for x in tqdm(tt_df_sub['transcript'])]'''

In [None]:
# Convert DataFrame to CSV and Json for verification.
#tt_df_sub.to_csv('tt_df_sub.csv', index=False)
#tt_df_sub.to_json('tt_df_sub.json', orient='records', lines=True)

### TedTalk Exploratory Data Analysis 

#### Descriptive Statistics 

In [None]:
tt_df_sub.describe()

#### Word Count Distribution 

In [None]:
# Histogram to plot the distribution of description and transcript word count.
plt.figure(figsize=(12, 6))
sns.histplot(tt_df_sub['description_word_count'], kde=True, bins=50)
plt.title('Distribution of Description Word Counts')
plt.show()

plt.figure(figsize=(12, 6))
sns.histplot(tt_df_sub['transcript_word_count'], kde=True, bins=50)
plt.title('Distribution of Transcript Word Counts')
plt.show() 

#### Word Cloud

In [None]:
# Word cloud for transcript.
def generate_wordcloud(text):
    wordcloud = WordCloud(background_color='white', width=800, height=800).generate(text)
    plt.figure(figsize=(8, 8))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()

generate_wordcloud(' '.join(tt_df_sub['transcript'].tolist()))

### Model Building: Topic Modeling 

#### 1) LDA

In [None]:
# Split each transcript into words.
texts = tt_df_sub['transcript'].apply(lambda x: x.split()).tolist()

# Build the bigram and trigram models.
bigram = Phrases(texts, min_count=5, threshold=100)  
trigram = Phrases(bigram[texts], threshold=100)

# Construct bigram and trigram.
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

# Define functions for creating bigrams and trigrams with tqdm progress.
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in tqdm(texts, desc='Making bigrams')]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in tqdm(texts, desc='Making trigrams')]

# Form bigrams and trigrams with tqdm progress.
texts_with_bigrams = make_bigrams(texts)
texts_with_trigrams = make_trigrams(texts_with_bigrams)

# Recreate the dictionary to include bigrams and trigrams.
dictionary = corpora.Dictionary(texts_with_trigrams)

# Filter out extremes to remove tokens that appear too frequently or infrequently.
dictionary.filter_extremes(no_below=20, no_above=0.5)

# Convert the documents to a bag-of-words representation with tqdm progress.
corpus = [dictionary.doc2bow(text) for text in tqdm(texts_with_trigrams, desc='Creating corpus')]

# Set up logging for progress tracking.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Train LDA model with progress updates and additional hyperparameters.
NUM_TOPICS = 15
ldamodel = gensim.models.LdaMulticore(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=35,
    workers=3,
    alpha='asymmetric',  
    eta='auto',        
    random_state=100,   
    per_word_topics=True, 
    minimum_probability=0.01
)

# Print topics.
ldatopics = ldamodel.print_topics(num_words=10)
for topic in ldatopics:
    print(topic)

#### Evaluation of LDA 

In [None]:
# Coherence Score.
coherence_model_lda = CoherenceModel(model=ldamodel, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

#### 2) LSA 

In [None]:
from sklearn.decomposition import TruncatedSVD

# Define the number of topics
NUM_TOPICS = 15

# Step 1: Apply LSA
lsa_model = TruncatedSVD(n_components=NUM_TOPICS, random_state=42)
lsa_model.fit(transcript_tfidf_matrix)

# Step 2: Extract and display topics
feature_names = transcript_vectorizer.get_feature_names_out()
for i, topic in enumerate(lsa_model.components_):
    top_features_ind = topic.argsort()[:-10 - 1:-1]
    top_features = [feature_names[index] for index in top_features_ind]
    weights = topic[top_features_ind]

    print(f"Topic {i+1}:")
    print(", ".join(top_features))
    print("\n")

#### Evaluation of LSA 

In [None]:
# Prepare the list of top words for each topic from LSA.
top_n = 10  # Number of top words to consider for each topic.
topics = [[feature_names[i] for i in topic.argsort()[:-top_n - 1:-1]] for topic in lsa_model.components_]

# Prepare the Gensim dictionary and corpus.
dictionary = Dictionary(texts_with_trigrams)
corpus = [dictionary.doc2bow(text) for text in texts_with_trigrams]

# Create the Coherence Model.
coherence_model = CoherenceModel(topics=topics, texts=texts_with_trigrams, dictionary=dictionary, coherence='c_v')

# Compute the Coherence Score.
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")

#### 3) Non-Negative Matrix Factorization (NMF)

In [20]:
num_topics = 15

# Create an NMF model.
nmf_model = NMF(
    n_components=num_topics,        # Number of topics.
    init='nndsvd',                 # Initialization method .
    random_state=42,               # Random seed for reproducibility.
    alpha=0.1,                   # Regularization parameters.
    l1_ratio=0.8,                  # Sparsity parameter. 
    max_iter=200,                  # Maximum number of iterations.
    verbose=1                      # Verbosity level (1 for progress updates).
)

# Fit the model to the 'transcript' TF-IDF matrix.
nmf_model.fit(transcript_tfidf_matrix)

# Get the topics and the topic-word distributions.
nmftopics = nmf_model.components_

n_top_words = 10  
feature_names = transcript_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(nmftopics):
    top_words_idx = topic.argsort()[:-n_top_words - 1:-1]
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"Topic #{topic_idx + 1}: {', '.join(top_words)}")



violation: 1.0
violation: 0.31919591865528496
violation: 0.15128732919873691
violation: 0.08856207734618604
violation: 0.06177094804089835
violation: 0.04790379465885834
violation: 0.038413986696961916
violation: 0.03103121305108189
violation: 0.025104880946314877
violation: 0.020390443160571224
violation: 0.01676954078338039
violation: 0.01383124040562342
violation: 0.011537328903220472
violation: 0.009703092688836026
violation: 0.008293561550704544
violation: 0.007170082145667308
violation: 0.0062766434413592375
violation: 0.005560448889492952
violation: 0.004978807466155058
violation: 0.004504132603744751
violation: 0.004117634664718943
violation: 0.0038051863169598155
violation: 0.003546878279343285
violation: 0.0033341345321302005
violation: 0.0031523735336994704
violation: 0.00298877632589311
violation: 0.0028427033207806835
violation: 0.0027058606370142396
violation: 0.0025780799665862356
violation: 0.002456316625225052
violation: 0.002332966492131374
violation: 0.00220753643923

#### Evaluation of Non-Negative Matrix Factorization (NMF)

In [21]:
# Create a Gensim Dictionary.
transcript_dict = Dictionary(tt_df_sub['transcript'].apply(str.split))

# Convert the NMF topics into a format suitable for coherence calculation.
nmf_topics = [[feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]] for topic in nmftopics]

# Calculate the coherence score using the 'c_v' measure.
coherence_model = CoherenceModel(topics=nmf_topics, texts=tt_df_sub['transcript'].apply(str.split), dictionary=transcript_dict, coherence='c_v')
coherence_score = coherence_model.get_coherence()

# Print the coherence score.
print(f"Coherence Score: {coherence_score}")

Coherence Score: 0.7340553160984783


#### 4) CTM 

In [None]:
# Prepare the data for CTM.
tp = TopicModelDataPreparation("bert-base-nli-mean-tokens")

training_dataset = tp.fit(text_for_contextual=tt_df['transcript'].tolist(), 
                          text_for_bow=tt_df_sub['transcript'].tolist())

In [None]:
# Define the size of the vocabulary and contextual embeddings.
bow_size = len(tp.vocab)  
contextual_size = 768  

num_topics = 10  
num_epochs = 10  

# Initialize the CombinedTM model.
ctm = CombinedTM(bow_size=bow_size, contextual_size=contextual_size, n_components=num_topics, num_epochs=num_epochs)

# Train the model.
ctm.fit(training_dataset)

# Extract and display the topics.
topics = ctm.get_topic_lists(10)
for topic_idx, topic in enumerate(topics):
    print(f"Topic {topic_idx + 1}: {', '.join(topic)}")

#### Evaluation of CTM 

In [None]:
# Convert the CTM topics to a format suitable for coherence calculation.
ctm_topics = ctm.get_topic_lists(10)

# Create a dictionary and corpus for the topics.
dictionary = corpora.Dictionary(ctm_topics)
corpus = [dictionary.doc2bow(topic) for topic in ctm_topics]

# Calculate the coherence score using UMass coherence (you can also use C_v coherence).
coherence_model = CoherenceModel(model=None, corpus=corpus, dictionary=dictionary, coherence='u_mass', topics=ctm_topics)
coherence_score = coherence_model.get_coherence()

print(f"Coherence Score: {coherence_score}")

#### Topic Distribution for NMF

This model performs better as compared to LDA, LSA and CMT. Hence it is selected for further processing.

In [22]:
# Get the topic distributions for each document from NMF.
nmf_topic_distributions = nmf_model.transform(transcript_tfidf_matrix)

violation: 1.0
violation: 0.10164830173445182
violation: 0.0051536844945916606
violation: 0.000539188364636062
violation: 5.325244079046646e-05
Converged at iteration 6




In [23]:
# Creating a DataFrame from the NMF topic distributions.
topic_dist_df = pd.DataFrame(nmf_topic_distributions, columns=[f"Topic_{i+1}" for i in range(nmf_topic_distributions.shape[1])])

# Display the first few rows of the DataFrame to verify.
print(topic_dist_df.head())

    Topic_1   Topic_2   Topic_3  Topic_4  Topic_5   Topic_6  Topic_7  \
0  0.059413  0.040164  0.029419      0.0      0.0  0.005039      0.0   
1  0.043325  0.000000  0.094778      0.0      0.0  0.000000      0.0   
2  0.079526  0.000000  0.000000      0.0      0.0  0.000000      0.0   
3  0.068771  0.022616  0.055075      0.0      0.0  0.000000      0.0   
4  0.084162  0.000000  0.000000      0.0      0.0  0.000000      0.0   

    Topic_8   Topic_9  Topic_10  Topic_11  Topic_12  Topic_13  Topic_14  \
0  0.000000  0.000000       0.0  0.000000  0.000000       0.0  0.000000   
1  0.000000  0.020898       0.0  0.001317  0.038808       0.0  0.000000   
2  0.000000  0.044134       0.0  0.000000  0.000000       0.0  0.000000   
3  0.072825  0.000000       0.0  0.000000  0.000000       0.0  0.000000   
4  0.000000  0.000000       0.0  0.034352  0.000000       0.0  0.015967   

   Topic_15  
0       0.0  
1       0.0  
2       0.0  
3       0.0  
4       0.0  


#### Clustering 

In [24]:
# Choose the number of clusters.
num_clusters = 12  

# Perform K-Means clustering.
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(topic_dist_df)

# Add the cluster information to the DataFrame.
topic_dist_df['Cluster'] = clusters

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


In [25]:
# Merge the cleaned TED Talks DataFrame with the topic distribution DataFrame and clusters.
cluster_topic_df = pd.concat([tt_df_sub, topic_dist_df], axis=1)

In [26]:
# Convert it to a csv file.
cluster_topic_df.to_csv('cluster_topic_df.csv', index=False)

#### Evaluation of Clustering

In [27]:
# Silhouette Score
silhouette = silhouette_score(topic_dist_df, clusters)

# Calinski-Harabasz Index
calinski_harabasz = calinski_harabasz_score(topic_dist_df, clusters)

# Davies-Bouldin Index
davies_bouldin = davies_bouldin_score(topic_dist_df, clusters)

print(f"Silhouette Score: {silhouette}")
print(f"Calinski-Harabasz Index: {calinski_harabasz}")
print(f"Davies-Bouldin Index: {davies_bouldin}")

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Silhouette Score: 0.9510161374347714
Calinski-Harabasz Index: 2272525.173119256
Davies-Bouldin Index: 0.0829712276575479


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


### Model Building: Sentiment Analysis

#### 1) DistilBert

In [28]:
# Load DistilBERT tokenizer and model.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')

# Create a pipeline using the loaded model and tokenizer.
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

def analyze_sentiment(comment):
    result = sentiment_pipeline(comment)
    return result[0]

# Apply the function to the 'Comments' column.
user_df['Sentiment_Analysis'] = user_df['Comments'].apply(lambda x: analyze_sentiment(x))

# Display the results.
print(user_df[['Comments', 'Sentiment_Analysis']])

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


                                          Comments  \
0          moved tear emotional rollercoaster talk   
1            symphony thoughtprovoking idea superb   
2            somewhat confusing hard follow honest   
3           true gem talk informative entertaining   
4                  talk transition topic bit rough   
...                                            ...   
1651           conclusion drawn weak unfortunately   
1652                   lecture talk wasnt engaging   
1653              speaker explanation unclear felt   
1654              lacked wow factor expected sadly   
1655  left wanting substance expecting deeper dive   

                                     Sentiment_Analysis  
0     {'label': 'POSITIVE', 'score': 0.9970806241035...  
1     {'label': 'POSITIVE', 'score': 0.999852180480957}  
2     {'label': 'POSITIVE', 'score': 0.7286676168441...  
3     {'label': 'POSITIVE', 'score': 0.9998641014099...  
4     {'label': 'NEGATIVE', 'score': 0.998521625995636}  
...

#### 2) RoBERTa

In [29]:
# Load the sentiment analysis pipeline with the specific RoBERTa model.
sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")

# Function to analyze sentiment using the pipeline.
def analyze_sentiment_roberta(comment):
    try:
        result = sentiment_pipeline(comment)
        label = result[0]['label']

        # Map numerical labels to text
        label_mapping = {
            'LABEL_0': 'negative',
            'LABEL_1': 'neutral',
            'LABEL_2': 'positive'
        }

        return label_mapping.get(label, "unknown")
    except Exception as e:
        print(f"Error processing comment: {comment}. Error: {e}")
        return "error"

# Apply the function to the DataFrame.
user_df['Sentiment_RoBERTa'] = user_df['Comments'].apply(analyze_sentiment_roberta)

# Display the results.
print(user_df[['Comments', 'Sentiment_RoBERTa']].head())

                                  Comments Sentiment_RoBERTa
0  moved tear emotional rollercoaster talk          positive
1    symphony thoughtprovoking idea superb          positive
2    somewhat confusing hard follow honest          negative
3   true gem talk informative entertaining          positive
4          talk transition topic bit rough          negative


RoBERTa will be chosen for sentiment analysis as it performs better as compared to DistilBert.

In [32]:
# Convert it to a csv file.
user_df.to_csv('usersenti_df.csv', index=False)

### Recommendation System 

#### 1) Content-Based Filtering

#### 2) Collaborative Filtering

#### 3) Hybrid Model