# Import Libraries

In [18]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from collections import Counter
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation, NMF
from gensim.models import HdpModel, CoherenceModel
import gensim.corpora as corpora
import hdp
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from rake_nltk import Rake
import nltk
from nltk.corpus import stopwords
import yake
from keybert import KeyBERT
import re

# Notebook-wide configuration: warning filter, pandas display options and the spaCy pipeline.
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)  # No cap on the number of columns displayed
pd.set_option('display.max_rows', None)  # No cap on the number of rows displayed
pd.set_option('display.max_colwidth', None)  # Never truncate the text inside a cell
pd.set_option('display.width', 1000)  # Display width, in characters

# Load spaCy's small English pipeline; `nlp` is the module-level object used by preprocess_text.
nlp = spacy.load('en_core_web_sm')

# Helper Functions

In [27]:
def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmas.

    The text is lower-cased, stripped of surrounding whitespace and run
    through the module-level spaCy pipeline ``nlp``. Stop words and
    punctuation are dropped, and only tokens made up entirely of ASCII
    letters are kept, so numbers, symbols and tokens containing digits,
    hyphens, underscores or accented characters are discarded.

    NOTE(review): this notebook defines ``preprocess_text`` a second time in
    a later cell with a different token filter (``token.is_alpha``). When the
    cells run top to bottom the later definition replaces this one; keep only
    one of them.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN are treated as empty text; any
        other non-string value is converted with ``str`` first.

    Returns
    -------
    str
        Lemmas of the surviving tokens joined by single spaces, or ``""``
        when the input is missing or no token survives the filters.
    """
    # A missing value in a text column shows up as None or float NaN; calling
    # .lower() on it would raise AttributeError inside Series.apply.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    doc = nlp(text.lower().strip())

    kept_lemmas = []
    for token in doc:
        # Stop words and punctuation carry no keyword signal.
        if token.is_stop or token.is_punct:
            continue
        # Keep the lemma only when the surface form is purely ASCII letters.
        if re.fullmatch(r"[A-Za-z]+", token.text):
            kept_lemmas.append(token.lemma_)

    return " ".join(kept_lemmas)

In [24]:
def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmas.

    The text is lower-cased, stripped of surrounding whitespace and run
    through the module-level spaCy pipeline ``nlp``. A token's lemma is kept
    only when the token is not a stop word, is not punctuation and is
    alphabetic according to spaCy's ``token.is_alpha``.

    NOTE(review): ``preprocess_text`` is also defined in an earlier cell,
    where the filter is the ASCII-only regex ``[A-Za-z]+`` instead of
    ``token.is_alpha``. When the cells run top to bottom this definition is
    the one in effect; keep only one of the two so the results are
    reproducible.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN are treated as empty text; any
        other non-string value is converted with ``str`` first.

    Returns
    -------
    str
        Lemmas of the surviving tokens joined by single spaces, or ``""``
        when the input is missing or no token survives the filters.
    """
    # A missing value in a text column shows up as None or float NaN; calling
    # .lower() on it would raise AttributeError inside Series.apply.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    doc = nlp(text.lower().strip())

    # Lemmatise while filtering out stop words, punctuation and non-alphabetic tokens.
    tokens = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_punct and token.is_alpha
    ]
    return ' '.join(tokens)

In [5]:
def sort_coo(coo_matrix):
    """Return the matrix's (column, value) pairs, largest value first.

    Pairs are read from the ``col`` and ``data`` attributes of ``coo_matrix``.
    They are ordered by value in descending order; pairs with equal values
    are ordered by column index, also descending.
    """
    def _value_then_column(pair):
        column, value = pair
        return (value, column)

    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=_value_then_column, reverse=True)
    return pairs

In [6]:
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` entries of ``sorted_items`` to feature -> score.

    Parameters
    ----------
    feature_names : sequence
        Feature names, indexable by the column indices in ``sorted_items``.
    sorted_items : sequence of (index, score)
        Pairs in the order they should be considered; only the first
        ``topn`` are used.
    topn : int, default 10
        Number of leading pairs to keep.

    Returns
    -------
    dict
        Feature name -> score rounded to three decimals, in the order the
        pairs were given. If a feature name occurs more than once, the
        score of its last occurrence is kept.
    """
    leading_items = sorted_items[:topn]
    return {
        feature_names[column]: round(weight, 3)
        for column, weight in leading_items
    }

In [7]:
def get_keywords(data, cv, tfidf, topn=10):
    """Return the highest-weighted TF-IDF features for ``data``.

    Parameters
    ----------
    data : str or iterable
        Input passed to ``tfidf.transform``. A single string is wrapped in a
        one-element list so it is treated as one document.
    cv : object
        Unused; kept so existing calls with three positional arguments
        continue to work.
    tfidf : fitted vectorizer/transformer
        Must provide ``transform`` and ``get_feature_names_out`` (or the
        older ``get_feature_names``).
    topn : int, default 10
        Number of top-scoring entries to return.

    Returns
    -------
    dict
        Feature name -> TF-IDF score rounded to three decimals. When
        ``data`` contains several documents, the non-zero entries of all
        rows are ranked together.
    """
    # A bare string would otherwise be rejected (or iterated character by
    # character) by the vectorizer.
    if isinstance(data, str):
        data = [data]

    # `get_feature_names` was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only for older installations.
    if hasattr(tfidf, "get_feature_names_out"):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()

    # Generate the TF-IDF weights for the given document(s).
    tf_idf_vector = tfidf.transform(data)

    # Rank the non-zero entries by descending score.
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    # Keep only the `topn` best entries.
    return extract_topn_from_vector(feature_names, sorted_items, topn)

# Pre-Processing

In [28]:
# Load dataset: read video_data1.csv and video_data2.csv from the current
# working directory and stack them into a single frame.
# All files are read first and concatenated once, instead of growing a frame
# inside the loop starting from an empty DataFrame.
frames = [
    pd.read_csv(os.path.join(os.getcwd(), f'video_data{i}.csv'))
    for i in range(1, 3)
]
# ignore_index=True gives every row a unique label; without it each file's
# 0..n-1 index is repeated in the combined frame.
df = pd.concat(frames, ignore_index=True)

# Cleaned, lemmatised version of each title, used by every model below.
df['processed_title'] = df['title'].apply(preprocess_text)

In [29]:
# Build TF-IDF features from the processed titles.
# A token is a run of at least three ASCII letters between word boundaries.
token_pattern = r"(?u)\b[a-zA-Z]{3,}\b"
tf_vectorizer = TfidfVectorizer(
    token_pattern=token_pattern,
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=2,            # drop terms found in fewer than 2 titles
    max_df=0.8,          # drop terms found in more than 80% of titles
    use_idf=True,
    norm='l2',
)
tfidf = tf_vectorizer.fit_transform(df['processed_title'])

# Dense copy of the sparse matrix: one row per title, one column per term.
tfidf_df = pd.DataFrame(
    data=tfidf.toarray(),
    index=df.index,
    columns=tf_vectorizer.get_feature_names_out(),
)
tfidf_df.shape

(2355, 2374)

In [30]:
# Raw term counts over the processed titles.
cv = CountVectorizer(
    ngram_range=(1, 3),   # single words, bigrams and trigrams
    max_features=10000,   # cap on the vocabulary size
    max_df=0.95,          # drop terms found in more than 95% of titles
)
word_count_vector = cv.fit_transform(df['processed_title'])

# Keyword Modelling

In [31]:
# Concatenate every processed title into one corpus string for the extractors
# below. Each title is followed by a single space (including the last one),
# exactly as the previous `+=` loop produced, but built with one join instead
# of repeated string concatenation.
text = "".join(title + " " for title in df['processed_title'])

## KeyBERT

In [34]:
# Preview the first five raw (unprocessed) titles for reference.
df['title'].head()

0                                       Coinex: Complete Guide To Leveraged Crypto Trading In 2025
1                                  Building a Blockchain in Under 15 Minutes - Programmer explains
2    $300M+ cryptocurrency hacks. How they happened and what we've learned. Ivan on Tech explains.
3                                            Programmer explains Ethereum | Future of the Internet
4                   Bitcoin Hard Fork - Risk of losing Bitcoin in November?  - Programmer explains
Name: title, dtype: object

In [35]:
# KeyBERT is already imported in the notebook's import cell, so it is not
# imported again here.

# Initialize KeyBERT with the 'distilbert-base-nli-mean-tokens' embedding model.
model = KeyBERT('distilbert-base-nli-mean-tokens')

# Extract the ten best keywords from the concatenated corpus.
# NOTE(review): `text` is every processed title joined into one long string;
# the embedding model presumably truncates long inputs, in which case only
# the start of the corpus shapes the document embedding. Confirm, or extract
# per title instead.
keywords = model.extract_keywords(text, top_n=10)

# Print one (keyword, score) pair per line.
print("Keywords:")
for keyword in keywords:
    print(keyword)

Keywords:
('cryptocurrencie', 0.4062)
('bitcoinnew', 0.4031)
('hacker', 0.4021)
('bitcoin', 0.3831)
('cryptocurrency', 0.3723)
('cryptozombie', 0.3632)
('podcast', 0.3612)
('youtubevideo', 0.3585)
('cryptocrypto', 0.3578)
('brexit', 0.3566)


## YAKE

In [16]:
# Ask YAKE for its ten best keyphrases over the concatenated corpus and keep
# only the first element of each result (the phrase text).
# NOTE(review): this rebinds `keywords`, which previously held KeyBERT's
# (keyword, score) pairs, to a plain list of strings.
extractor = yake.KeywordExtractor(top=10)
keywords = [phrase for phrase, *_ in extractor.extract_keywords(text)]

In [17]:
# Show the YAKE keyphrases as a plain Python list.
print(keywords)

['programmer explain bitcoin', 'bitcoin bitcoin bitcoin', 'week crypto bitcoin', 'programmer explain crypto', 'crypto bull run', 'crypto market analysis', 'bitcoin ethereum crypto', 'live crypto market', 'crypto crypto market', 'week crypto crypto']
