# Import Libraries

In [22]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from collections import Counter
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation, NMF
from gensim.models import HdpModel, CoherenceModel
import gensim.corpora as corpora
import hdp
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from rake_nltk import Rake
import nltk
from nltk.corpus import stopwords
import yake
from keybert import KeyBERT
import re
import pytextrank
from collections import Counter
from string import punctuation
# from multi_rake import Rake

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_colwidth', None)  # Show full content in each cell
pd.set_option('display.width', 1000)  # Set max width

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

# Modules

In [6]:
def preprocess_text(text):
    # 1. Lowercase and strip leading/trailing whitespace
    doc = nlp(text.lower().strip())

    tokens = []
    for token in doc:
        # Skip stopwords and punctuation immediately
        if token.is_stop or token.is_punct:
            continue
        
        # We want only “pure” ASCII‐letters (if you also want to allow digits, hyphens or underscores, see the alternate regex below)
        if re.fullmatch(r"[A-Za-z]+", token.text):
            tokens.append(token.lemma_)

    return " ".join(tokens)

In [5]:
def preprocess_text(text):

    doc = nlp(text.lower().strip())  # Lowercase and remove whitespace
    
# Process tokens: lemmatize, filter stopwords/punct/numbers
    tokens = [
        token.lemma_ 
        for token in doc 
        if (
            (not token.is_stop) and
            (not token.is_punct) and 
            (token.is_alpha)
            # (re.fullmatch(r'[A-Za-z0-9 _-]+', token))
        )
    ]
    return ' '.join(tokens)

In [7]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

In [8]:
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        fname = feature_names[idx]
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [9]:
def get_keywords(data , cv, tfidf):

    feature_names=tfidf.get_feature_names()

    #generate tf-idf for the given document
    tf_idf_vector=tfidf.transform(data)

    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)
    
    return keywords

In [10]:
def get_hotwords(text):
    result = []
    pos_tag = ['PROPN', 'ADJ', 'NOUN'] 
    doc = nlp(text.lower()) 
    for token in doc:
        if(token.text in nlp.Defaults.stop_words or token.text in punctuation):
            continue
        if(token.pos_ in pos_tag):
            result.append(token.text)
    return result

# Pre-Processing

In [11]:
# Load dataset
df = pd.DataFrame()
for i in range(1,3):
    q_df = pd.read_csv(os.getcwd() + '/video_data' + str(i) + '.csv')
    df = pd.concat([df , q_df])
df['processed_title'] = df['title'].apply(preprocess_text)

In [12]:
# TF-IDF vectorization
token_pattern = r"(?u)\b[a-zA-Z]{3,}\b"
tf_vectorizer = TfidfVectorizer(
    min_df= 2 ,
    max_df=0.8 , 
    ngram_range=(1,2), 
    use_idf= True, 
    norm= 'l2', 
    token_pattern= token_pattern
    )
tfidf = tf_vectorizer.fit_transform(df['processed_title'])

tfidf_df = pd.DataFrame(tfidf.toarray(), columns= tf_vectorizer.get_feature_names_out(), index=df.index)
tfidf_df.shape

(2355, 2374)

In [13]:
cv=CountVectorizer(max_df=0.95,         # ignore words that appear in 95% of documents
                   max_features=10000,  # the size of the vocabulary
                   ngram_range=(1,3)    # vocabulary contains single words, bigrams, trigrams
                  )
word_count_vector=cv.fit_transform(df['processed_title'])

# Keyword Modelling

In [26]:
text = ""

for title in df['title']:
    text += title + " "

## KeyBERT

In [35]:
from keybert import KeyBERT

# Initialize the KeyBERT model
model = KeyBERT('distilbert-base-nli-mean-tokens')

# Extract keywords
keywords = model.extract_keywords(text , top_n= 10)

# Print the keywords
print("Keywords:")
for keyword in keywords:
    print(keyword)

Keywords:
('cryptocurrencie', 0.4062)
('bitcoinnew', 0.4031)
('hacker', 0.4021)
('bitcoin', 0.3831)
('cryptocurrency', 0.3723)
('cryptozombie', 0.3632)
('podcast', 0.3612)
('youtubevideo', 0.3585)
('cryptocrypto', 0.3578)
('brexit', 0.3566)


## YAKE

In [16]:
extractor = yake.KeywordExtractor(top=10)
keywords = [kw[0] for kw in extractor.extract_keywords(text)]

In [17]:
print(keywords)

['programmer explain bitcoin', 'bitcoin bitcoin bitcoin', 'week crypto bitcoin', 'programmer explain crypto', 'crypto bull run', 'crypto market analysis', 'bitcoin ethereum crypto', 'live crypto market', 'crypto crypto market', 'week crypto crypto']


## TextRank

In [None]:
nlp.add_pipe("textrank")
doc = nlp(text)

# examine the top-ranked phrases in the document
for phrase in doc._.phrases[:10]:
    print(phrase.text)

crypto crypto
bitcoin crypto
bitcoin crypto holder
steal bitcoin crypto
bitcoin crypto crypto coin march chance leak whale dump bitcoin
bitcoin bitcoin bitcoin bullish reversal pattern bitcoin target bitcoin
bitcoin crypto market bitcoin miami
crypto dip crypto news
crypto news crypto watchlist
bitcoin crash crypto news


## Keyword Extraction using Spacy

In [43]:
output = set(get_hotwords(text))
most_common_list = Counter(output).most_common(10)
for item in most_common_list:
  print(item[0])

jaanne
june
chart
absolute
panic
bridge
fund
dydx
globalist
deposit


## RAKE

In [34]:
r = Rake()
# Extract keywords from the text
r.extract_keywords_from_text(text)

# Get the ranked keywords
keywords = r.get_ranked_phrases_with_scores()

for score, kw in keywords:
    print("Keyword:", kw, "Score:", score)

Keyword: bankruptcy run amok recession confirmation july fed hide usd supply datum dogecoin week crypto crypto kill ripple case critical plus bank liquidity crypto week crypto bull run oct altcoin season strategy bitcoin nov ask cryptos pump short term watch june crypto week binance cancel usdc bank russia legalize crypto new nft trend crypto week bitcoin bullish news fiat money scam expose hyperinflation usd soon crypto news bitcoin destroy environment fiat bad week crypto bitcoin price prediction time high price drop halve ask answer bitcoin price suppression wall street antic fednow system bad bank dollar hegemony strong crypto use case bitcoin stock market crash soon crypto week crypto crypto adoption development fire cryptothisweek market phase low crypto cycle explain investing creation protection wealth bank poor crypto market random wild guess march exchange run ether eth explode week crypto market bleed underway eth merge fud important sept date crypto news week invest build w