<a href="https://colab.research.google.com/github/BandaAkshitha/Natural-Language-Processing/blob/main/NLP_6_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

LDA with sample data

In [None]:
import pandas as pd
# Read the sample spreadsheet from the Colab workspace and preview its first rows.
data = pd.read_excel(io='/content/NLP_Corpus.xlsx')
display(data.head(n=5))

Unnamed: 0,Document ID,Text
0,1,Virat scored a century in the match
1,2,BJP won in elections
2,3,Bumrah took five wickets in a match
3,4,Congress formed the state government


In [None]:
# Hand-typed sample corpus used for the rest of this section.
# NOTE(review): these sentences are simplified variants of the 'Text' column
# previewed above (e.g. "5 wicket" vs "five wickets"), so the spreadsheet
# itself is not what gets modelled here — confirm this is intentional.
corpus = [
    "Virat scored century in match",
    "BJP won in elections",
    "Bumrah took 5 wicket in a match",
    "Congress form state government"
]

In [None]:
# Alias: `documents` is the same list object as `corpus` (no copy is made).
documents = corpus

Text Preprocessing

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource this section relies on. 'punkt_tab' is included
# because recent NLTK releases load it for nltk.word_tokenize; it was
# previously only downloaded much later in the notebook, so the tokenization
# cell below could fail on a fresh kernel.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

# English stop words, minus 'won': the sample sentence "BJP won in elections"
# needs that word kept as a content term.
stop_words = set(stopwords.words('english'))
stop_words.remove('won')
lemmatizer = WordNetLemmatizer()

# Filled by the preprocessing loop in the next code cell.
processed_docs = []



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Clean → Tokenize → Stopwords → Lemma → Rejoin

In [None]:
# Reset the accumulator here so re-running this cell does not append the same
# documents a second time (it was previously only initialised in another cell).
processed_docs = []

for doc in documents:
    # Lowercase
    doc = doc.lower()

    # Remove numbers & punctuation (everything except a-z and whitespace)
    doc = re.sub(r'[^a-z\s]', '', doc)

    # Tokenization
    tokens = nltk.word_tokenize(doc)

    # Stopword removal + Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Rejoin into a single space-separated string for CountVectorizer
    processed_docs.append(" ".join(tokens))

processed_docs

['virat scored century match',
 'bjp won election',
 'bumrah took wicket match',
 'congress form state government']

Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the cleaned sample documents and count term
# occurrences per document in one step.
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(raw_documents=processed_docs)

BoW as DataFrame

In [None]:
import pandas as pd

# One row per document, one column per vocabulary term, cell = term count.
bow_df = pd.DataFrame(data=bow.toarray(), columns=vectorizer.get_feature_names_out())
bow_df

Unnamed: 0,bjp,bumrah,century,congress,election,form,government,match,scored,state,took,virat,wicket,won
0,0,0,1,0,0,0,0,1,1,0,0,1,0,0
1,1,0,0,0,1,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,0,0,1,0,0,1,0,1,0
3,0,0,0,1,0,1,1,0,0,1,0,0,0,0


Apply LDA

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Two-topic LDA on the sample bag-of-words matrix; the seed is fixed so the
# fitted topics are repeatable. fit() returns the estimator itself, so `lda`
# is the fitted model and is also what the cell displays.
lda = LatentDirichletAllocation(n_components=2, random_state=42).fit(bow)
lda

Identify Words for Each Topic

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the top-weighted terms of each topic in ``model.components_``.

    Args:
        model: object exposing ``components_``, iterated one topic row at a
            time; each row must support ``argsort()``.
        feature_names: indexable sequence mapping column position to term.
        no_top_words: how many of the highest-weighted terms to list per topic.

    Output is written with ``print``: a blank line plus ``Topic N:`` header
    (N starts at 1) followed by the list of terms, highest weight first.
    """
    topic_number = 0
    for weights in model.components_:
        topic_number += 1
        # argsort is ascending; reverse it and keep the leading entries.
        ranked_positions = weights.argsort()[::-1][:no_top_words]
        top_terms = []
        for position in ranked_positions:
            top_terms.append(feature_names[position])
        print(f"\nTopic {topic_number}:")
        print(top_terms)

# Column-position -> term lookup from the fitted vectorizer, then show the
# five strongest terms of each sample-data LDA topic.
feature_names = vectorizer.get_feature_names_out()
display_topics(lda, feature_names, no_top_words=5)


Topic 1:
['match', 'scored', 'century', 'virat', 'wicket']

Topic 2:
['form', 'state', 'government', 'congress', 'won']


LDA with Kaggle Data (arXiv Paper Titles)

In [None]:
import pandas as pd
# Load the arXiv dataset from the Colab workspace and preview the first rows.
data = pd.read_csv(filepath_or_buffer='/content/arxiv_data.csv')
display(data.head(n=5))

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


Import Libraries

In [None]:
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

Download required NLTK resources

In [None]:
# Download the NLTK resources in the same order as before; the cell output is
# the status returned by the last download.
download_status = [
    nltk.download(resource)
    for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab')
]
download_status[-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

Prepare Corpus

In [None]:
# Keep only the non-missing paper titles, as a plain Python list.
corpus = data.loc[data['titles'].notna(), 'titles'].tolist()

Inspect the First Document

In [None]:
# Sanity check: show the first raw title before any preprocessing.
print(corpus[0])

Survey on Semantic Stereo Matching / Semantic Depth Estimation


Text Preprocessing

In [None]:
# Shared preprocessing resources: lemmatizer instance and the English
# stop-word list as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

In [None]:
def preprocess_text(text):
    """Normalize one title into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase; replace every character other than ``a-z`` or
    whitespace with a space; tokenize with ``word_tokenize``; drop tokens that
    are in ``stop_words`` or have two or fewer characters; lemmatize the rest
    with the shared ``lemmatizer``.

    Args:
        text: the raw document string.

    Returns:
        The surviving lemmas joined by single spaces (may be empty).
    """
    # Lowercase
    text = text.lower()

    # Replace numbers & special characters with a space. Substituting '' would
    # glue the two halves of hyphenated or slash-separated words together
    # ("semi-supervised" -> "semisupervised") and pollute the vocabulary.
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords & short words, then lemmatize what is left
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 2
    )

Apply preprocessing

In [None]:
# Run every title through the preprocessing pipeline, preserving order.
clean_corpus = list(map(preprocess_text, corpus))

Bag of Words

In [None]:
# Bag-of-words counts over the cleaned titles. Terms appearing in more than
# 95% of documents or in fewer than 2 documents are left out of the vocabulary.
vectorizer = CountVectorizer(min_df=2, max_df=0.95)
bow = vectorizer.fit_transform(raw_documents=clean_corpus)

BoW as DataFrame

In [None]:
# Build the document-term table as a sparse-backed DataFrame. Calling
# bow.toarray() here would materialize every (document, term) cell of the
# full corpus as a dense array just to preview five rows.
bow_df = pd.DataFrame.sparse.from_spmatrix(
    bow,
    columns=vectorizer.get_feature_names_out()
)

bow_df.head()

Unnamed: 0,aaa,aadnet,aaformer,aamdrl,aaseg,abandoning,abc,abcnet,abdmot,abdnet,...,zhu,zigzag,zone,zoo,zoom,zoomin,zooming,zoomintocheck,zoomnet,zootuning
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Apply LDA

Identify Words for Each Topic

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print each topic's strongest terms as one space-separated line.

    Args:
        model: object exposing ``components_``, iterated one topic row at a
            time; each row must support ``argsort()``.
        feature_names: indexable sequence mapping column position to term.
        no_top_words: number of highest-weighted terms to print per topic.

    For every topic a blank line and a ``Topic N:`` header (N starts at 1)
    are printed, followed by the terms in descending weight order.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # Descending order of weight, truncated to the requested count.
        strongest = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[position] for position in strongest]
        print(f"\nTopic {topic_number}:")
        print(" ".join(top_terms))

In [None]:
# Fit LDA on the arXiv bag-of-words matrix before inspecting topics. Without
# this step `lda` would still be the 2-topic model trained on the 4-sentence
# sample corpus, whose vocabulary does not match this vectorizer.
# NOTE(review): n_components=5 matches the five topics shown in the recorded
# output; random_state=42 mirrors the other models in this notebook — the
# original fitting cell is not present, so confirm the seed.
lda = LatentDirichletAllocation(
    n_components=5,
    random_state=42
)
lda.fit(bow)

display_topics(
    lda,
    vectorizer.get_feature_names_out(),
    no_top_words=10
)


Topic 1:
network image adversarial generative segmentation generation semantic learning neural deep

Topic 2:
network graph neural convolutional detection using image attention recognition deep

Topic 3:
learning reinforcement representation unsupervised using deep visual depth via estimation

Topic 4:
detection object learning point cloud image data model network transfer

Topic 5:
learning deep image reinforcement time using representation data series graph


Final Topic Modeling Result (Document → Topic)

In [None]:
# Per-document topic weights from the fitted LDA model, labelled
# "Topic 1" ... "Topic K" where K is the model's number of components.
doc_topics = lda.transform(bow)
topic_df = pd.DataFrame(
    data=doc_topics,
    columns=["Topic " + str(number) for number in range(1, lda.n_components + 1)],
)
topic_df.head()

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5
0,0.282688,0.025341,0.641133,0.025471,0.025367
1,0.427584,0.016758,0.016944,0.016907,0.521807
2,0.414548,0.020543,0.523916,0.02053,0.020464
3,0.02598,0.025319,0.02556,0.025422,0.89772
4,0.029117,0.88248,0.028587,0.02982,0.029996


In [None]:
# Label each document with its highest-weight topic. Only the numeric topic
# columns are compared, so this cell can be re-run safely after the string
# column 'Dominant_Topic' has already been added to the frame.
topic_df['Dominant_Topic'] = topic_df.select_dtypes(include='number').idxmax(axis=1)
topic_df.head()

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Dominant_Topic
0,0.282688,0.025341,0.641133,0.025471,0.025367,Topic 3
1,0.427584,0.016758,0.016944,0.016907,0.521807,Topic 5
2,0.414548,0.020543,0.523916,0.02053,0.020464,Topic 3
3,0.02598,0.025319,0.02556,0.025422,0.89772,Topic 5
4,0.029117,0.88248,0.028587,0.02982,0.029996,Topic 2


NMF with Sample Data (BoW)

In [None]:
import pandas as pd
# Reload the sample spreadsheet for the NMF section and preview it.
data = pd.read_excel(io='/content/NLP_Corpus.xlsx')
display(data.head(n=5))

Unnamed: 0,Document ID,Text
0,1,Virat scored a century in the match
1,2,BJP won in elections
2,3,Bumrah took five wickets in a match
3,4,Congress formed the state government


Import Required Libraries & Download NLTK Resources

In [None]:
import re
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
# Download tokenizer, stop-word and WordNet data; the cell output is the
# status returned by the final download.
download_status = [nltk.download(name) for name in ('punkt', 'stopwords', 'wordnet')]
download_status[-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Take the documents from the 'Text' column of the spreadsheet loaded into
# `data` at the top of this section instead of retyping the same sentences,
# so the model always reflects the file that was actually read.
documents = data['Text'].dropna().tolist()

In [None]:
# Alias the document list as `corpus` (same list object) and show it.
corpus = documents
print(corpus)

['Virat scored a century in the match', 'BJP won in elections', 'Bumrah took five wickets in a match', 'Congress formed the state government']


Text Preprocessing

In [None]:
# Preprocessing resources. 'won' is taken out of the stop-word set so it
# survives filtering in "BJP won in elections".
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}
stop_words.remove('won')

In [None]:
def preprocess_text(text):
    """Clean one document and return its lemmatized tokens joined by spaces.

    The text is lowercased, stripped of every character other than ``a-z``
    and whitespace, tokenized with ``word_tokenize``, filtered against
    ``stop_words`` and a minimum length of three characters, and finally
    lemmatized with the shared ``lemmatizer``.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())   # remove special characters & numbers
    kept = []
    for token in word_tokenize(letters_only):
        # Skip stop words and tokens of one or two characters.
        if token in stop_words or len(token) <= 2:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

Apply preprocessing

In [None]:
# Preprocess every sample document (order preserved) and display the result.
clean_corpus = list(map(preprocess_text, corpus))
clean_corpus

['virat scored century match',
 'bjp won election',
 'bumrah took five wicket match',
 'congress formed state government']

Bag of Words for NMF

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts for the cleaned sample documents; these feed NMF below.
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(raw_documents=clean_corpus)

BOW as DataFrame

In [None]:
import pandas as pd

# Dense view of the small sample matrix: rows are documents, columns are terms.
bow_df = pd.DataFrame(data=bow.toarray(), columns=vectorizer.get_feature_names_out())
bow_df

Unnamed: 0,bjp,bumrah,century,congress,election,five,formed,government,match,scored,state,took,virat,wicket,won
0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,1,0,0,1,0,0,1,0,1,0
3,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0


Apply NMF

In [None]:
from sklearn.decomposition import NMF

# Two-component NMF on the sample counts with a fixed seed. fit() returns the
# estimator itself, so `nmf` is the fitted model and is what the cell displays.
nmf = NMF(random_state=42, n_components=2).fit(bow)
nmf

Identify Words for Each Topic

In [None]:
def display_topics(model, feature_names, top_words):
    """Print the ``top_words`` strongest terms of every topic in ``model``.

    Args:
        model: object exposing ``components_``, iterated one topic row at a
            time; each row must support ``argsort()``.
        feature_names: indexable sequence mapping column position to term.
        top_words: number of terms to print per topic.

    Each topic is printed as a blank line, a ``Topic N:`` header (N starts at
    1) and one space-separated line of terms, highest weight first.
    """
    def _strongest_terms(weights):
        # Reverse the ascending argsort and keep the leading positions.
        positions = weights.argsort()[::-1][:top_words]
        return [feature_names[position] for position in positions]

    for number, weights in enumerate(model.components_, start=1):
        print(f"\nTopic {number}:")
        print(" ".join(_strongest_terms(weights)))

In [None]:
# Show the five highest-weighted terms of each NMF topic for the sample data.
display_topics(nmf, vectorizer.get_feature_names_out(), 5)


Topic 1:
match took wicket bumrah five

Topic 2:
state formed government congress election


Topic Modeling (Document → Topic)

In [None]:
# Document-topic weights from the fitted NMF model.
# NOTE(review): the two column labels assume component 1 is the sports topic
# and component 2 the politics topic, as in the recorded output — re-check the
# top terms if the model or data changes.
doc_topic = nmf.transform(bow)
topic_labels = ["Sports Topic", "Politics Topic"]
topic_df = pd.DataFrame(data=doc_topic, columns=topic_labels)
topic_df

Unnamed: 0,Sports Topic,Politics Topic
0,0.5389437,0.0
1,6.579722e-08,0.000402
2,0.872031,0.0
3,0.0,1.045276


In [None]:
# Label each document with its highest-weight topic. Only the numeric topic
# columns are compared, so this cell can be re-run safely after the string
# column 'Dominant_Topic' has already been added to the frame.
topic_df['Dominant_Topic'] = topic_df.select_dtypes(include='number').idxmax(axis=1)
topic_df

Unnamed: 0,Sports Topic,Politics Topic,Dominant_Topic
0,0.5389437,0.0,Sports Topic
1,6.579722e-08,0.000402,Politics Topic
2,0.872031,0.0,Sports Topic
3,0.0,1.045276,Politics Topic


NMF with Kaggle Data (BoW)

In [None]:
import pandas as pd
# Load the arXiv dataset and preview only the first rows with rich display,
# matching the other sections, instead of printing the entire frame as text.
data = pd.read_csv('/content/arxiv_data.csv')
display(data.head())

                                                  titles  \
0      Survey on Semantic Stereo Matching / Semantic ...   
1      FUTURE-AI: Guiding Principles and Consensus Re...   
2      Enforcing Mutual Consistency of Hard Regions f...   
3      Parameter Decoupling Strategy for Semi-supervi...   
4      Background-Foreground Segmentation for Interio...   
...                                                  ...   
51769  Hierarchically-coupled hidden Markov models fo...   
51770                         Blinking Molecule Tracking   
51771  Towards a Mathematical Foundation of Immunolog...   
51772  A Semi-Automatic Graph-Based Approach for Dete...   
51773  SparseCodePicking: feature extraction in mass ...   

                                               summaries  \
0      Stereo matching is one of the widely used tech...   
1      The recent advancements in artificial intellig...   
2      In this paper, we proposed a novel mutual cons...   
3      Consistency training has proven 

Import required Libraries

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
# Download tokenizer, stop-word and WordNet data in order; the value shown by
# the cell is the status of the last download.
required_resources = ('punkt', 'stopwords', 'wordnet')
download_status = [nltk.download(resource) for resource in required_resources]
download_status[-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Non-missing paper titles as a plain Python list.
corpus = data.loc[data['titles'].notna(), 'titles'].tolist()

Text Preprocessing

In [None]:
# Shared preprocessing resources used by preprocess_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    """Normalize one title into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase; replace every character other than ``a-z`` or
    whitespace with a space; tokenize with ``word_tokenize``; drop tokens that
    are in ``stop_words`` or have two or fewer characters; lemmatize the rest.

    Args:
        text: the raw document string.

    Returns:
        The surviving lemmas joined by single spaces (may be empty).
    """
    text = text.lower()
    # Replace numbers & special characters with a space; deleting them outright
    # would merge hyphenated words ("self-supervised" -> "selfsupervised").
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(w)
        for w in tokens
        if w not in stop_words and len(w) > 2
    )

Apply Preprocessing

In [None]:
# Apply the preprocessing pipeline to each title, keeping the original order.
clean_corpus = list(map(preprocess_text, corpus))

Bag of Words

In [None]:
# Term counts for the cleaned titles, ignoring terms found in more than 95%
# of documents or in fewer than 2 documents.
vectorizer = CountVectorizer(min_df=2, max_df=0.95)
bow = vectorizer.fit_transform(raw_documents=clean_corpus)

BoW as DataFrame

In [None]:
# Keep the document-term table sparse-backed: densifying the full corpus with
# bow.toarray() allocates every (document, term) cell only to preview 5 rows.
bow_df = pd.DataFrame.sparse.from_spmatrix(
    bow,
    columns=vectorizer.get_feature_names_out()
)

bow_df.head()

Unnamed: 0,aaa,aadnet,aaformer,aamdrl,aaseg,abandoning,abc,abcnet,abdmot,abdnet,...,zhu,zigzag,zone,zoo,zoom,zoomin,zooming,zoomintocheck,zoomnet,zootuning
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Apply NMF

In [None]:
# Five-component NMF on the arXiv term counts with a fixed seed. fit() returns
# the estimator itself, so `nmf` is the fitted model shown as the cell output.
nmf = NMF(random_state=42, n_components=5).fit(bow)
nmf

Identify Words for Each Topic

In [None]:
def display_topics(model, feature_names, top_words):
    """Print a header and the leading terms for each row of ``model.components_``.

    Args:
        model: object exposing ``components_``, iterated one topic row at a
            time; each row must support ``argsort()``.
        feature_names: indexable sequence mapping column position to term.
        top_words: number of highest-weighted terms to print per topic.
    """
    topic_number = 0
    for weights in model.components_:
        topic_number += 1
        # Positions sorted by descending weight, limited to the requested count.
        best_positions = weights.argsort()[::-1][:top_words]
        line = " ".join(map(lambda position: feature_names[position], best_positions))
        print(f"\nTopic {topic_number}:")
        print(line)
# Ten strongest terms for each of the arXiv NMF (BoW) topics.
display_topics(nmf, vectorizer.get_feature_names_out(), top_words=10)


Topic 1:
learning reinforcement representation via transfer unsupervised graph machine data selfsupervised

Topic 2:
network neural graph convolutional adversarial generative using attention via prediction

Topic 3:
image segmentation using medical semantic based generation adversarial classification generative

Topic 4:
detection object using video feature point salient cloud based anomaly

Topic 5:
deep using model data classification time based series neural prediction


Topic Modeling (Document → Topic)

In [None]:
# Document-topic weights from the fitted NMF model, one "Topic N" column per
# component.
doc_topic = nmf.transform(bow)

topic_df = pd.DataFrame(
    doc_topic,
    columns=[f"Topic {i+1}" for i in range(nmf.n_components)]
)

# Highest-weight topic per document. (A stray mid-cell `topic_df.head()` was
# dropped: only a cell's last expression is rendered, so it showed nothing.)
topic_df['Dominant_Topic'] = topic_df.idxmax(axis=1)
topic_df.head()

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Dominant_Topic
0,0.001633,0.002225,0.021521,0.007925,0.005932,Topic 3
1,0.00081,0.000307,0.009523,0.0,0.000384,Topic 3
2,0.0,0.0,0.141636,0.0,0.0,Topic 3
3,0.000904,0.0,0.04206,0.0,0.0,Topic 3
4,0.0,0.0,0.040913,0.0,0.0,Topic 3


NMF with Sample Data (TF-IDF)

In [None]:
import pandas as pd
# Reload the sample spreadsheet for the TF-IDF section and preview it.
data = pd.read_excel(io='/content/NLP_Corpus.xlsx')
display(data.head(n=5))

Unnamed: 0,Document ID,Text
0,1,Virat scored a century in the match
1,2,BJP won in elections
2,3,Bumrah took five wickets in a match
3,4,Congress formed the state government


Prepare Corpus

In [None]:
# Take the documents from the 'Text' column of the spreadsheet loaded into
# `data` at the top of this section instead of retyping the same sentences,
# so the model always reflects the file that was actually read.
documents = data['Text'].dropna().tolist()
corpus = documents
print(corpus)

['Virat scored a century in the match', 'BJP won in elections', 'Bumrah took five wickets in a match', 'Congress formed the state government']


Import Libraries & Download NLTK Resources

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Download tokenizer, stop-word and WordNet data; the last download's status
# is the value rendered by the cell.
download_status = []
for resource in ('punkt', 'stopwords', 'wordnet'):
    download_status.append(nltk.download(resource))
download_status[-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Text Preprocessing
Clean → Tokenize → Stopword Removal → Lemmatize → Rejoin

In [None]:
# Preprocessing resources; 'won' is removed from the stop words so it is kept
# as a content term in "BJP won in elections".
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}
stop_words.remove('won')

def preprocess_text(text):
    """Return the lemmatized, stop-word-free tokens of ``text`` joined by spaces.

    The text is lowercased and stripped of every character other than ``a-z``
    and whitespace before tokenizing with ``word_tokenize``. No minimum token
    length is applied in this variant.
    """
    cleaned = re.sub(r'[^a-z\s]', '', text.lower())
    lemmas = []
    for token in word_tokenize(cleaned):
        if token not in stop_words:
            lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

Apply preprocessing

In [None]:
# Preprocess each sample document in order and display the cleaned list.
clean_corpus = list(map(preprocess_text, corpus))
clean_corpus

['virat scored century match',
 'bjp won election',
 'bumrah took five wicket match',
 'congress formed state government']

TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights for the cleaned sample documents (default vectorizer settings).
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(raw_documents=clean_corpus)

TF-IDF as DataFrame

In [None]:
import pandas as pd

# Dense view of the small TF-IDF matrix: rows are documents, columns are terms.
tfidf_df = pd.DataFrame(data=tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df

Unnamed: 0,bjp,bumrah,century,congress,election,five,formed,government,match,scored,state,took,virat,wicket,won
0,0.0,0.0,0.525473,0.0,0.0,0.0,0.0,0.0,0.414289,0.525473,0.0,0.0,0.525473,0.0,0.0
1,0.57735,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735
2,0.0,0.465162,0.0,0.0,0.0,0.465162,0.0,0.0,0.366739,0.0,0.0,0.465162,0.0,0.465162,0.0
3,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0


Apply NMF

In [None]:
from sklearn.decomposition import NMF

# Two-component NMF on the sample TF-IDF matrix with a fixed seed. fit()
# returns the estimator itself, so `nmf` is the fitted model the cell displays.
nmf = NMF(random_state=42, n_components=2).fit(tfidf)
nmf

Identify Words for Each Topic

In [None]:
def display_topics(model, feature_names, top_words):
    """Print, per topic, a ``Topic N:`` header and its strongest terms.

    Args:
        model: object exposing ``components_``, iterated one topic row at a
            time; each row must support ``argsort()``.
        feature_names: indexable sequence mapping column position to term.
        top_words: number of highest-weighted terms to print per topic.
    """
    for number, weights in enumerate(model.components_, start=1):
        # Highest weight first, cut to the requested number of terms.
        leading = weights.argsort()[::-1][:top_words]
        terms = " ".join(feature_names[position] for position in leading)
        # Header and term line are emitted on consecutive lines.
        print(f"\nTopic {number}:", terms, sep="\n")

In [None]:
# Five strongest terms for each NMF (TF-IDF) topic on the sample data.
display_topics(nmf, tfidf_vectorizer.get_feature_names_out(), 5)


Topic 1:
match scored virat century took

Topic 2:
state formed government congress election


Topic Modeling (Document → Topic)

In [None]:
# Document-topic weights from the fitted NMF model.
# NOTE(review): the labels assume component 1 is the sports topic and
# component 2 the politics topic, as in the recorded output — re-check the
# top terms if the model or data changes.
doc_topic = nmf.transform(tfidf)

topic_df = pd.DataFrame(
    doc_topic,
    columns=["Sports Topic", "Politics Topic"]
)

# Highest-weight topic per document. (A stray mid-cell `topic_df` expression
# was dropped: only a cell's last expression is rendered.)
topic_df['Dominant_Topic'] = topic_df.idxmax(axis=1)
topic_df

Unnamed: 0,Sports Topic,Politics Topic,Dominant_Topic
0,0.589529,0.0,Sports Topic
1,0.000472,0.1121,Politics Topic
2,0.589529,0.0,Sports Topic
3,0.0,0.840193,Politics Topic


NMF with Kaggle Data (TF-IDF)


In [None]:
import pandas as pd
# Load the arXiv dataset from the Colab workspace and preview the first rows.
data = pd.read_csv(filepath_or_buffer='/content/arxiv_data.csv')
display(data.head(n=5))

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


Import required libraries

In [None]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
# Download tokenizer, stop-word and WordNet data; the cell shows the status of
# the final download.
download_status = list(map(nltk.download, ('punkt', 'stopwords', 'wordnet')))
download_status[-1]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Non-missing paper titles as a plain Python list.
corpus = data.loc[data['titles'].notna(), 'titles'].tolist()

Text Preprocessing

In [None]:
# Shared preprocessing resources used by preprocess_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    """Normalize one title into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase; replace every character other than ``a-z`` or
    whitespace with a space; tokenize with ``word_tokenize``; drop tokens that
    are in ``stop_words`` or have two or fewer characters; lemmatize the rest.

    Args:
        text: the raw document string.

    Returns:
        The surviving lemmas joined by single spaces (may be empty).
    """
    text = text.lower()
    # Replace numbers & special characters with a space; deleting them outright
    # would merge hyphenated words ("self-supervised" -> "selfsupervised").
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(w)
        for w in tokens
        if w not in stop_words and len(w) > 2
    )

Apply preprocessing

In [None]:
# Apply the preprocessing pipeline to each title, keeping the original order.
clean_corpus = list(map(preprocess_text, corpus))

TF-IDF

In [None]:
# TF-IDF weights for the cleaned titles, ignoring terms found in more than 95%
# of documents or in fewer than 2 documents.
tfidf_vectorizer = TfidfVectorizer(min_df=2, max_df=0.95)
tfidf = tfidf_vectorizer.fit_transform(raw_documents=clean_corpus)

TF-IDF as DataFrame

In [None]:
# Keep the TF-IDF table sparse-backed: densifying the full corpus with
# tfidf.toarray() allocates every (document, term) cell only to preview 5 rows.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    tfidf,
    columns=tfidf_vectorizer.get_feature_names_out()
)

tfidf_df.head()

Unnamed: 0,aaa,aadnet,aaformer,aamdrl,aaseg,abandoning,abc,abcnet,abdmot,abdnet,...,zhu,zigzag,zone,zoo,zoom,zoomin,zooming,zoomintocheck,zoomnet,zootuning
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Apply NMF

In [None]:
# Five-component NMF on the arXiv TF-IDF matrix with a fixed seed. fit()
# returns the estimator itself, so `nmf` is the fitted model the cell displays.
nmf = NMF(random_state=42, n_components=5).fit(tfidf)
nmf

Identify Words for Each Topic

In [None]:
def display_topics(model, feature_names, top_words):
    """Print the strongest ``top_words`` terms for each topic of ``model``.

    Args:
        model: object exposing ``components_``, iterated one topic row at a
            time; each row must support ``argsort()``.
        feature_names: indexable sequence mapping column position to term.
        top_words: number of highest-weighted terms to print per topic.

    Every topic produces a blank line, a ``Topic N:`` header (N starts at 1)
    and one space-separated line of terms in descending weight order.
    """
    for offset, weights in enumerate(model.components_):
        descending = weights.argsort()[::-1]
        chosen = []
        for position in descending[:top_words]:
            chosen.append(feature_names[position])
        print(f"\nTopic {offset + 1}:")
        print(" ".join(chosen))

In [None]:
# Ten strongest terms for each of the arXiv NMF (TF-IDF) topics.
display_topics(nmf, tfidf_vectorizer.get_feature_names_out(), top_words=10)


Topic 1:
network neural graph convolutional deep attention recurrent classification using prediction

Topic 2:
learning reinforcement deep representation transfer unsupervised via model data using

Topic 3:
detection object video salient point cloud feature anomaly tracking monocular

Topic 4:
image segmentation semantic medical using based classification transformer attention generation

Topic 5:
adversarial generative network model using conditional attack generation training data


Topic Modeling (Document → Topic)

In [None]:
# Document-topic weights from the fitted NMF model, one "Topic N" column per
# component.
doc_topic = nmf.transform(tfidf)

topic_df = pd.DataFrame(
    doc_topic,
    columns=[f"Topic {i+1}" for i in range(nmf.n_components)]
)

# Highest-weight topic per document. (A stray mid-cell `topic_df.head()` was
# dropped: only a cell's last expression is rendered, so it showed nothing.)
topic_df['Dominant_Topic'] = topic_df.idxmax(axis=1)
topic_df.head()

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Dominant_Topic
0,0.0,0.003676,0.005023,0.037947,0.0,Topic 4
1,0.000156,0.000777,0.0,0.00897,0.0,Topic 4
2,0.0,0.0,0.0,0.069807,0.0,Topic 4
3,0.0,0.000821,0.0,0.028108,0.0,Topic 4
4,0.0,0.0,0.0,0.024694,0.0,Topic 4
