<a href="https://colab.research.google.com/github/Ashritha0848/NLP/blob/main/2403A52229%2CAssignment_6_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import re
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF


In [29]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [30]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)


In [None]:
Task 1

In [33]:
df_excel = pd.read_excel("/content/LDA-Data.xlsx")   # Sample dataset
docs = df_excel['News'].astype(str).tolist()

In [34]:
clean_docs = [preprocess(doc) for doc in docs]


In [35]:
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(clean_docs)


In [36]:
lda_model = LatentDirichletAllocation(
    n_components=3,
    random_state=42
)

lda_output = lda_model.fit_transform(bow_matrix)


In [37]:
words = vectorizer.get_feature_names_out()

for i, topic in enumerate(lda_model.components_):
    print(f"\nTask 1 – Topic {i+1}:")
    print([words[j] for j in topic.argsort()[-10:]])



Task 1 – Topic 1:
['state', 'match', 'century', 'scored', 'virat', 'wicket', 'took', 'bumra', 'bjp', 'election']

Task 1 – Topic 2:
['state', 'election', 'bjp', 'took', 'bumra', 'wicket', 'century', 'virat', 'scored', 'match']

Task 1 – Topic 3:
['scored', 'wicket', 'took', 'bumra', 'bjp', 'election', 'government', 'congress', 'form', 'state']


In [44]:
# Get dominant topic for each document
df_excel['Dominant_Topic'] = np.argmax(lda_output, axis=1) + 1

# View statement + topic
df_excel[['News', 'Dominant_Topic']].head()


Unnamed: 0,News,Dominant_Topic
0,Virat scored century in match,2
1,BJP won in elections,1
2,Bumra took 5 wicket in a match,2
3,Congress form state government,3


Task 2

In [67]:
df_csv = pd.read_csv("/content/arxiv_data.csv")   # Kaggle dataset
docs = df_csv['summaries'].astype(str).tolist()

In [69]:

clean_docs = [preprocess(doc) for doc in docs]
print("Clean docs:", len(clean_docs))


Clean docs: 51774


In [71]:
vectorizer = CountVectorizer(max_features=5000)
bow_matrix = vectorizer.fit_transform(clean_docs)

In [83]:
lda_model = LatentDirichletAllocation(
    n_components=5,
    random_state=42
)

lda_output = lda_model.fit_transform(bow_matrix)

In [84]:
words = vectorizer.get_feature_names_out()

for i, topic in enumerate(lda_model.components_):
    print(f"\nTask 2 – Topic {i+1}:")
    print([words[j] for j in topic.argsort()[-10:]])


Task 2 – Topic 1:
['series', 'machine', 'deep', 'approach', 'system', 'method', 'time', 'learning', 'model', 'data']

Task 2 – Topic 2:
['representation', 'function', 'task', 'reinforcement', 'policy', 'problem', 'method', 'algorithm', 'graph', 'learning']

Task 2 – Topic 3:
['performance', 'video', 'architecture', 'method', 'neural', 'task', 'attention', 'feature', 'model', 'network']

Task 2 – Topic 4:
['generative', 'learning', 'domain', 'training', 'data', 'adversarial', 'network', 'method', 'model', 'image']

Task 2 – Topic 5:
['approach', 'scene', 'feature', 'network', 'point', 'segmentation', 'detection', 'method', 'image', 'object']


In [85]:
lda_topics = pd.DataFrame(
    lda_output,
    columns=[f"Topic_{i+1}" for i in range(lda_output.shape[1])]
)

df_with_topics = pd.concat([df_csv.reset_index(drop=True), lda_topics], axis=1)

df_with_topics['Dominant_Topic'] = lda_topics.idxmax(axis=1)


In [87]:
df_with_topics[['summaries', 'Dominant_Topic']].head()


Unnamed: 0,summaries,Dominant_Topic
0,Stereo matching is one of the widely used tech...,Topic_5
1,The recent advancements in artificial intellig...,Topic_1
2,"In this paper, we proposed a novel mutual cons...",Topic_4
3,Consistency training has proven to be an advan...,Topic_4
4,"To ensure safety in automated driving, the cor...",Topic_5


Task 3

In [47]:
df_excel = pd.read_excel("/content/LDA-Data.xlsx")
docs = df_excel['News'].astype(str).tolist()


In [48]:
clean_docs = [preprocess(doc) for doc in docs]


In [49]:
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(clean_docs)


In [50]:
nmf_model = NMF(
    n_components=3,
    random_state=42
)

nmf_output = nmf_model.fit_transform(bow_matrix)


In [51]:
words = vectorizer.get_feature_names_out()

for i, topic in enumerate(nmf_model.components_):
    print(f"\nTask 3 – Topic {i+1}:")
    print([words[j] for j in topic.argsort()[-10:]])



Task 3 – Topic 1:
['election', 'government', 'form', 'took', 'state', 'wicket', 'century', 'virat', 'scored', 'match']

Task 3 – Topic 2:
['scored', 'wicket', 'virat', 'took', 'bjp', 'election', 'government', 'congress', 'form', 'state']

Task 3 – Topic 3:
['form', 'state', 'virat', 'scored', 'election', 'bjp', 'match', 'bumra', 'took', 'wicket']


In [53]:
# Get dominant topic for each document
df_excel['Dominant_Topic'] = np.argmax(nmf_output, axis=1) + 1

# View statement + topic
df_excel[['News', 'Dominant_Topic']].head()


Unnamed: 0,News,Dominant_Topic
0,Virat scored century in match,1
1,BJP won in elections,3
2,Bumra took 5 wicket in a match,3
3,Congress form state government,2


Task 4

In [57]:
df_csv = pd.read_csv("/content/arxiv_data.csv")
docs = df_csv['summaries'].astype(str).tolist()

In [63]:
clean_docs = [preprocess(doc) for doc in docs]

In [70]:
vectorizer = CountVectorizer(max_features=5000)
bow_matrix = vectorizer.fit_transform(clean_docs)


In [74]:
nmf_model = NMF(
    n_components=5,
    random_state=42
)

nmf_output = nmf_model.fit_transform(bow_matrix)


In [75]:
words = vectorizer.get_feature_names_out()

for i, topic in enumerate(nmf_model.components_):
    print(f"\nTask 4 – Topic {i+1}:")
    print([words[j] for j in topic.argsort()[-10:]])



Task 4 – Topic 1:
['reinforcement', 'approach', 'problem', 'representation', 'deep', 'task', 'algorithm', 'method', 'data', 'learning']

Task 4 – Topic 2:
['color', 'adversarial', 'training', 'proposed', 'using', 'result', 'method', 'network', 'segmentation', 'image']

Task 4 – Topic 3:
['convolutional', 'classification', 'task', 'information', 'structure', 'representation', 'node', 'neural', 'network', 'graph']

Task 4 – Topic 4:
['using', 'neural', 'show', 'network', 'performance', 'prediction', 'time', 'training', 'data', 'model']

Task 4 – Topic 5:
['performance', 'video', 'propose', 'proposed', 'point', 'network', 'detection', 'feature', 'object', 'method']


In [78]:
df_csv['Dominant_Topic'] = np.argmax(nmf_output, axis=1) + 1
df_csv[['summaries', 'Dominant_Topic']].head()

Unnamed: 0,summaries,Dominant_Topic
0,Stereo matching is one of the widely used tech...,2
1,The recent advancements in artificial intellig...,2
2,"In this paper, we proposed a novel mutual cons...",4
3,Consistency training has proven to be an advan...,4
4,"To ensure safety in automated driving, the cor...",5


Task 5

In [2]:
import pandas as pd

# Load Excel file
df = pd.read_excel("/content/LDA-Data.xlsx")   # upload this file in Colab

# Extract text column
corpus = df['News'].astype(str).tolist()

print("Number of documents:", len(corpus))

Number of documents: 4


In [3]:
import re
import nltk
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF


In [6]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [16]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase
    text = text.lower()

    # Remove numbers & special characters
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Stopword removal & lemmatization
    tokens = [lemmatizer.lemmatize(word)
              for word in tokens if word not in stop_words]

    # Rejoin
    return " ".join(tokens)

# Apply preprocessing
clean_corpus = [preprocess_text(doc) for doc in corpus]

print(clean_corpus[:3])

['virat scored century match', 'bjp election', 'bumra took wicket match']


In [9]:
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2)
)

tfidf_matrix = tfidf_vectorizer.fit_transform(clean_corpus)


In [10]:
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out()
)

tfidf_df.head()


Unnamed: 0,bjp,bjp election,bumra,bumra took,century,century match,congress,congress form,election,form,...,scored,scored century,state,state government,took,took wicket,virat,virat scored,wicket,wicket match
0,0.0,0.0,0.0,0.0,0.388614,0.388614,0.0,0.0,0.0,0.0,...,0.388614,0.388614,0.0,0.0,0.0,0.0,0.388614,0.388614,0.0,0.0
1,0.57735,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.388614,0.388614,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.388614,0.388614,0.0,0.0,0.388614,0.388614
3,0.0,0.0,0.0,0.0,0.0,0.0,0.377964,0.377964,0.0,0.377964,...,0.0,0.0,0.377964,0.377964,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
num_topics = 5   # you can change this

nmf_model = NMF(
    n_components=num_topics,
    random_state=42
)

nmf_matrix = nmf_model.fit_transform(tfidf_matrix)


In [12]:
feature_names = tfidf_vectorizer.get_feature_names_out()
num_top_words = 10

topics = {}

for topic_idx, topic in enumerate(nmf_model.components_):
    top_features = [feature_names[i]
                    for i in topic.argsort()[:-num_top_words-1:-1]]

    topics[f"Topic {topic_idx+1}"] = top_features

topics


{'Topic 1': ['government',
  'form',
  'congress form',
  'state government',
  'congress',
  'state',
  'form state',
  'took',
  'took wicket',
  'virat'],
 'Topic 2': ['bjp',
  'bjp election',
  'election',
  'virat',
  'wicket match',
  'wicket',
  'virat scored',
  'state government',
  'took',
  'took wicket'],
 'Topic 3': ['form state',
  'state',
  'congress',
  'state government',
  'congress form',
  'form',
  'government',
  'took',
  'took wicket',
  'virat'],
 'Topic 4': ['virat scored',
  'virat',
  'scored century',
  'century',
  'century match',
  'scored',
  'match',
  'state government',
  'took',
  'took wicket'],
 'Topic 5': ['wicket match',
  'wicket',
  'took wicket',
  'took',
  'bumra took',
  'bumra',
  'match',
  'state government',
  'virat',
  'virat scored']}

In [13]:
for topic, words in topics.items():
    print(f"\n{topic}:")
    print(", ".join(words))



Topic 1:
government, form, congress form, state government, congress, state, form state, took, took wicket, virat

Topic 2:
bjp, bjp election, election, virat, wicket match, wicket, virat scored, state government, took, took wicket

Topic 3:
form state, state, congress, state government, congress form, form, government, took, took wicket, virat

Topic 4:
virat scored, virat, scored century, century, century match, scored, match, state government, took, took wicket

Topic 5:
wicket match, wicket, took wicket, took, bumra took, bumra, match, state government, virat, virat scored


In [14]:
df['Dominant_Topic'] = np.argmax(nmf_matrix, axis=1) + 1
df.head()


Unnamed: 0,News,Dominant_Topic
0,Virat scored century in match,4
1,BJP won in elections,2
2,Bumra took 5 wicket in a match,5
3,Congress form state government,1


Task 6

In [17]:
import pandas as pd

# Load CSV file
df = pd.read_csv("/content/arxiv_data.csv")   # upload in Colab

# Extract text column
corpus = df['summaries'].astype(str).tolist()

print("Number of documents:", len(corpus))

Number of documents: 51774


In [18]:
import re
import nltk
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF


In [20]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)

    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word)
              for word in tokens if word not in stop_words]

    return " ".join(tokens)

# Apply preprocessing
clean_corpus = [preprocess_text(doc) for doc in corpus]

print(clean_corpus[:3])


['stereo matching one widely used technique inferring depth stereo image owing robustness speed become one major topic research since find application autonomous driving robotic navigation reconstruction many field finding pixel correspondence nontextured occluded reflective area major challenge stereo matching recent development shown semantic cue image segmentation used improve result stereo matching many deep neural network architecture proposed leverage advantage semantic segmentation stereo matching paper aim give comparison among state art network term accuracy term speed higher importance realtime application', 'recent advancement artificial intelligence ai combined extensive amount data generated today clinical system led development imaging ai solution across whole value chain medical imaging including image reconstruction medical image segmentation imagebased diagnosis treatment planning notwithstanding success future potential ai medical imaging many stakeholder concerned po

In [22]:
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2)
)

tfidf_matrix = tfidf_vectorizer.fit_transform(clean_corpus)


In [23]:
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out()
)

tfidf_df.head()


Unnamed: 0,ability,ablation,ablation study,able,able achieve,able generate,able learn,abnormal,abnormality,absence,...,yet effective,yield,yielded,yielding,yolo,yolov,zero,zeroshot,zeroshot learning,zsl
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.060103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
num_topics = 5   # change as needed

nmf_model = NMF(
    n_components=num_topics,
    random_state=42
)

nmf_matrix = nmf_model.fit_transform(tfidf_matrix)


In [25]:
feature_names = tfidf_vectorizer.get_feature_names_out()
num_top_words = 10

topics = {}

for topic_idx, topic in enumerate(nmf_model.components_):
    top_words = [feature_names[i]
                 for i in topic.argsort()[:-num_top_words-1:-1]]

    topics[f"Topic {topic_idx+1}"] = top_words

topics


{'Topic 1': ['image',
  'model',
  'network',
  'data',
  'method',
  'segmentation',
  'training',
  'learning',
  'deep',
  'feature'],
 'Topic 2': ['policy',
  'learning',
  'reinforcement',
  'reinforcement learning',
  'algorithm',
  'agent',
  'rl',
  'environment',
  'reward',
  'function'],
 'Topic 3': ['graph',
  'node',
  'representation',
  'network',
  'gnns',
  'graph neural',
  'neural',
  'neural network',
  'learning',
  'structure'],
 'Topic 4': ['object',
  'detection',
  'video',
  'object detection',
  'feature',
  'detector',
  'scene',
  'frame',
  'depth',
  'method'],
 'Topic 5': ['point',
  'cloud',
  'point cloud',
  'shape',
  'lidar',
  'feature',
  'registration',
  'method',
  'local',
  'surface']}

In [26]:
for topic, words in topics.items():
    print(f"\n{topic}:")
    print(", ".join(words))



Topic 1:
image, model, network, data, method, segmentation, training, learning, deep, feature

Topic 2:
policy, learning, reinforcement, reinforcement learning, algorithm, agent, rl, environment, reward, function

Topic 3:
graph, node, representation, network, gnns, graph neural, neural, neural network, learning, structure

Topic 4:
object, detection, video, object detection, feature, detector, scene, frame, depth, method

Topic 5:
point, cloud, point cloud, shape, lidar, feature, registration, method, local, surface


In [27]:
df['Dominant_Topic'] = np.argmax(nmf_matrix, axis=1) + 1
df.head()


Unnamed: 0,titles,summaries,terms,Dominant_Topic
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']",1
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']",1
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']",1
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV'],1
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']",1
