<a href="https://colab.research.google.com/github/BandiSreesaicharan/NLP/blob/main/NLP_Lab_Assignment_06.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Task #1**

In [4]:
import pandas as pd

# Read the sample workbook from the Colab working directory and show it.
sample_path = '/content/sample.xlsx'
df = pd.read_excel(sample_path)
df

Unnamed: 0,News
0,Virat scored century in match
1,BJP won in elections
2,Bumra took five wickets in a match
3,Congress form state government


In [5]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used by the preprocessing pipeline, in this order.
for resource in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared NLTK helpers, filled on first use so that applying the pipeline to a
# whole column does not rebuild the stop-word set and lemmatizer for every row.
_NLTK_TOOLS = {}

# Compiled once instead of being re-declared inside every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    # NOTE: this last range is very wide (U+24C2..U+1F251) and also covers
    # non-emoji characters; the ASCII-only filter applied afterwards removes
    # those characters anyway, so the final output is unaffected.
    "\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE
)


def _get_nltk_tools():
    """Return the cached ``(lemmatizer, stop_words)`` pair, building it on first use."""
    if not _NLTK_TOOLS:
        _NLTK_TOOLS['lemmatizer'] = WordNetLemmatizer()
        _NLTK_TOOLS['stop_words'] = set(stopwords.words('english'))
    return _NLTK_TOOLS['lemmatizer'], _NLTK_TOOLS['stop_words']


def nltk_preprocessing_pipeline(text):
    """Clean a piece of text and return it as a space-joined string of lemmas.

    Steps, in order: strip URLs, HTML tags, @mentions and #hashtags; lowercase;
    drop emojis and every character that is not an ASCII letter, digit or
    whitespace; collapse whitespace; tokenize with ``word_tokenize``; remove
    English stop words; lemmatize the remaining tokens with
    ``WordNetLemmatizer``.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents an
            empty cell in an object column) are treated as empty text; any
            other non-string value is converted with ``str`` first.

    Returns:
        str: The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    # Missing cells would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    lemmatizer, stop_words = _get_nltk_tools()

    # 1. Strip markup-like noise before lowercasing
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # URLs
    text = re.sub(r'<.*?>', '', text)  # HTML tags
    text = re.sub(r'@\w+', '', text)  # mentions
    text = re.sub(r'#\w+', '', text)  # hashtags

    text = text.lower()

    text = _EMOJI_PATTERN.sub(r'', text)

    # Characters are deleted, not replaced by a space, so hyphenated words are
    # fused ("semi-supervised" -> "semisupervised").
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # 2. Word tokenization
    tokenized_words = word_tokenize(text)

    # 3 + 4. Drop stop words, then lemmatize what is left
    lemmatized_words = [
        lemmatizer.lemmatize(word)
        for word in tokenized_words
        if word not in stop_words
    ]

    # 5. Rejoin words
    return ' '.join(lemmatized_words)

print("NLTK preprocessing pipeline function created successfully!")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...


NLTK preprocessing pipeline function created successfully!


[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [6]:
# Run every headline through the preprocessing pipeline and keep the result
# next to the raw text.
df = df.assign(Clean_News=df['News'].map(nltk_preprocessing_pipeline))
df

Unnamed: 0,News,Clean_News
0,Virat scored century in match,virat scored century match
1,BJP won in elections,bjp election
2,Bumra took five wickets in a match,bumra took five wicket match
3,Congress form state government,congress form state government


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts of the cleaned headlines: terms present in more than 95%
# of the documents and scikit-learn's English stop words are left out.
vectorizer_options = {'max_df': 0.95, 'min_df': 1, 'stop_words': 'english'}
count_vectorizer = CountVectorizer(**vectorizer_options)
doc_term_matrix = count_vectorizer.fit_transform(df['Clean_News'])

In [8]:
feature_names = count_vectorizer.get_feature_names_out()
# `doc_term_matrix` is rebound below from the vectorizer's sparse matrix to a
# DataFrame, so on a second run of this cell it no longer has `.toarray()`.
# Densify only when that method is available to keep the cell re-runnable.
term_counts = doc_term_matrix.toarray() if hasattr(doc_term_matrix, 'toarray') else doc_term_matrix
doc_term_matrix = pd.DataFrame(term_counts, columns=feature_names)
doc_term_matrix

Unnamed: 0,bjp,bumra,century,congress,election,form,government,match,scored,state,took,virat,wicket
0,0,0,1,0,0,0,0,1,1,0,0,1,0
1,1,0,0,0,1,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,1,0,0,1,0,1
3,0,0,0,1,0,1,1,0,0,1,0,0,0


## **LDA**

In [9]:
from sklearn.decomposition import LatentDirichletAllocation

# Two-topic LDA on the headline counts; the seed is fixed for repeatable runs.
num_topics = 2
LDA = LatentDirichletAllocation(n_components=num_topics, random_state=42).fit(doc_term_matrix)
LDA

In [10]:
# Number of topic rows in the fitted model
LDA.components_.shape[0]

2

In [11]:
# List every vocabulary term per topic, ordered from lowest to highest weight
# (argsort is ascending, so the strongest terms come last on each line).
for topic_number, term_weights in enumerate(LDA.components_, start=1):
    print(f'Topic: {topic_number}')
    ascending_order = term_weights.argsort()
    # Each term is followed by a single space; print() then ends the line.
    print(''.join(f'{feature_names[idx]} ' for idx in ascending_order))

Topic: 1
virat century scored took bumra wicket match bjp election state congress government form 
Topic: 2
form government congress state election bjp wicket bumra took scored century virat match 


In [12]:
# Label each headline with the topic that has the largest LDA weight.
# NOTE(review): the column is called 'topic_LLM' although the ids come from
# the LDA model; the name is kept as-is here.
document_topics = LDA.transform(doc_term_matrix)
df = df.assign(topic_LLM=document_topics.argmax(axis=1))
df.loc[:, ['News', 'topic_LLM']]

Unnamed: 0,News,topic_LLM
0,Virat scored century in match,1
1,BJP won in elections,0
2,Bumra took five wickets in a match,1
3,Congress form state government,0


## **NMF**

In [13]:
from sklearn.decomposition import NMF

num_topics = 2
# NOTE(review): the assignment below rebinds the name `NMF` from the imported
# class to the fitted model instance; the class has to be imported again before
# another NMF model can be constructed.
NMF = NMF(n_components=num_topics, random_state=42).fit(doc_term_matrix)
print('NMF Model implemented and fitted Successfully')

NMF Model implemented and fitted Successfully


In [14]:
# `NMF` is the fitted model here; pick the strongest component per headline.
document_topics = NMF.transform(doc_term_matrix)
df = df.assign(topic_NMF=document_topics.argmax(axis=1))
df.loc[:, ['News', 'topic_NMF']]

Unnamed: 0,News,topic_NMF
0,Virat scored century in match,0
1,BJP won in elections,1
2,Bumra took five wickets in a match,0
3,Congress form state government,1


In [15]:
# Display the whole frame, including the columns added by the cells above.
df

Unnamed: 0,News,Clean_News,topic_LLM,topic_NMF
0,Virat scored century in match,virat scored century match,1,0
1,BJP won in elections,bjp election,0,1
2,Bumra took five wickets in a match,bumra took five wicket match,1,0
3,Congress form state government,congress form state government,0,1


# **Task #2**

In [16]:
# Only the first 1000 rows of the arXiv CSV are loaded.
arxiv_path = '/content/arxiv_data.csv'
df2 = pd.read_csv(arxiv_path, nrows=1000)
df2.head()

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


In [17]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Make sure the NLTK resources used by the preprocessing pipeline are present.
for resource in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared NLTK helpers, filled on first use so that applying the pipeline to a
# whole column does not rebuild the stop-word set and lemmatizer for every row.
_NLTK_TOOLS = {}

# Compiled once instead of being re-declared inside every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    # NOTE: this last range is very wide (U+24C2..U+1F251) and also covers
    # non-emoji characters; the ASCII-only filter applied afterwards removes
    # those characters anyway, so the final output is unaffected.
    "\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE
)


def _get_nltk_tools():
    """Return the cached ``(lemmatizer, stop_words)`` pair, building it on first use."""
    if not _NLTK_TOOLS:
        _NLTK_TOOLS['lemmatizer'] = WordNetLemmatizer()
        _NLTK_TOOLS['stop_words'] = set(stopwords.words('english'))
    return _NLTK_TOOLS['lemmatizer'], _NLTK_TOOLS['stop_words']


def nltk_preprocessing_pipeline(text):
    """Clean a piece of text and return it as a space-joined string of lemmas.

    Steps, in order: strip URLs, HTML tags, @mentions and #hashtags; lowercase;
    drop emojis and every character that is not an ASCII letter, digit or
    whitespace; collapse whitespace; tokenize with ``word_tokenize``; remove
    English stop words; lemmatize the remaining tokens with
    ``WordNetLemmatizer``.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents an
            empty cell in an object column) are treated as empty text; any
            other non-string value is converted with ``str`` first.

    Returns:
        str: The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    # Missing cells would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    lemmatizer, stop_words = _get_nltk_tools()

    # 1. Strip markup-like noise before lowercasing
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # URLs
    text = re.sub(r'<.*?>', '', text)  # HTML tags
    text = re.sub(r'@\w+', '', text)  # mentions
    text = re.sub(r'#\w+', '', text)  # hashtags

    text = text.lower()

    text = _EMOJI_PATTERN.sub(r'', text)

    # Characters are deleted, not replaced by a space, so hyphenated words are
    # fused ("semi-supervised" -> "semisupervised").
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # 2. Word tokenization
    tokenized_words = word_tokenize(text)

    # 3 + 4. Drop stop words, then lemmatize what is left
    lemmatized_words = [
        lemmatizer.lemmatize(word)
        for word in tokenized_words
        if word not in stop_words
    ]

    # 5. Rejoin words
    return ' '.join(lemmatized_words)

print("NLTK preprocessing pipeline function created successfully!")


NLTK preprocessing pipeline function created successfully!


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
# Clean both free-text columns; the raw text in each column is overwritten
# by its cleaned version.
for column in ('summaries', 'titles'):
    df2[column] = df2[column].apply(nltk_preprocessing_pipeline)
df2.head()

Unnamed: 0,titles,summaries,terms
0,survey semantic stereo matching semantic depth...,stereo matching one widely used technique infe...,"['cs.CV', 'cs.LG']"
1,futureai guiding principle consensus recommend...,recent advancement artificial intelligence ai ...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,enforcing mutual consistency hard region semis...,paper proposed novel mutual consistency networ...,"['cs.CV', 'cs.AI']"
3,parameter decoupling strategy semisupervised 3...,consistency training proven advanced semisuper...,['cs.CV']
4,backgroundforeground segmentation interior sen...,ensure safety automated driving correct percep...,"['cs.CV', 'cs.LG']"


In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts of the cleaned summaries, with the same vectorizer
# settings as in Task 1.
vectorizer_options = {'max_df': 0.95, 'min_df': 1, 'stop_words': 'english'}
count_vectorizer = CountVectorizer(**vectorizer_options)
doc_term_matrix = count_vectorizer.fit_transform(df2['summaries'])

In [21]:
# Vocabulary learned by the vectorizer, in column order
feature_names = count_vectorizer.get_feature_names_out()
# Dense copy of the counts as a labelled table (one column per term);
# `doc_term_matrix` itself stays sparse.
dense_counts = doc_term_matrix.toarray()
bow_df = pd.DataFrame(data=dense_counts, columns=feature_names)
bow_df.head()

Unnamed: 0,001074,002,003,00486,005,007fps,0088,01,011,012,...,youtube,youtubevos,zebrafish,zernike,zero,zerodice,zijdenbos,zone,zsi,zurich
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**LDA**

In [22]:
from sklearn.decomposition import LatentDirichletAllocation

# Two-topic LDA fitted on the sparse summary counts; the seed is fixed for
# repeatable runs.
num_topics = 2
LDA = LatentDirichletAllocation(n_components=num_topics, random_state=42).fit(doc_term_matrix)
LDA

In [24]:
# Label each summary with the topic that has the largest LDA weight.
document_topics = LDA.transform(doc_term_matrix)
df2 = df2.assign(topic_LDA=document_topics.argmax(axis=1))
df2.loc[:, ['summaries', 'topic_LDA']]

Unnamed: 0,summaries,topic_LDA
0,stereo matching one widely used technique infe...,0
1,recent advancement artificial intelligence ai ...,0
2,paper proposed novel mutual consistency networ...,1
3,consistency training proven advanced semisuper...,1
4,ensure safety automated driving correct percep...,0
...,...,...
995,accurate medical image segmentation essential ...,1
996,isointense stage accurate volumetric image seg...,1
997,complex segmentation task fully automatic syst...,1
998,lowshot learning method image classification s...,1


**NMF**

In [26]:
from sklearn.decomposition import NMF

# Two-component NMF fitted on the summary counts; the seed is fixed for
# repeatable runs.
num_topics = 2
nmf_model = NMF(n_components=num_topics, random_state=42).fit(doc_term_matrix)
print("NMF model initialized and fitted successfully.")

NMF model initialized and fitted successfully.


In [28]:
# Label each summary with its strongest NMF component.
document_topics = nmf_model.transform(doc_term_matrix)
df2 = df2.assign(topic_NMF=document_topics.argmax(axis=1))
df2.loc[:, ['summaries', 'topic_NMF']]

Unnamed: 0,summaries,topic_NMF
0,stereo matching one widely used technique infe...,1
1,recent advancement artificial intelligence ai ...,0
2,paper proposed novel mutual consistency networ...,0
3,consistency training proven advanced semisuper...,0
4,ensure safety automated driving correct percep...,0
...,...,...
995,accurate medical image segmentation essential ...,1
996,isointense stage accurate volumetric image seg...,1
997,complex segmentation task fully automatic syst...,1
998,lowshot learning method image classification s...,1
