<a href="https://colab.research.google.com/github/BandiSreesaicharan/NLP/blob/main/NLP_Lab_Assignment_05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import re
import pandas as pd

# Read arxiv_data.csv, capped at the first 1000 rows (cap kept for safety)
df = pd.read_csv(
    'arxiv_data.csv',
    nrows=1000,
    engine='python',
)

# Define preprocessing function

# Patterns are compiled once here instead of being rebuilt on every call.
# URLs: matched case-insensitively (this step runs before lowercasing) and only
# when a scheme separator or a dot follows, so words such as "HTTPS" are kept.
_URL_RE = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)
# HTML: a tag must open with a letter (optionally after '/') or with '!', so
# inequalities such as "x < 5 and y > 3" are not swallowed as if they were markup.
_HTML_TAG_RE = re.compile(r'<(?:/?[a-zA-Z]|!)[^<>]*>')
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "]+", flags=re.UNICODE)
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_text(text):
    """Reduce raw text to lowercase alphanumeric words separated by single spaces.

    Steps, in order: remove URLs, HTML tags, @mentions and #hashtags; lowercase;
    remove emojis; delete every character that is not a-z, 0-9 or whitespace;
    collapse runs of whitespace and trim the ends.

    Note: punctuation is deleted, not replaced by a space, so a hyphenated
    word such as "real-time" comes out as "realtime".

    Args:
        text: Value to clean. Anything that is not a ``str`` (e.g. None or NaN)
            is treated as missing.

    Returns:
        The cleaned string; "" for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Markup-like content is removed first, while the original casing and
    # punctuation are still available to anchor the patterns.
    for pattern in (_URL_RE, _HTML_TAG_RE, _MENTION_RE, _HASHTAG_RE):
        text = pattern.sub('', text)

    # Convert to lowercase
    text = text.lower()

    # Remove emojis
    text = _EMOJI_RE.sub('', text)

    # Remove special characters (keep only alphanumeric and whitespace)
    text = _NON_ALNUM_RE.sub('', text)

    # Normalize whitespace (reduce any run of whitespace to a single space)
    return _WHITESPACE_RE.sub(' ', text).strip()

# Clean every entry of the 'summaries' column and store the result alongside it
df['processed_summaries'] = df['summaries'].apply(preprocess_text)

# Show the first five rows, raw text next to cleaned text
print(df.loc[:, ['summaries', 'processed_summaries']].head(5))


                                           summaries  \
0  Stereo matching is one of the widely used tech...   
1  The recent advancements in artificial intellig...   
2  In this paper, we proposed a novel mutual cons...   
3  Consistency training has proven to be an advan...   
4  To ensure safety in automated driving, the cor...   

                                 processed_summaries  
0  stereo matching is one of the widely used tech...  
1  the recent advancements in artificial intellig...  
2  in this paper we proposed a novel mutual consi...  
3  consistency training has proven to be an advan...  
4  to ensure safety in automated driving the corr...  


In [12]:
# Recompute the cleaned-text column from the raw summaries
df['processed_summaries'] = df['summaries'].map(preprocess_text)


In [13]:
import nltk
from nltk.tokenize import word_tokenize

# Tokenizer models; 'punkt_tab' is the name required in newer NLTK versions
for tokenizer_pkg in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_pkg)

# Split each cleaned summary into a list of word tokens
df['tokenized_summaries'] = df['processed_summaries'].apply(word_tokenize)
print(df.loc[:, ['processed_summaries', 'tokenized_summaries']].head(5))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


                                 processed_summaries  \
0  stereo matching is one of the widely used tech...   
1  the recent advancements in artificial intellig...   
2  in this paper we proposed a novel mutual consi...   
3  consistency training has proven to be an advan...   
4  to ensure safety in automated driving the corr...   

                                 tokenized_summaries  
0  [stereo, matching, is, one, of, the, widely, u...  
1  [the, recent, advancements, in, artificial, in...  
2  [in, this, paper, we, proposed, a, novel, mutu...  
3  [consistency, training, has, proven, to, be, a...  
4  [to, ensure, safety, in, automated, driving, t...  


In [14]:
import nltk
from nltk.corpus import stopwords

# Fetch the stopword corpus if it is not present yet
nltk.download('stopwords')

# English stopwords, held in a set so membership checks are cheap
stop_words = set(stopwords.words('english'))

# Keep only tokens whose lowercase form is not a stopword
df['filtered_summaries'] = df['tokenized_summaries'].apply(
    lambda token_list: [tok for tok in token_list if tok.lower() not in stop_words]
)

# Show the first five rows before and after filtering
print(df.loc[:, ['tokenized_summaries', 'filtered_summaries']].head(5))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                 tokenized_summaries  \
0  [stereo, matching, is, one, of, the, widely, u...   
1  [the, recent, advancements, in, artificial, in...   
2  [in, this, paper, we, proposed, a, novel, mutu...   
3  [consistency, training, has, proven, to, be, a...   
4  [to, ensure, safety, in, automated, driving, t...   

                                  filtered_summaries  
0  [stereo, matching, one, widely, used, techniqu...  
1  [recent, advancements, artificial, intelligenc...  
2  [paper, proposed, novel, mutual, consistency, ...  
3  [consistency, training, proven, advanced, semi...  
4  [ensure, safety, automated, driving, correct, ...  


In [15]:
import nltk
from nltk.stem import WordNetLemmatizer

# WordNet data for the lemmatizer; 'omw-1.4' is optional and improves coverage
for corpus_name in ('wordnet', 'omw-1.4'):
    nltk.download(corpus_name)

# One lemmatizer instance shared by all rows
lemmatizer = WordNetLemmatizer()

# Lemmatize every token of every filtered summary (default part of speech)
df['lemmatized_summaries'] = df['filtered_summaries'].apply(
    lambda token_list: list(map(lemmatizer.lemmatize, token_list))
)

# Show the first five rows before and after lemmatization
print(df.loc[:, ['filtered_summaries', 'lemmatized_summaries']].head(5))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


                                  filtered_summaries  \
0  [stereo, matching, one, widely, used, techniqu...   
1  [recent, advancements, artificial, intelligenc...   
2  [paper, proposed, novel, mutual, consistency, ...   
3  [consistency, training, proven, advanced, semi...   
4  [ensure, safety, automated, driving, correct, ...   

                                lemmatized_summaries  
0  [stereo, matching, one, widely, used, techniqu...  
1  [recent, advancement, artificial, intelligence...  
2  [paper, proposed, novel, mutual, consistency, ...  
3  [consistency, training, proven, advanced, semi...  
4  [ensure, safety, automated, driving, correct, ...  


In [16]:
# Glue each lemmatized token list back into one space-separated string
df['clean_summaries'] = df['lemmatized_summaries'].apply(' '.join)

# Show the first five rows: token lists next to the joined strings
print(df.loc[:, ['lemmatized_summaries', 'clean_summaries']].head(5))


                                lemmatized_summaries  \
0  [stereo, matching, one, widely, used, techniqu...   
1  [recent, advancement, artificial, intelligence...   
2  [paper, proposed, novel, mutual, consistency, ...   
3  [consistency, training, proven, advanced, semi...   
4  [ensure, safety, automated, driving, correct, ...   

                                     clean_summaries  
0  stereo matching one widely used technique infe...  
1  recent advancement artificial intelligence ai ...  
2  paper proposed novel mutual consistency networ...  
3  consistency training proven advanced semisuper...  
4  ensure safety automated driving correct percep...  


In [17]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource the pipeline below relies on
# ('punkt_tab' is the tokenizer package name required in newer NLTK versions)
for resource_name in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource_name)

# Step 1: Define preprocessing function (regex cleaning)

# Patterns are compiled once here instead of being rebuilt on every call.
# URLs: matched case-insensitively (this step runs before lowercasing) and only
# when a scheme separator or a dot follows, so words such as "HTTPS" are kept.
_URL_RE = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)
# HTML: a tag must open with a letter (optionally after '/') or with '!', so
# inequalities such as "x < 5 and y > 3" are not swallowed as if they were markup.
_HTML_TAG_RE = re.compile(r'<(?:/?[a-zA-Z]|!)[^<>]*>')
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "]+", flags=re.UNICODE)
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_text(text):
    """Reduce raw text to lowercase alphanumeric words separated by single spaces.

    Steps, in order: remove URLs, HTML tags, @mentions and #hashtags; lowercase;
    remove emojis; delete every character that is not a-z, 0-9 or whitespace;
    collapse runs of whitespace and trim the ends.

    Note: punctuation is deleted, not replaced by a space, so a hyphenated
    word such as "real-time" comes out as "realtime".

    Args:
        text: Value to clean. Anything that is not a ``str`` (e.g. None or NaN)
            is treated as missing.

    Returns:
        The cleaned string; "" for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Markup-like content is removed first, while the original casing and
    # punctuation are still available to anchor the patterns.
    for pattern in (_URL_RE, _HTML_TAG_RE, _MENTION_RE, _HASHTAG_RE):
        text = pattern.sub('', text)

    # Convert to lowercase
    text = text.lower()

    # Remove emojis
    text = _EMOJI_RE.sub('', text)

    # Remove special characters (keep alphanumeric + whitespace)
    text = _NON_ALNUM_RE.sub('', text)

    # Normalize whitespace
    return _WHITESPACE_RE.sub(' ', text).strip()

# Step 2: Unified NLTK preprocessing pipeline

# Resources shared by every pipeline call. They are built lazily on first use,
# and re-running this cell empties the cache so they are rebuilt.
_PIPELINE_CACHE = {}


def _get_pipeline_resources():
    """Return (stop_words, lemmatizer), constructing them on the first call only."""
    if not _PIPELINE_CACHE:
        _PIPELINE_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PIPELINE_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PIPELINE_CACHE['stop_words'], _PIPELINE_CACHE['lemmatizer']


def nltk_preprocessing_pipeline(text):
    """Clean, tokenize, stopword-filter and lemmatize one piece of text.

    Stages: regex cleaning via ``preprocess_text``, ``word_tokenize``, removal
    of English stopwords, WordNet lemmatization (default part of speech), and
    re-joining the surviving lemmas with single spaces.

    Args:
        text: Raw text; passed unchanged to ``preprocess_text``.

    Returns:
        The processed text as one space-separated string.
    """
    # The stopword set and lemmatizer do not depend on `text`; reusing them
    # avoids re-reading the stopword corpus for every row.
    stop_words, lemmatizer = _get_pipeline_resources()

    # Regex cleaning, then tokenization
    tokens = word_tokenize(preprocess_text(text))

    # Stopword removal and lemmatization in a single pass
    lemmatized_tokens = [
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    ]

    # Rejoin words
    return ' '.join(lemmatized_tokens)

# Step 3: Apply pipeline to dataset
# Reading the CSV again rebinds `df` to a fresh frame, so it holds only the
# columns from the file plus the one added below.
df = pd.read_csv(
    'arxiv_data.csv',
    nrows=1000,
    engine='python',
)
df['clean_summaries_pipeline'] = df['summaries'].apply(nltk_preprocessing_pipeline)

# Show the first five rows: raw summary next to the pipeline output
print(df.loc[:, ['summaries', 'clean_summaries_pipeline']].head(5))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


                                           summaries  \
0  Stereo matching is one of the widely used tech...   
1  The recent advancements in artificial intellig...   
2  In this paper, we proposed a novel mutual cons...   
3  Consistency training has proven to be an advan...   
4  To ensure safety in automated driving, the cor...   

                            clean_summaries_pipeline  
0  stereo matching one widely used technique infe...  
1  recent advancement artificial intelligence ai ...  
2  paper proposed novel mutual consistency networ...  
3  consistency training proven advanced semisuper...  
4  ensure safety automated driving correct percep...  


In [18]:
import pandas as pd
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Ensure necessary NLTK resources are available
# (the 'punkt_tab' and '..._eng' names are the ones required in newer NLTK versions)
for resource_name in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource_name)

# `df` is intentionally not reloaded in this cell, so its existing columns are kept.

# Step 1: Tokenize the raw 'summaries' column (always recomputed here).
# Non-string values such as NaN yield an empty token list; str(NaN) would
# otherwise be tokenized into the literal word "nan".
df['tokenized_summaries'] = df['summaries'].apply(
    lambda x: word_tokenize(x) if isinstance(x, str) else []
)

# Step 2: Apply POS tagging to all tokenized summaries
df['pos_summaries'] = df['tokenized_summaries'].apply(pos_tag)

# Preview results
print(df[['tokenized_summaries', 'pos_summaries']].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


                                 tokenized_summaries  \
0  [Stereo, matching, is, one, of, the, widely, u...   
1  [The, recent, advancements, in, artificial, in...   
2  [In, this, paper, ,, we, proposed, a, novel, m...   
3  [Consistency, training, has, proven, to, be, a...   
4  [To, ensure, safety, in, automated, driving, ,...   

                                       pos_summaries  
0  [(Stereo, NNP), (matching, NN), (is, VBZ), (on...  
1  [(The, DT), (recent, JJ), (advancements, NNS),...  
2  [(In, IN), (this, DT), (paper, NN), (,, ,), (w...  
3  [(Consistency, NN), (training, NN), (has, VBZ)...  
4  [(To, TO), (ensure, VB), (safety, NN), (in, IN...  


In [19]:
# Run the full cleaning pipeline over the raw summaries and store the result
df['clean_summaries_pipeline'] = df['summaries'].map(nltk_preprocessing_pipeline)


In [20]:
import spacy
from collections import Counter
# Load the small English spaCy model and parse the first cleaned summary
nlp = spacy.load("en_core_web_sm")
doc = nlp(df['clean_summaries_pipeline'][0])

# Collect token texts by coarse part of speech: nouns (common + proper) and verbs
nouns = [tok.text for tok in doc if tok.pos_ in ("NOUN", "PROPN")]
verbs = [tok.text for tok in doc if tok.pos_ == "VERB"]

# Count how often each noun / verb occurs
noun_freq = Counter(nouns)
verb_freq = Counter(verbs)

print("Noun Frequency:", noun_freq)
print("Verb Frequency:", verb_freq)

Noun Frequency: Counter({'stereo': 5, 'matching': 3, 'image': 2, 'speed': 2, 'application': 2, 'segmentation': 2, 'network': 2, 'term': 2, 'technique': 1, 'depth': 1, 'topic': 1, 'research': 1, 'find': 1, 'navigation': 1, '3d': 1, 'reconstruction': 1, 'field': 1, 'correspondence': 1, 'area': 1, 'challenge': 1, 'development': 1, 'cue': 1, 'result': 1, 'architecture': 1, 'leverage': 1, 'advantage': 1, 'paper': 1, 'aim': 1, 'comparison': 1, 'state': 1, 'art': 1, 'accuracy': 1, 'importance': 1, 'realtime': 1})
Verb Frequency: Counter({'used': 2, 'matching': 1, 'inferring': 1, 'owing': 1, 'become': 1, 'driving': 1, 'finding': 1, 'nontextured': 1, 'shown': 1, 'improve': 1, 'proposed': 1, 'give': 1})


In [22]:
import pandas as pd
import nltk
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize
from collections import Counter

# Ensure resources are available (tokenizer, word list, NE chunker, POS tagger)
for resource_name in (
    'punkt',
    'punkt_tab',
    'words',
    'maxent_ne_chunker_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource_name)

# Reload dataset safely (rebinds `df`, so it starts from the file's columns only)
df = pd.read_csv('arxiv_data.csv', engine='python', nrows=1000)

# Step 1: Tokenize the raw summaries.
# Non-string values such as NaN yield an empty token list; str(NaN) would
# otherwise be tokenized into the literal word "nan".
df['tokenized_summaries'] = df['summaries'].apply(
    lambda x: word_tokenize(x) if isinstance(x, str) else []
)

# Step 2: POS tagging
df['pos_summaries'] = df['tokenized_summaries'].apply(pos_tag)

# Step 3: NER
def extract_entities(pos_tags):
    """Chunk POS-tagged tokens with ``ne_chunk`` and list the labelled chunks.

    Args:
        pos_tags: Sequence of (token, pos) pairs, as passed to ``ne_chunk``.

    Returns:
        A list of (entity_text, label) tuples, one per top-level node of the
        chunk result that has a ``label`` attribute. ``entity_text`` is that
        node's leaf tokens joined by single spaces.
    """
    tree = ne_chunk(pos_tags)
    # Only nodes exposing `label` are kept; other top-level items are skipped.
    return [
        (" ".join(word for word, _pos in node.leaves()), node.label())
        for node in tree
        if hasattr(node, 'label')
    ]

# Try the extractor on the first ten rows only, working on an independent copy
df_sample = df.iloc[:10].copy()
df_sample['named_entities'] = df_sample['pos_summaries'].apply(extract_entities)

print("Sample named entities:")
print(df_sample.loc[:, ['summaries', 'named_entities']])

# Step 4: Frequency analysis (on sample for speed)
# Flatten the per-row (entity, label) pairs into one list of entity strings
all_entities = [
    entity_text
    for row_entities in df_sample['named_entities']
    for entity_text, _label in row_entities
]

entity_freq = Counter(all_entities)
print("\nTop 10 most frequent named entities in sample:")
print(entity_freq.most_common(10))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Sample named entities:
                                           summaries  \
0  Stereo matching is one of the widely used tech...   
1  The recent advancements in artificial intellig...   
2  In this paper, we proposed a novel mutual cons...   
3  Consistency training has proven to be an advan...   
4  To ensure safety in automated driving, the cor...   
5  High-quality training data play a key role in ...   
6  Semantic segmentation of fine-resolution urban...   
7  To mitigate the radiologist's workload, comput...   
8  Generalising deep models to new data from new ...   
9  The success of deep learning methods in medica...   

                                      named_entities  
0                                    [(Stereo, GPE)]  
1  [(AI, ORGANIZATION), (AI, ORGANIZATION), (Euro...  
2                                                 []  
3  [(Consistency, GSP), (Atrial Segmentation, ORG...  
4  [(Gaussian Mixture Models, PERSON), (GMM, ORGA...  
5                         [(Ed