# NLP Assignment No. 03

## Title: Text cleaning, lemmatization, stopword removal, label encoding, TF-IDF representation, and saving the outputs

In [None]:
# Install spaCy and TextBlob if not already installed
# (-q keeps pip quiet; no versions are pinned here)
!pip install -q spacy textblob
# Fetch the corpora TextBlob uses (the [nltk_data] lines in this cell's output)
!python -m textblob.download_corpora
# Download the small English spaCy pipeline that is loaded later with spacy.load
!python -m spacy download en_core_web_sm

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━

In [None]:
import pandas as pd
import numpy as np
import nltk
import string
import pickle
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import spacy
from textblob import TextBlob

from google.colab import files

In [None]:
# Download necessary NLTK resources
# 'punkt_tab' is fetched here as well: recent NLTK releases make word_tokenize
# load it instead of the legacy 'punkt' data, and the lemmatizer helpers call
# word_tokenize, so every tokenizer resource must exist before they are used.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load spaCy model
# "en_core_web_sm" is the pipeline downloaded in the install cell at the top.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Upload dataset
# Uses the Colab upload helper; its return value is kept in `uploaded`.
# NOTE(review): the recorded output shows the file was saved as
# "News_dataset (1).pickle" — confirm the load cell reads the intended copy.
uploaded = files.upload()

Saving News_dataset.pickle to News_dataset (1).pickle


In [None]:
# Load dataset
# NOTE: unpickling can execute arbitrary code, so only open files you trust.
with open("News_dataset.pickle", "rb") as dataset_file:
    data = pickle.load(dataset_file)

# Wrap the unpickled object in a DataFrame and show the first rows.
df = pd.DataFrame(data)
print("Dataset Preview:", df.head(), sep="\n")

Dataset Preview:
  File_Name                                            Content  Category  \
0   001.txt  Ad sales boost Time Warner profit\r\n\r\nQuart...  business   
1   002.txt  Dollar gains on Greenspan speech\r\n\r\nThe do...  business   
2   003.txt  Yukos unit buyer faces loan claim\r\n\r\nThe o...  business   
3   004.txt  High fuel prices hit BA's profits\r\n\r\nBriti...  business   
4   005.txt  Pernod takeover talk lifts Domecq\r\n\r\nShare...  business   

  Complete_Filename  id  News_length  
0  001.txt-business   1         2569  
1  002.txt-business   1         2257  
2  003.txt-business   1         1557  
3  004.txt-business   1         2421  
4  005.txt-business   1         1575  


In [None]:
# Check column names
# Lists the available columns so the text and label columns can be picked next.
print("\nColumns:", df.columns)


Columns: Index(['File_Name', 'Content', 'Category', 'Complete_Filename', 'id',
       'News_length'],
      dtype='object')


In [None]:
# Select text and label column
# The article body is in 'Content'. 'File_Name' only holds values such as
# "001.txt", which clean down to the single token "txt" and leave TF-IDF with
# a one-term vocabulary.
text_column = 'Content'
label_column = 'Category'

In [None]:
# Stopwords
# English stop words from NLTK, kept as a set; the lemmatizer helpers below
# test each token for membership in it.
stop_words = set(stopwords.words('english'))

# --- Lemmatization Methods ---

In [None]:
# Text Cleaning Function
def clean_text(text):
    """Normalize raw text before tokenization.

    Steps, in order: lowercase, drop URLs, drop @mentions and '#' signs,
    drop ASCII punctuation, drop digit runs.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Whitespace is not collapsed, so removed spans can
        leave consecutive spaces behind.
    """
    text = text.lower()
    # 'http\S+' already covers 'https...', so no separate alternative is needed.
    text = re.sub(r"http\S+|www\S+", '', text)  # remove URLs
    # Mentions are '@' followed by word characters; the earlier pattern r'\@w+'
    # only matched '@' followed by literal 'w's. This must run before the
    # punctuation pass, which would otherwise strip the '@' and keep the handle.
    text = re.sub(r'@\w+|#', '', text)  # remove @mentions and '#'
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    text = re.sub(r'\d+', '', text)  # remove numbers
    return text

In [None]:
# 1. WordNet Lemmatizer
wn_lemmatizer = WordNetLemmatizer()
def lemmatize_wordnet(text):
    """Clean `text`, tokenize it with NLTK, drop stop words and lemmatize.

    Returns the surviving lemmas joined by single spaces.
    """
    cleaned_tokens = nltk.word_tokenize(clean_text(text))
    kept_tokens = (token for token in cleaned_tokens if token not in stop_words)
    return ' '.join(wn_lemmatizer.lemmatize(token) for token in kept_tokens)

In [None]:
# 2. TextBlob Lemmatizer
def lemmatize_textblob(text):
    """Clean `text`, split it into TextBlob words, drop stop words and lemmatize.

    Returns the surviving lemmas joined by single spaces.
    """
    lemmas = []
    for word in TextBlob(clean_text(text)).words:
        if word in stop_words:
            continue
        lemmas.append(word.lemmatize())
    return ' '.join(lemmas)

In [None]:
# 3. spaCy Lemmatizer
def lemmatize_spacy(text):
    """Clean `text`, run it through the spaCy pipeline and return its lemmas.

    Tokens are skipped when their text is a stop word, when spaCy flags them
    as punctuation, or when they are pure whitespace.

    Returns the surviving lemmas joined by single spaces.
    """
    doc = nlp(clean_text(text))
    lemmas = [
        token.lemma_
        for token in doc
        # Whitespace tokens are excluded so that runs of spaces/newlines left
        # over from cleaning do not get joined into the output.
        if token.text not in stop_words
        and not token.is_punct
        and not token.is_space
    ]
    return ' '.join(lemmas)


In [None]:
# Apply any one method here:
# NOTE(review): a later cell assigns df['clean_text'] again using preprocess,
# so the spaCy result computed here is overwritten.
print("\nApplying lemmatization using spaCy...")
df['clean_text'] = df[text_column].apply(lemmatize_spacy)  # You can switch to lemmatize_wordnet or lemmatize_textblob


Applying lemmatization using spaCy...


In [None]:
# Show the lemmatized column next to its source column to eyeball the result.
df['clean_text'] ,df[text_column]

(0       txt
 1       txt
 2       txt
 3       txt
 4       txt
        ... 
 2220    txt
 2221    txt
 2222    txt
 2223    txt
 2224    txt
 Name: clean_text, Length: 2225, dtype: object,
 0       001.txt
 1       002.txt
 2       003.txt
 3       004.txt
 4       005.txt
          ...   
 2220    397.txt
 2221    398.txt
 2222    399.txt
 2223    400.txt
 2224    401.txt
 Name: File_Name, Length: 2225, dtype: object)

In [None]:
# Lemmatizer and Stopword Remover
# (same steps as lemmatize_wordnet: clean, tokenize, drop stop words, lemmatize)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess(text):
    """Return the WordNet lemmas of the non-stop-word tokens in `text`.

    The text is cleaned with clean_text and tokenized with nltk.word_tokenize;
    the resulting lemmas are joined by single spaces.
    """
    lemmas = []
    for word in nltk.word_tokenize(clean_text(text)):
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmas)

In [None]:
# nltk is already imported in the imports cell near the top; this cell fetches
# the 'punkt_tab' tokenizer data before preprocess is applied below.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Apply preprocessing
# Replaces any existing 'clean_text' column with the output of preprocess.
print("\nCleaning and preprocessing text...")
df['clean_text'] = df[text_column].apply(preprocess)


Cleaning and preprocessing text...


In [None]:
# Label Encoding
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df[label_column])
df['label'] = encoded_labels

# Print each class name alongside the integer code the encoder assigns to it.
class_names = label_encoder.classes_
class_codes = label_encoder.transform(class_names)
print("\nLabel Encoding Map:")
print(dict(zip(class_names, class_codes)))


Label Encoding Map:
{'business': np.int64(0), 'entertainment': np.int64(1), 'politics': np.int64(2), 'sport': np.int64(3), 'tech': np.int64(4)}


In [None]:
# TF-IDF Representation
# Fit the vectorizer on the cleaned documents and vectorize them in one step.
tfidf = TfidfVectorizer()
cleaned_documents = df['clean_text']
tfidf_matrix = tfidf.fit_transform(cleaned_documents)

# Report the dimensions of the resulting matrix.
matrix_shape = tfidf_matrix.shape
print("\nTF-IDF Shape:", matrix_shape)


TF-IDF Shape: (2225, 1)


In [None]:
# Save processed outputs
# The DataFrame goes to CSV; the TF-IDF matrix and the fitted label encoder
# are pickled to their own files.
df.to_csv("cleaned_news.csv", index=False)

pickled_artifacts = (
    ("tfidf_matrix.pkl", tfidf_matrix),
    ("label_encoder.pkl", label_encoder),
)
for pickle_path, artifact in pickled_artifacts:
    with open(pickle_path, "wb") as f:
        pickle.dump(artifact, f)

print("\n✅ Outputs saved: 'cleaned_news.csv', 'tfidf_matrix.pkl', 'label_encoder.pkl'")


✅ Outputs saved: 'cleaned_news.csv', 'tfidf_matrix.pkl', 'label_encoder.pkl'
