# NLP Assignment 3: Text Preprocessing & TF-IDF

This notebook performs:
- Text Cleaning
- Stopword Removal
- Lemmatization
- Label Encoding
- TF-IDF Feature Extraction
- Saving Outputs

In [1]:
!pip install nltk scikit-learn pandas





[notice] A new release of pip is available: 25.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:

import pickle
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


In [3]:

# Download NLTK resources (run once)
# Corpora fetched for the later cells: 'stopwords' feeds stopwords.words(),
# 'wordnet' backs WordNetLemmatizer; 'omw-1.4' is presumably the multilingual
# WordNet data the lemmatizer also needs on recent NLTK releases (confirm).
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    # nltk.download() reports failure through its boolean return value instead
    # of raising, so check it here rather than failing later with a LookupError.
    if not nltk.download(resource):
        raise RuntimeError(f"Failed to download NLTK resource {resource!r}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:

# Sample dataset
# Each sample is a (sentence, sentiment label) pair; the pairs are split into
# the two parallel columns that make up the DataFrame.
samples = [
    ("I love machine learning!", "positive"),
    ("NLP is amazing and very useful.", "positive"),
    ("I hate spam emails.", "negative"),
    ("Machine learning is the future.", "positive"),
]
texts, labels = zip(*samples)

data = {"text": list(texts), "label": list(labels)}

df = pd.DataFrame(data)
df


Unnamed: 0,text,label
0,I love machine learning!,positive
1,NLP is amazing and very useful.,positive
2,I hate spam emails.,negative
3,Machine learning is the future.,positive


In [5]:

# Text cleaning function
def clean_text(text) -> str:
    """Normalize raw text to lowercase ASCII letters separated by single spaces.

    The text is lowercased, every character that is not ``a-z`` or whitespace
    is deleted (punctuation, digits, non-ASCII letters), and runs of
    whitespace are collapsed to one space with the ends trimmed. Characters
    are deleted rather than replaced by a space, so ``"e-mail"`` becomes
    ``"email"``.

    Args:
        text: The raw string to clean. Non-string values (for example the
            None/NaN placeholders of missing DataFrame cells) are treated as
            empty text instead of raising.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise fail on .lower() with AttributeError.
        return ""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()


In [6]:

# Stopword removal and lemmatization
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Remove English stop words from ``text`` and lemmatize what remains.

    The input is split on whitespace, tokens found in ``stop_words`` are
    dropped, each surviving token is passed through ``lemmatizer.lemmatize``
    with its default arguments, and the results are joined with single spaces.

    Args:
        text: A whitespace-separated string of tokens.

    Returns:
        The filtered, lemmatized tokens joined by single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return " ".join(lemmatizer.lemmatize(token) for token in kept_tokens)


In [7]:

# Apply preprocessing
# Stage 1: normalize the raw text; stage 2: strip stop words and lemmatize the
# normalized text. Both stages are stored as new columns on df.
cleaned_series = df['text'].apply(clean_text)
df['cleaned_text'] = cleaned_series
df['processed_text'] = cleaned_series.apply(preprocess_text)
df


Unnamed: 0,text,label,cleaned_text,processed_text
0,I love machine learning!,positive,i love machine learning,love machine learning
1,NLP is amazing and very useful.,positive,nlp is amazing and very useful,nlp amazing useful
2,I hate spam emails.,negative,i hate spam emails,hate spam email
3,Machine learning is the future.,positive,machine learning is the future,machine learning future


In [8]:

# Label Encoding
# Fit the encoder on the label column, then map each label to its integer code.
label_column = df['label']
label_encoder = LabelEncoder().fit(label_column)
df['label_encoded'] = label_encoder.transform(label_column)
df[['label', 'label_encoded']]


Unnamed: 0,label,label_encoded
0,positive,1
1,positive,1
2,negative,0
3,positive,1


In [9]:

# TF-IDF Representation
# Fit a TF-IDF vectorizer (default settings) on the processed text and keep the
# resulting document-term matrix.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['processed_text'])

# Densify the matrix into a DataFrame with one column per vocabulary term.
# Reusing df.index keeps every TF-IDF row aligned with its source row even if
# df has been filtered or re-indexed upstream.
tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    index=df.index,
    columns=tfidf.get_feature_names_out(),
)

tfidf_df


Unnamed: 0,amazing,email,future,hate,learning,love,machine,nlp,spam,useful
0,0.0,0.0,0.0,0.0,0.526405,0.667679,0.526405,0.0,0.0,0.0
1,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.57735
2,0.0,0.57735,0.0,0.57735,0.0,0.0,0.0,0.0,0.57735,0.0
3,0.0,0.0,0.667679,0.0,0.526405,0.0,0.526405,0.0,0.0,0.0


In [10]:

# Save outputs
# Write the processed frame and the TF-IDF feature table as CSV files, leaving
# out the row index.
df.to_csv("processed_text_data.csv", index=False)
tfidf_df.to_csv("tfidf_features.csv", index=False)

# Serialize the fitted vectorizer so the same vocabulary and weights can be
# reused later. Only unpickle this file from a trusted location.
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

print("All outputs saved successfully.")


All outputs saved successfully.
