# Assignment 05.1: Text Preprocessing with Python  

**Objective:**  
To clean and preprocess raw text data, preparing it for further analysis or input into machine 
learning models. Students will use Python libraries such as NLTK, re, and Pandas to complete 
this task.

In [1]:
# import libs and packs:

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import defaultdict
import math

import chardet # for handeling utf-8 encoding err.

In [2]:
# finding the encoding: 
file_path = 'dataset/product_reviews.csv'

with open(file_path, 'rb') as f:
    result = chardet.detect(f.read())

detected_encoding = result['encoding']
print(f"Detected encoding: {detected_encoding}")

if detected_encoding:
    try:
        df = pd.read_csv(file_path, encoding=detected_encoding)
        print(f"Successfully read with detected encoding: {detected_encoding}")
    except Exception as e:
        print(f"Failed to read with detected encoding {detected_encoding}: {e}")
else:
    print("Could not detect encoding.")


# Load data:

#df = pd.read_csv('dataset/product_reviews.csv')
df = pd.read_csv('dataset/product_reviews.csv', encoding='Windows-1252')

Detected encoding: Windows-1252
Successfully read with detected encoding: Windows-1252


In [3]:
# Download necessary NLTK data (run once)
nltk.download('stopwords')

nltk.download('wordnet')

nltk.download('punkt') 

nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aarya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aarya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aarya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\aarya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Get English stopwords
stop_words = set(stopwords.words('english'))

# Create a new column for preprocessed text
df['Processed_Text'] = df['Review_Text'].copy()


In [5]:
# Perform the following text preprocessing tasks:
def preprocess_text(text):
    # Remove leading numbers and spaces (e.g., "1 The product...")
    text = re.sub(r'^\d+\s*', '', text)
    # Convert all text to lowercase.
    text = text.lower()
    # Remove punctuation, numbers, and special characters.
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Keep only letters and spaces
    return text

df['Processed_Text'] = df['Processed_Text'].apply(preprocess_text)

print("After initial cleaning (lowercase, punctuation, numbers removed):")
print(df[['Review_ID', 'Processed_Text']])

After initial cleaning (lowercase, punctuation, numbers removed):
    Review_ID                                     Processed_Text
0           1  the product is great loved it but its a bit pr...
1           2     worst product ever wouldnt recommend to anyone
2           3  satisfactory quality works as expected no majo...
3           4     amazing product i would buy it again and again
4           5      the delivery was slow but the product is good
5           6  horrible experience the product broke after ju...
6           7  great value for the price definitely worth buying
7           8  the product didnt meet my expectations returni...
8           9  im satisfied with the purchase but there are b...
9          10  superb product excellent build quality and gre...
10         11  the product is just okay nothing special but i...
11         12  fast delivery and product as described would b...
12         13  not worth the money the product feels cheap an...
13         14  the produ

In [6]:
# Tokenize the text into individual words.
df['Tokens'] = df['Processed_Text'].apply(nltk.word_tokenize)

print("After Tokenization:")
print(df[['Review_ID', 'Tokens']])


# Remove stopwords.
df['Tokens_NoStopwords'] = df['Tokens'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

print("After Stopword Removal:")
print(df[['Review_ID', 'Tokens_NoStopwords']])


After Tokenization:
    Review_ID                                             Tokens
0           1  [the, product, is, great, loved, it, but, its,...
1           2  [worst, product, ever, wouldnt, recommend, to,...
2           3  [satisfactory, quality, works, as, expected, n...
3           4  [amazing, product, i, would, buy, it, again, a...
4           5  [the, delivery, was, slow, but, the, product, ...
5           6  [horrible, experience, the, product, broke, af...
6           7  [great, value, for, the, price, definitely, wo...
7           8  [the, product, didnt, meet, my, expectations, ...
8           9  [im, satisfied, with, the, purchase, but, ther...
9          10  [superb, product, excellent, build, quality, a...
10         11  [the, product, is, just, okay, nothing, specia...
11         12  [fast, delivery, and, product, as, described, ...
12         13  [not, worth, the, money, the, product, feels, ...
13         14  [the, product, exceeded, my, expectations, fan...
14   

In [7]:
# Perform stemming and lemmatization.
df['Stemmed_Tokens'] = df['Tokens_NoStopwords'].apply(lambda tokens: [stemmer.stem(word) for word in tokens])
df['Lemmatized_Tokens'] = df['Tokens_NoStopwords'].apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])

print("After Stemming:")
print(df[['Review_ID', 'Stemmed_Tokens']])


print("After Lemmatization:")
print(df[['Review_ID', 'Lemmatized_Tokens']])


After Stemming:
    Review_ID                                     Stemmed_Tokens
0           1                [product, great, love, bit, pricey]
1           2  [worst, product, ever, wouldnt, recommend, anyon]
2           3  [satisfactori, qualiti, work, expect, major, i...
3           4                        [amaz, product, would, buy]
4           5                    [deliveri, slow, product, good]
5           6        [horribl, experi, product, broke, one, use]
6           7          [great, valu, price, definit, worth, buy]
7           8             [product, didnt, meet, expect, return]
8           9      [im, satisfi, purchas, better, option, avail]
9          10  [superb, product, excel, build, qualiti, great...
10         11     [product, okay, noth, special, get, job, done]
11         12     [fast, deliveri, product, describ, would, buy]
12         13       [worth, money, product, feel, cheap, flimsi]
13         14        [product, exceed, expect, fantast, perform]
14       

In [8]:
#For TF-IDF, it's common to use either stemmed or lemmatized tokens.
#here, we use lemmatized tokens for a more refined analysis.

# TF (Term Frequency) = 1 + log(C(w,d) + 1)
# IDF (Inverse Document Frequency) = log(N / df(w)) where N is total documents, df(w) is documents containing word w


print("Columns available before TF-IDF calculation:", df.columns) # Add this line

# For TF-IDF, it's common to use either stemmed or lemmatized tokens. Let's use lemmatized tokens for a more refined analysis.
df['Final_Tokens'] = df['Lemmatized_Tokens']


N = len(df) # Total number of documents (reviews)

df_counts = defaultdict(int)
for _, row in df.iterrows():
    unique_words_in_doc = set(row['Final_Tokens'])
    for word in unique_words_in_doc:
        df_counts[word] += 1


idf_scores = {}
for word, count in df_counts.items():
    idf_scores[word] = math.log(N / count)

# Calculate TF-IDF for each review
tfidf_list = []
for index, row in df.iterrows():
    tf_idf_doc = {}
    word_counts = defaultdict(int)
    for word in row['Final_Tokens']:
        word_counts[word] += 1

    for word, count in word_counts.items():
        tf = 1 + math.log(count + 1) # Using the specified TF formula
        tfidf = tf * idf_scores[word]
        tf_idf_doc[word] = tfidf
    tfidf_list.append(tf_idf_doc)

df['TF-IDF'] = tfidf_list



Columns available before TF-IDF calculation: Index(['Review_ID', 'Review_Text', 'Processed_Text', 'Tokens',
       'Tokens_NoStopwords', 'Stemmed_Tokens', 'Lemmatized_Tokens'],
      dtype='object')


In [9]:
print("Final DataFrame with Processed Text and TF-IDF scores:")
# Print the first few entries of TF-IDF for brevity
df_display = df[['Review_ID', 'Review_Text', 'Final_Tokens', 'TF-IDF']].copy()
df_display['TF-IDF'] = df_display['TF-IDF'].apply(lambda x: {k: round(v, 4) for k, v in list(x.items())[:5]}) # Show only top 5 TF-IDF for display
print(df_display)

Final DataFrame with Processed Text and TF-IDF scores:
    Review_ID                                        Review_Text  \
0           1  "The product is GREAT! Loved it, but it’s a bi...   
1           2  "Worst product ever!! Wouldn’t recommend to an...   
2           3  "Satisfactory quality, works as expected, no m...   
3           4  "Amazing product, I would buy it again and aga...   
4           5  "The delivery was slow, but the product is good."   
5           6  "Horrible experience, the product broke after ...   
6           7  "Great value for the price! Definitely worth b...   
7           8  "The product didn’t meet my expectations, retu...   
8           9  "I’m satisfied with the purchase, but there ar...   
9          10  "Superb product! Excellent build quality and g...   
10         11  "The product is just okay, nothing special, bu...   
11         12  "Fast delivery and product as described. Would...   
12         13  "Not worth the money. The product feels cheap 

## Summary of Key Outputs:

In [10]:
print("="*60)
print("SUMMARY OF TEXT PREPROCESSING AND TF-IDF ANALYSIS")
print("="*60)

# Display the Final DataFrame with preprocessed tokens
print("\nFinal DataFrame with Processed Tokens:")
print(df[['Review_ID', 'Review_Text', 'Final_Tokens']].head(10))
print("\n" + "="*50 + "\n")

# Display the TF-IDF scores for each review
# Limiting the display to a few key TF-IDF scores per review for readability
print("Final DataFrame with TF-IDF Scores (showing top 5 words per review):")
df_display = df[['Review_ID', 'Review_Text', 'Final_Tokens', 'TF-IDF']].copy()
df_display['TF-IDF'] = df_display['TF-IDF'].apply(lambda x: {k: round(v, 4) for k, v in list(x.items())[:5]})
print(df_display.head(10))

print("\n" + "="*60 + "\n")
print("Summary complete.")
print("="*60)

SUMMARY OF TEXT PREPROCESSING AND TF-IDF ANALYSIS

Final DataFrame with Processed Tokens:
   Review_ID                                        Review_Text  \
0          1  "The product is GREAT! Loved it, but it’s a bi...   
1          2  "Worst product ever!! Wouldn’t recommend to an...   
2          3  "Satisfactory quality, works as expected, no m...   
3          4  "Amazing product, I would buy it again and aga...   
4          5  "The delivery was slow, but the product is good."   
5          6  "Horrible experience, the product broke after ...   
6          7  "Great value for the price! Definitely worth b...   
7          8  "The product didn’t meet my expectations, retu...   
8          9  "I’m satisfied with the purchase, but there ar...   
9         10  "Superb product! Excellent build quality and g...   

                                        Final_Tokens  
0               [product, great, loved, bit, pricey]  
1  [worst, product, ever, wouldnt, recommend, any...  
2  [sat