In [12]:
!pip install nltk wordcloud

import json
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# Fetch the NLTK stop-word corpus; it must be available locally before
# stopwords.words() is called.
nltk.download('stopwords')

# Shared text-normalization resources: a Porter stemmer and the English
# stop-word list held as a set for fast membership checks.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
#1.1
# Function to clean and normalize text fields
def clean_text(text):
    """Normalize a free-text field into a space-joined string of stemmed tokens.

    Processing order: lowercase, drop non-ASCII characters, drop punctuation
    (anything that is neither a word character nor whitespace), split on
    whitespace, discard tokens found in the module-level ``stop_words`` set,
    and stem the remaining tokens with the module-level ``stemmer``.

    Args:
        text: Raw field value. ``None`` is treated as empty text; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``''`` when no
        token survives.
    """
    # Records may carry null or non-string values; the original crashed on
    # `.lower()` for those.
    if text is None:
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    # split() with no argument already collapses runs of whitespace, so no
    # separate whitespace-squeezing pass is needed before tokenizing.
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(stemmer.stem(word) for word in tokens)


In [14]:
#1.2
# Load the raw product corpus; it is iterated below as a sequence of
# dict-like records.
with open('fashion_products_dataset.json', 'r', encoding='utf-8') as f:
    corpus = json.load(f)

# Store cleaned copies of the title and description next to the raw fields.
# `doc.get(key) or ''` is used instead of `doc.get(key, '')` because the
# default only applies when the key is missing; a key that is present with a
# null value would otherwise reach clean_text as None.
for doc in corpus:
    doc['title_clean'] = clean_text(doc.get('title') or '')
    doc['description_clean'] = clean_text(doc.get('description') or '')

# Keys that every document must expose after field completion.
REQUIRED_FIELDS = [
    'pid',
    'title',
    'description',
    'brand',
    'category',
    'sub_category',
    'product_details',
    'seller',
    'out_of_stock',
    'selling_price',
    'discount',
    'actual_price',
    'average_rating',
    'url',
]


def ensure_fields(doc):
    """Fill every missing required key of ``doc`` with ``None``.

    Keys that already exist keep their current value. The dict is modified
    in place and the same object is returned.
    """
    for name in REQUIRED_FIELDS:
        doc.setdefault(name, None)
    return doc

# Run ensure_fields over every document so each one carries all required keys
corpus = list(map(ensure_fields, corpus))



In [15]:
#1.3
def _parse_price(value):
    """Return a price as ``float``, or ``None`` when it cannot be parsed."""
    if isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        # Already numeric (e.g. the cell was re-run): keep the value.
        return float(value)
    if not isinstance(value, str):
        return None
    try:
        # NOTE(review): commas are removed as thousands separators, so
        # "2,999" becomes 2999.0. The original replaced ',' with '.', which
        # read "2,999" as 2.999. This assumes the dataset never uses ',' as a
        # decimal mark -- confirm against the raw JSON.
        return float(value.replace(',', ''))
    except ValueError:
        return None


def _parse_discount(value):
    """Return a discount such as ``'69% off'`` as ``int``, else ``None``."""
    if isinstance(value, bool):
        return None
    if isinstance(value, int):
        # Already numeric (e.g. the cell was re-run): keep the value.
        return value
    if not isinstance(value, str):
        return None
    try:
        return int(value.replace('% off', '').strip())
    except ValueError:
        return None


def _parse_rating(value):
    """Return a rating as ``float``, or ``None`` when it cannot be parsed."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def normalize_numeric_fields(doc):
    """Convert the price, discount and rating fields of ``doc`` to numbers.

    ``selling_price`` and ``actual_price`` become ``float``, ``discount``
    becomes ``int`` (the ``'% off'`` suffix is stripped) and
    ``average_rating`` becomes ``float``. A field that is missing or cannot
    be parsed is set to ``None``. Values that are already numeric are kept,
    so applying the function twice gives the same result as applying it once.

    ``out_of_stock`` is not modified by this function.

    Args:
        doc: A product record (dict). It is modified in place.

    Returns:
        dict: The same ``doc`` object.
    """
    doc['selling_price'] = _parse_price(doc.get('selling_price'))
    doc['actual_price'] = _parse_price(doc.get('actual_price'))
    doc['discount'] = _parse_discount(doc.get('discount'))
    doc['average_rating'] = _parse_rating(doc.get('average_rating'))
    return doc

# Run numeric normalization over every document in the corpus
corpus = list(map(normalize_numeric_fields, corpus))
