In [13]:
!pip install nltk
import json
import re
import nltk
import math 
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\caste\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
#1.1
# Function to clean and normalize text fields
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\x00-\x7F]+', '', text) # Delete no ASCII character
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    text = re.sub(r'\s+', ' ', text) # Replace multiple spaces
    tokens = text.split() # Tokenize
    tokens = [word for word in tokens if word not in stop_words]
    stemmed = [stemmer.stem(word) for word in tokens] # Apply stemming

    return ' '.join(stemmed)


In [15]:
#1.2
with open('fashion_products_dataset.json', 'r', encoding='utf-8') as f:
    corpus = json.load(f)
# Text cleaning to title and description fields
for doc in corpus:
    doc['title_clean'] = clean_text(doc.get('title', ''))
    doc['description_clean'] = clean_text(doc.get('description', ''))

REQUIRED_FIELDS = [
    'pid', 'title', 'description', 'brand', 'category', 'sub_category',
    'product_details', 'seller', 'out_of_stock', 'selling_price',
    'discount', 'actual_price', 'average_rating', 'url'
]
# We ensure all required fields are present in each document
def ensure_fields(doc):
    for field in REQUIRED_FIELDS:
        if field not in doc:
            doc[field] = None
    return doc

# Apply field completion to the entire corpus
corpus = [ensure_fields(doc) for doc in corpus]



In [16]:
# 1.3 Creation of a new variable that aggrups multiple features
def build_metadata_text(doc):
    brand = doc.get('brand', '')
    category = doc.get('category', '')
    sub_category = doc.get('sub_category', '')
    seller = doc.get('seller', '')
    product_details = ' '.join(
        f"{k} {v}" for d in doc.get('product_details', []) for k, v in d.items()
    )
    return f"{brand} {category} {sub_category} {product_details} {seller}".lower()

for doc in corpus:
    doc['metadata_text'] = build_metadata_text(doc)

In [17]:
#1.4
def normalize_numeric_fields(doc):
    # Convert price and rating fields to numeric types
    try:
        doc['selling_price'] = float(doc['selling_price'].replace(',', '.'))
    except:
        doc['selling_price'] = None
    try:
        doc['actual_price'] = float(doc['actual_price'].replace(',', '.'))
    except:
        doc['actual_price'] = None
    try:
        doc['discount'] = int(doc['discount'].replace('% off', '').strip())
    except:
        doc['discount'] = None
    try:
        doc['average_rating'] = float(doc['average_rating'])
    except:
        doc['average_rating'] = None

    return doc

# Apply normalization to the entire corpus
corpus = [normalize_numeric_fields(doc) for doc in corpus]


**DELIVERABLE 2**

In [18]:
# 1.1 Build inverted index
inverted_index = {}

# Iterate through each document in the corpus
for doc in corpus:
    doc_id = doc['pid']  # Use product ID as the document ID
    # Combine cleaned title and description
    cleaned_text = doc.get('title_clean', '') + ' ' + doc.get('description_clean', '')
    # Split the cleaned text into terms
    terms = cleaned_text.split()

    # Add terms and document ID to the inverted index
    for term in terms:
        if term not in inverted_index:
            inverted_index[term] = []
        if doc_id not in inverted_index[term]:
            inverted_index[term].append(doc_id)

In [49]:

def query(query_terms, inverted_index):
    # Return documents that contain at least one query term
    matching_docs = set()
    for term in query_terms:
        if term in inverted_index:
            matching_docs.update(inverted_index[term])
    return list(matching_docs)



In [20]:
# Retrieve and print the documents that match the query
def print_matching_documents(features):
  print("\nMatching documents with features", features, ":")
  matching_doc_ids = query(features, inverted_index)
  for docid in matching_doc_ids:
      # Find the document in the corpus list that matches the docid (pid)
      for doc in corpus:
          if doc.get('pid') == docid:
              print(doc)
              break # Found the document, move to the next docid
  if not matching_doc_ids:
      print("No matching documents found.")
#Example queries:
features1 = ['jean','red','jacket']
features2 = ['pink','stripe','shirt']
features3 = ['sock','stripe','grey']
features4 = ['sport','shoe','red']
features5 = ['blue','red','yellow']
print_matching_documents(features1)
print_matching_documents(features2)
print_matching_documents(features3)
print_matching_documents(features4)
print_matching_documents(features5)


Matching documents with features ['jean', 'red', 'jacket'] :
{'_id': 'f1ce2f9e-ba05-507c-9bc2-136717d8e249', 'actual_price': 1.398, 'average_rating': 3.4, 'brand': 'ModeWa', 'category': 'Clothing and Accessories', 'crawled_at': 1612991478000, 'description': 'Mark your solid impressions with ours ultra-soft, t-shirt. Wear this t-shirt. Place in many ways with dark jeans, cuffed chinos. Its 100% soft cotton, solid color. Machine wash t-shirt, the material moves in every direction. Looks cool in every type of jacket so you look cool in winters as well. Its super dry and ultra-fashionable hooded must have t-shirt in your wardrobe.', 'discount': 64, 'images': ['https://rukminim1.flixcart.com/image/128/128/kdhphu80/t-shirt/6/i/t/xl-yt-tshrt-135-ytrick-original-imafudu6ehngehfk.jpeg?q=70', 'https://rukminim1.flixcart.com/image/128/128/kdhphu80-0/t-shirt/7/z/0/l-yt-tshrt-48-ytrick-original-imafuduyhvmgzy3z.jpeg?q=70', 'https://rukminim1.flixcart.com/image/128/128/kdhphu80-0/t-shirt/b/h/d/m-yt

In [None]:
from collections import Counter
import math
#1.3
# Calculate TF for each term in each document
tf_scores = {}
for doc in corpus:
    doc_id = doc['pid']
    cleaned_text = doc.get('title_clean', '') + ' ' + doc.get('description_clean', '')
    terms = cleaned_text.split()
    term_counts = Counter(terms)
    total_terms = len(terms)
    tf_scores[doc_id] = {term: count / total_terms for term, count in term_counts.items()}

#Calculate IDF for each term in the corpus
idf_scores = {}
total_documents = len(corpus)
all_terms = set(term for doc_id, doc_tf in tf_scores.items() for term in doc_tf.keys())

for term in all_terms:
    doc_count = sum(1 for doc in corpus if term in (doc.get('title_clean', '') + ' ' + doc.get('description_clean', '')).split())
    idf_scores[term] = math.log(total_documents / doc_count)

#Calculate TF-IDF scores
tfidf_scores = {}
for doc_id, doc_tf in tf_scores.items():
    tfidf_scores[doc_id] = {term: tf * idf_scores.get(term, 0) for term, tf in doc_tf.items()}

#Function to rank documents based on TF-IDF scores for a given query
def rank_documents_tfidf(query_terms, inverted_index, tfidf_scores):
    """
    Ranks documents based on the sum of TF-IDF scores of the query terms.

    Args:
        query_terms (list): A list of terms (words) to query.
        inverted_index (dict): The inverted index mapping terms to document IDs.
        tfidf_scores (dict): A dictionary of TF-IDF scores for each term in each document.

    Returns:
        list: A list of document IDs sorted by their relevance (sum of TF-IDF scores).
    """
    # Get the initial set of matching documents using the conjunctive query
    matching_doc_ids = query(query_terms, inverted_index)

    # Calculate relevance score for each matching document
    relevance_scores = {}
    for doc_id in matching_doc_ids:
        score = sum(tfidf_scores[doc_id].get(term, 0) for term in query_terms)
        relevance_scores[doc_id] = score

    # Sort documents by relevance score in descending order
    ranked_docs = sorted(relevance_scores.items(), key=lambda item: item[1], reverse=True)

    # Return both the ranked document IDs and the relevance scores
    return [doc_id for doc_id, score in ranked_docs], relevance_scores

# Function to print ranked documents
def print_ranked_documents(features, inverted_index, tfidf_scores, corpus, top_k=10):
    print("\nRanked documents for features", features, ":")
    ranked_results, relevance_scores = rank_documents_tfidf(features, inverted_index, tfidf_scores)
    if ranked_results:
        # Only keep top_k results
        for docid in ranked_results[:top_k]:
            for doc in corpus:
                if doc.get('pid') == docid:
                    print(f"Doc ID: {docid}, Score: {relevance_scores[docid]:.4f}, Title: {doc.get('title', 'N/A')}")
                    break
    else:
        print("No matching documents found.")

In [None]:
# Example usage with the previous queries:
features_obl1 = ['woman', 'full', 'sleeve', 'sweatshirt', 'cotton']
features_obl2 = [ 'slim', 'jeans', 'blue']
features1 = ['jean','red','jacket']
features2 = ['pink','stripe','shirt']
features3 = ['sock','stripe','grey']
features4 = ['sport','shoe','red']
features5 = ['blue','red','yellow']

print_ranked_documents(features_obl1, inverted_index, tfidf_scores, corpus)
print_ranked_documents(features_obl2, inverted_index, tfidf_scores, corpus)
print_ranked_documents(features1, inverted_index, tfidf_scores, corpus)
print_ranked_documents(features2, inverted_index, tfidf_scores, corpus)
print_ranked_documents(features3, inverted_index, tfidf_scores, corpus)
print_ranked_documents(features4, inverted_index, tfidf_scores, corpus)
print_ranked_documents(features5, inverted_index, tfidf_scores, corpus)


Ranked documents for features ['woman', 'full', 'sleeve', 'sweatshirt', 'cotton'] :
Doc ID: SWSFZVTNGM4HG8BC, Score: 1.3451, Title: Full Sleeve Printed Women Sweatshirt
Doc ID: SWSFZVTTQCB4SJ7F, Score: 1.3451, Title: Full Sleeve Solid Women Sweatshirt
Doc ID: SWSFZZ3XH5PG9VZ4, Score: 1.3451, Title: Full Sleeve Solid Women Sweatshirt
Doc ID: SWSFGNK2VZBAM7GG, Score: 1.2712, Title: Full Sleeve Embroidered Men Sweatshirt
Doc ID: SWSFJFV4ZKWUX2FP, Score: 1.2712, Title: Full Sleeve Solid Men Sweatshirt
Doc ID: SWSFTEMFY8PWDWJQ, Score: 1.0170, Title: Full Sleeve Solid Women Sweatshirt
Doc ID: SWSFUY89VCRXH8UD, Score: 1.0170, Title: Full Sleeve Printed Women Sweatshirt
Doc ID: SWSF9W4TBFMF3GZA, Score: 1.0170, Title: Full Sleeve Striped Women Sweatshirt
Doc ID: SWSFGRFGWRX8HZBQ, Score: 1.0170, Title: Full Sleeve Solid Women Sweatshirt
Doc ID: SWSF8922HHDGB7CH, Score: 1.0170, Title: Full Sleeve Striped Men Sweatshirt

Ranked documents for features ['slim', 'jeans', 'blue'] :


TypeError: 'NoneType' object does not support item assignment

In [None]:
# PART 2 
validation = pd.read_csv("validation_labels.csv")
validation.columns = [c.lower() for c in validation.columns]
validation['doc_id'] = validation['doc_id'].astype(str)
validation['query'] = validation['query'].astype(str)

ranked_results = {
    "1": ["PID123", "PID998", "PID321", "PID650"],  # women full sleeve sweatshirt cotton
    "2": ["PID500", "PID444", "PID321"],            # men slim jeans 

    "3": ["PID100", "PID200", "PID300"],
    "4": ["PID400", "PID500"],
    "5": ["PID600", "PID700"],
    "6": ["PID888", "PID777"],
    "7": ["PID910", "PID920", "PID930"],
}

for q_id, docs in ranked_results.items():
    evaluate_query(q_id, docs)


KeyError: 'doc_id'

In [None]:
# 2.1
def precision_at_k(retrieved, relevant, k):
    retrieved_k = retrieved[:k]
    return sum(1 for doc in retrieved_k if doc in relevant) / k


def recall_at_k(retrieved, relevant, k):
    retrieved_k = retrieved[:k]
    return sum(1 for doc in retrieved_k if doc in relevant) / len(relevant)


def average_precision_at_k(retrieved, relevant, k):
    retrieved_k = retrieved[:k]
    score = 0
    rel_count = 0

    for i, doc in enumerate(retrieved_k, start=1):
        if doc in relevant:
            rel_count += 1
            score += rel_count / i

    return score / len(relevant) if len(relevant) > 0 else 0


def f1_at_k(retrieved, relevant, k):
    p = precision_at_k(retrieved, relevant, k)
    r = recall_at_k(retrieved, relevant, k)
    return (2 * p * r) / (p + r) if (p + r) > 0 else 0


def reciprocal_rank(retrieved, relevant):
    for i, doc in enumerate(retrieved, start=1):
        if doc in relevant:
            return 1 / i
    return 0


def dcg_at_k(retrieved, relevant, k):
    retrieved_k = retrieved[:k]
    dcg = 0
    for i, doc in enumerate(retrieved_k, start=1):
        rel = 1 if doc in relevant else 0
        dcg += rel / math.log2(i + 1)
    return dcg


def ndcg_at_k(retrieved, relevant, k):
    ideal = [1] * len(relevant)
    idcg = sum(rel / math.log2(i + 1) for i, rel in enumerate(ideal[:k], start=1))
    return dcg_at_k(retrieved, relevant, k) / idcg if idcg > 0 else 0


In [None]:
def evaluate_query(query_id, retrieved_docs, K=[1, 3, 5, 10]):
    relevant_docs = validation[validation["query"] == str(query_id)]
    relevant_docs = relevant_docs[relevant_docs["relevance"] == 1]["doc_id"].tolist()

    print(f"\n===== Query {query_id} Evaluation =====")
    print(f"Relevant documents: {relevant_docs}")

    for k in K:
        P = precision_at_k(retrieved_docs, relevant_docs, k)
        R = recall_at_k(retrieved_docs, relevant_docs, k)
        AP = average_precision_at_k(retrieved_docs, relevant_docs, k)
        F1 = f1_at_k(retrieved_docs, relevant_docs, k)
        NDCG = ndcg_at_k(retrieved_docs, relevant_docs, k)

        print(f"@{k} →  P: {P:.3f}   R: {R:.3f}   AP: {AP:.3f}   F1: {F1:.3f}   NDCG: {NDCG:.3f}")

    RR = reciprocal_rank(retrieved_docs, relevant_docs)
    print(f"MRR: {RR:.3f}")