In [None]:
!pip install openpyxl


In [None]:
import requests
import zipfile
import io
import os
import re
import spacy
import numpy as np
import pandas as pd

# Load spaCy model
# Module-level pipeline; tokenize_lemmatize() below calls it for every description.
# NOTE(review): assumes the "en_core_web_sm" model is already installed in the
# kernel environment (no install step is visible in this notebook) — confirm.
nlp = spacy.load("en_core_web_sm")

In [None]:
####    General Functions   ####

# 1- Function for showing the dataframe characteristics

def data_details(df, n=5):
    """Print a quick overview of a DataFrame.

    Shows, in order: the shape, the first ``n`` rows, the ``df.info`` report
    (with deep memory usage), the per-column null counts and the summary
    statistics of the object-typed (categorical) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    n : int, default 5
        Number of leading rows to display.

    Returns
    -------
    None
        Everything is written to the cell output.

    Notes
    -----
    Relies on ``display`` being available as a global/builtin name, as it is
    inside an IPython/Jupyter session.
    """
    print("\n")
    print("Shape:")
    print(df.shape)
    print("\n")

    print("\n The Head")
    display(df.head(n))
    print("\n")

    print("\n Info:")
    # df.info() writes its report itself and returns None, so wrapping it in
    # print() would append a stray "None" line to the output.
    df.info(memory_usage="deep")
    print("\n")

    print("\n The Null Values:")
    print(df.isnull().sum())
    print("\n")

    print("\nSummary statistics (categorical):")  # As our needed features are categorical
    categorical_cols = df.select_dtypes(include=[object]).columns
    if len(categorical_cols) > 0:
        display(df.describe(include=[object]))
    else:
        print("No categorical columns found.")
    # Trailing spacer after the section in both branches, like the sections above.
    print("\n")

# 2- Function for cleaning the DataFrame (Normalization)

def clean_text(text, mode="input", lowercase=True):
    """Normalise one raw text value from the product datasets.

    Parameters
    ----------
    text : object
        Raw cell value. Missing values (``pd.isna``) yield ``""``; any other
        non-string value is converted with ``str()`` before cleaning.
    mode : str, default "input"
        ``"description"`` cleans the target column (keeps letters, digits,
        spaces, ``&`` and ``-``). Any other value cleans a feature column
        (product_name, brand, category, sub_category): ``&`` is padded with
        spaces, camelCase is split, and ``|``, ``/`` and ``>`` are all
        rewritten to the `` > `` separator.
    lowercase : bool, default True
        Lowercase the text first. When False, uppercase letters are kept.

    Returns
    -------
    str
        Cleaned text with single spaces and no leading/trailing spaces or
        ``>`` separators.
    """
    if pd.isna(text):
        return ""

    # Numeric or other non-string cells would otherwise fail on .lower() / re.sub.
    text = str(text)

    if lowercase:
        text = text.lower()

    text = re.sub(r"<.*?>", " ", text)  # Remove HTML tags
    text = re.sub(r"(https?://\S+|www\.\S+|ftp://\S+)", " ", text)  # Remove URLs

    # Letters allowed to survive the filters below; capitals only when the
    # caller asked to keep the original casing.
    letters = "a-z" if lowercase else "a-zA-Z"

    if mode == "description":
        # For descriptions: keep only letters, numbers, spaces, & and -
        text = re.sub(rf"[^{letters}0-9\s&-]", " ", text)
    else:
        text = re.sub(r"&", " & ", text)  # Add spaces around &
        text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)  # Split camelCase or PascalCase
        text = re.sub(r"(\||/|>)", " > ", text)  # Normalize separators (|, /, >) to " > "
        # Keep allowed chars: letters, digits, spaces, &, -, > and '
        text = re.sub(rf"[^{letters}0-9\s&'\->]", " ", text)

    text = re.sub(r"\s+", " ", text).strip()  # Normalize multiple spaces to single space
    # After whitespace normalisation consecutive separators look like "> >",
    # so collapse runs of ">" that are separated by a single space.
    text = re.sub(r">(?: >)+", ">", text)
    text = text.strip(" >")  # Remove leading/trailing separators

    return text

# 3- function to apply tokenization - lemmatization - stopword/punctuation removal
# and keep the original casing for product names and brand

def tokenize_lemmatize(text, product_name=None, brand=None):
    """Tokenize ``text`` with the module-level spaCy pipeline and lemmatize it.

    Tokens whose text equals one of the whitespace-separated parts of
    ``product_name`` or ``brand`` are kept verbatim (original casing). Every
    other token is kept, as its lemma, only when it is alphabetic and not a
    stop word.

    Returns a list of strings; an empty list when ``text`` is empty/falsy.
    """
    if not text:
        return []

    # All parts of the product name and brand that must survive untouched.
    protected = {
        piece
        for name in (product_name, brand)
        if name
        for piece in name.split()
    }

    lemmas = []
    for tok in nlp(text):
        if tok.text in protected:
            lemmas.append(tok.text)  # keep original casing
            continue
        if tok.is_stop or not tok.is_alpha:
            continue  # drop stop words, punctuation and numbers
        lemmas.append(tok.lemma_)
    return lemmas



# 4- Function to apply clean_text and tokenize_lemmatize to our DataFrame (data preprocessing step)
 
def _restore_original_casing(description, *names):
    """Rewrite lowercase mentions of each name token in ``description`` with the token's own casing."""
    for name in names:
        if not name:
            continue
        for token in name.split():
            lowered = token.lower()
            if lowered == token:
                continue  # already lowercase: the substitution would change nothing
            pattern = r"\b" + re.escape(lowered) + r"\b"
            # A function replacement inserts the token literally (no template escapes).
            description = re.sub(pattern, lambda _match, original=token: original, description)
    return description


def preprocess_dataset_clean_only(df, for_model=False):
    """Build a cleaned copy of the combined product dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``product_name``, ``brand``, ``category``,
        ``sub_category`` and ``description``.
    for_model : bool, default False
        When True, the cleaned description is replaced by a
        ``description_tokens`` column produced by ``tokenize_lemmatize``.

    Returns
    -------
    pandas.DataFrame
        New frame with ``clean_product_name`` and ``clean_brand`` (original
        casing kept), ``clean_category`` and ``clean_sub_category``
        (lowercased) and either ``clean_description`` or
        ``description_tokens``. Rows with duplicate or empty cleaned
        descriptions are dropped and the index is reset to 0..n-1.

    Notes
    -----
    The result is built positionally, so the labels of ``df``'s index (even
    duplicated ones) do not matter.
    """
    clean_df = pd.DataFrame({
        # product_name and brand keep their uppercase letters
        "clean_product_name": [clean_text(value, mode="input", lowercase=False) for value in df["product_name"]],
        "clean_brand": [clean_text(value, mode="input", lowercase=False) for value in df["brand"]],
        # category and sub_category are lowercased
        "clean_category": [clean_text(value, mode="input", lowercase=True) for value in df["category"]],
        "clean_sub_category": [clean_text(value, mode="input", lowercase=True) for value in df["sub_category"]],
        "clean_description": [clean_text(value, mode="description", lowercase=True) for value in df["description"]],
    })

    # Replace lowercase product/brand mentions inside the description with
    # their original casing (product name tokens first, then brand tokens).
    clean_df["clean_description"] = [
        _restore_original_casing(description, product_name, brand)
        for description, product_name, brand in zip(
            clean_df["clean_description"],
            clean_df["clean_product_name"],
            clean_df["clean_brand"],
        )
    ]

    # Drop duplicates and empty descriptions
    clean_df = clean_df.drop_duplicates(subset=["clean_description"])
    clean_df = clean_df[clean_df["clean_description"] != ""].reset_index(drop=True)

    # If preparing for a model trained from scratch: add tokenization + lemmatization
    if for_model:
        token_lists = [
            tokenize_lemmatize(description, product_name, brand)
            for description, product_name, brand in zip(
                clean_df["clean_description"],
                clean_df["clean_product_name"],
                clean_df["clean_brand"],
            )
        ]
        # An explicit object Series stores each token list as a single cell.
        clean_df["description_tokens"] = pd.Series(token_lists, index=clean_df.index, dtype=object)
        clean_df = clean_df.drop(columns=["clean_description"])

    return clean_df

In [None]:
### Download Data From Github

zip_url = 'https://raw.githubusercontent.com/Eng-Shady-Hub/Generative_AI_Project_Round3/refs/heads/main/All_Datasets2.zip'
# Without a timeout the request can block forever if the server stalls;
# the tuple is (connect timeout, read timeout) in seconds.
response = requests.get(zip_url, timeout=(10, 120))
# Fail loudly on HTTP errors instead of trying to unzip an error page.
response.raise_for_status()

In [None]:
# Read every CSV member of the downloaded archive into `dataframes`,
# keyed "df1", "df2", ... in the order the archive lists them.
dataframes = {}

# The ZIP is opened straight from the downloaded bytes (nothing is written to disk).
with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
    csv_members = [member for member in archive.namelist() if member.lower().endswith(".csv")]

    if csv_members:
        for position, member in enumerate(csv_members, start=1):
            frame_key = f"df{position}"
            try:
                with archive.open(member) as handle:
                    # UTF-8 first; rewind and retry as latin1 if decoding fails
                    try:
                        frame = pd.read_csv(handle, encoding='utf-8')
                    except UnicodeDecodeError:
                        handle.seek(0)
                        frame = pd.read_csv(handle, encoding='latin1')
                    dataframes[frame_key] = frame

                    print(f'DataFrame "{frame_key}" created from file: {member} (shape: {frame.shape})')

            except Exception as error:
                # A member that cannot be read is reported and skipped; its key stays unset.
                print(f"Error reading {member}: {error}")
    else:
        print("No CSV files found in the ZIP.")

In [None]:
# DataFrame 1
# "df1" is the key the ZIP-loading cell assigned to the first CSV member of the archive.

basket_data = dataframes["df1"]
data_details(basket_data)

In [None]:
# Target column: description
# Feature columns: product_name, brand, category, sub_category

selected_columns = ["product", "brand", "category", "sub_category", "description"]
basket_data = basket_data.loc[:, selected_columns].rename(columns={"product": "product_name"})

print(basket_data.columns)

In [None]:
# product_name, brand and description contain null values: drop those rows

required_columns = ["description", "product_name", "brand"]
basket_data = basket_data.dropna(subset=required_columns)
data_details(basket_data)

In [None]:
# DataFrame 2
# "df2" is the second CSV member of the archive, as keyed by the ZIP-loading cell.

adidas_data = dataframes["df2"]
data_details(adidas_data)

In [None]:
# Unify column names with the other datasets
adidas_columns = {"Product Name": "product_name", "Brand": "brand", "Description": "description"}
adidas_data = adidas_data[list(adidas_columns)].rename(columns=adidas_columns)
adidas_data.info()

# Keep only rows with a real description: drop nulls and the "No description" placeholder

has_description = adidas_data["description"].notna()
is_placeholder = adidas_data["description"].eq('No description')
adidas_data = adidas_data[has_description & ~is_placeholder]
adidas_data.isnull().sum()

In [None]:
# This dataset has good descriptions for our model but no category / sub_category columns,
# so both are derived from the product_name column.
#
# category_map: keyword -> (category, sub_category).
# categorize_product() tests each keyword as a lowercase substring of the product name,
# in the order the keys are written here, and returns the first hit, so earlier keys win.
# Note that the sub_category casing is mixed (e.g. "shoes" vs "Football Shoes").

category_map = {
    # category = footwear
  "shoe": ("footwear", "shoes"),"sneaker": ("footwear", "shoes"),"running": ("footwear", "running shoes"),"trainer": ("footwear", "trainers"),"cleat": ("footwear", "cleats"),
    "slipper": ("footwear", "slippers"),"flip flop": ("footwear", "flip flops"),"jordan": ("footwear", "basketball shoes"),"retro": ("footwear", "shoes"),
    "phantom": ("footwear", "cleats"),"venom": ("footwear", "cleats"),"mercurial": ("footwear", "soccer shoes"),"superfly": ("footwear", "soccer shoes"),
    "tf": ("footwear", "turf soccer shoes"),"air max": ("footwear", "sneakers"),"p-6000": ("footwear", "running shoes"),"sandal": ("footwear", "sandals"),
    "slide": ("footwear", "slides"),"adilette": ("footwear", "slides"),"flipflop": ("footwear", "flip flops"),"sb": ("footwear", "skate shoes"),"skate": ("footwear", "skate shoes"),
    "chron": ("footwear", "skate shoes"),"kd": ("footwear", "basketball shoes"),"kyrie": ("footwear", "basketball shoes"),"iconclash": ("footwear", "running shoes"),
    "daybreak": ("footwear", "sneakers"),"blazer": ("footwear", "sneakers"),"prelove": ("footwear", "sneakers"),"pegasus": ("footwear", "running shoes"),
    "vaporfly": ("footwear", "running shoes"),"zoomx": ("footwear", "running shoes"),"slipon": ("footwear", "slip-ons"),"airforce": ("footwear", "sneakers"),
    "airmax": ("footwear", "sneakers"),"metcon": ("footwear", "training shoes"),"court": ("footwear", "tennis shoes"),"pg": ("footwear", "basketball shoes"),
    "m2k": ("footwear", "sneakers"),"winflo": ("footwear", "running shoes"),"vomero": ("footwear", "running shoes"),"vapormax": ("footwear", "lifestyle sneakers"),
    "flip-flop": ("footwear", "flip flops"),"flip-flops": ("footwear", "flip flops"),"slip-on": ("footwear", "slip-ons"), "slip-ons": ("footwear", "slip-ons"),
    "odyssey react": ("footwear", "running shoes"),"legend react": ("footwear", "running shoes"),"pre-love": ("footwear", "sneakers"),"air force": ("footwear", "sneakers"),
    "drop-type": ("footwear", "running shoes"),"zoom rival fly": ("footwear", "running shoes"),"mx-720-818": ("footwear", "running shoes"),"tanjun": ("footwear", "running shoes"),
    "superstar": ("footwear", "sneakers"),"slip on": ("footwear", "slip-ons"),"lebron soldier": ("footwear", "basketball shoes"),"react element": ("footwear", "running shoes"),
    "free rn": ("footwear", "running shoes"),"zoom fly": ("footwear", "running shoes"),"zoom rise": ("footwear", "running shoes"),"tiempo legend": ("footwear", "soccer shoes"),
    "flex rn": ("footwear", "running shoes"),"air zoom structure": ("footwear", "running shoes"),"sfb gen 2": ("footwear", "boots"),"air huarache": ("footwear", "sneakers"),
    "wildhorse": ("footwear", "running shoes"),"benassi": ("footwear", "slides"),"terra kiger": ("footwear", "running shoes"),"classic cortez": ("footwear", "sneakers"),
    "renew run": ("footwear", "running shoes"),"free tr": ("footwear", "training shoes"),"lebron": ("footwear", "basketball shoes"),"mowabb": ("footwear", "sneakers"),
    "revolution": ("footwear", "running shoes"),"precision": ("footwear", "basketball shoes"),"shox": ("footwear", "running shoes"),"potential": ("footwear", "basketball shoes"),
    "epic react": ("footwear", "running shoes"), "react city": ("footwear", "running shoes"),"kawa": ("footwear", "slides"),"joyride run": ("footwear", "running shoes"),
    "joyride optik": ("footwear", "running shoes"),"flex contact": ("footwear", "running shoes"),"football": ("footwear", "Football Shoes"),"predator": ("footwear", "Football Shoes"),
    "vandalised": ("footwear", "Casual Shoes"),"canyon": ("footwear", "Casual Shoes"),"react": ("footwear", "Running Shoes"),"acg": ("footwear", "Outdoor Shoes"),
    "flex": ("footwear", "Training Shoes"),"signal": ("footwear", "Running Shoes"),"joyride": ("footwear", "Running Shoes"),"cortez": ("footwear", "Casual Shoes"),
    "hawkins": ("footwear", "Casual Shoes"),"nemeziz": ("footwear", "Football Shoes"),"indoor": ("footwear", "Indoor Shoes"),"outdoor": ("footwear", "Outdoor Shoes"),
    "trail": ("footwear", "Outdoor Shoes"),"superrep": ("footwear", "Training Shoes"),"zoom": ("footwear", "Running Shoes"),"tr": ("footwear", "Training Shoes"),
    "renew": ("footwear", "Running Shoes"),"ghost": ("footwear", "Running Shoes"),"racer": ("footwear", "Running Shoes"),"alphadunk": ("footwear", "Basketball Shoes"),
    "monarch": ("footwear", "Walking Shoes"),"af-1": ("footwear", "Casual Shoes"),"bella": ("footwear", "Casual Shoes"), "huarache": ("footwear", "Lifestyle Shoes"),
    "solarsoft": ("footwear", "Training Shoes"),"exp-x14": ("footwear", "Running Shoes"),"fly.by": ("footwear", "Basketball Shoes"),"xarr": ("footwear", "Training Shoes"),
    "skarn": ("footwear", "Casual Shoes"),"tailwind": ("footwear", "Running Shoes"), "air dsvm": ("footwear", "Running Shoes"),
    # category = accessories
    "sock": ("accessories", "socks"), "cap": ("accessories", "cap"),"hat": ("accessories", "cap"),"bag": ("accessories", "bag"),"backpack": ("accessories", "bag"),
    "watch": ("accessories", "watch")
    }
    
def categorize_product(name):
    """Return the ``(category, sub_category)`` pair for a product name.

    The name is converted to a lowercase string and the keywords of
    ``category_map`` are tried in their definition order; the pair of the
    first keyword found as a substring is returned. When no keyword occurs,
    the fallback ``("Other", "Other")`` is returned.
    """
    lowered = str(name).lower()
    matches = (
        (cat, subcat)
        for keyword, (cat, subcat) in category_map.items()
        if keyword in lowered
    )
    # First hit wins; fallback if no keyword found
    return next(matches, ("Other", "Other"))

# Categorise every product name once, then attach both labels with assign().
# assign() returns a new frame, so nothing is written into the filtered
# adidas_data selection (no SettingWithCopyWarning), and no per-row
# pd.Series has to be built.
adidas_labels = [categorize_product(name) for name in adidas_data["product_name"]]
adidas_data = adidas_data.assign(
    category=[category for category, _ in adidas_labels],
    sub_category=[sub_category for _, sub_category in adidas_labels],
)

In [None]:
# Put the columns in the same order as in all other datasets

column_order = ["product_name", "brand", "category", "sub_category", "description"]
adidas_data = adidas_data.loc[:, column_order]
print(adidas_data.columns)
data_details(adidas_data, n=20)

In [None]:
# DataFrame 3
# "df3" is the third CSV member of the archive, as keyed by the ZIP-loading cell.

amazon_data =dataframes["df3"] 
data_details(amazon_data)

In [None]:
# There are no null values in the dataset features we need.
# The Amazon dataset has no brand column; the first word of product_name is the brand,
# so we create a function that derives the brand column from the product_name column.

def map_brand(name):
    """Return the first whitespace-separated word of a product name as its brand.

    Parameters
    ----------
    name : object
        Product name; expected to be a string.

    Returns
    -------
    str or None
        The first word, or ``None`` when ``name`` is not a string (e.g. a
        missing value) or contains no words (empty / whitespace only).
    """
    if not isinstance(name, str):
        return None
    # Only the first word is needed, so split at most once.
    words = name.split(maxsplit=1)
    return words[0] if words else None

# Brand: derived from each product name with map_brand
amazon_data['brand'] = amazon_data['product_name'].map(map_brand)

# The Amazon dataset has no sub_category column; its category values are "|"-separated paths,
# so the most specific level (the last part) is used as the sub_category.

amazon_data['sub_category'] = amazon_data['category'].map(lambda path: path.rsplit('|', 1)[-1])

amazon_data.head()

In [None]:
# Keep the shared columns, in the same order as in all other datasets;
# about_product becomes the description column.

amazon_columns = ["product_name", "brand", "category", "sub_category", "about_product"]
amazon_data = amazon_data[amazon_columns].rename(columns={"about_product": "description"})
print(amazon_data.columns)
data_details(amazon_data)

In [None]:
# DataFrame 4
# "df4" is the fourth CSV member of the archive, as keyed by the ZIP-loading cell.
 
flipkart_data = dataframes["df4"]
data_details(flipkart_data)

In [None]:
# There are nulls in the description and brand columns.
# Drop the rows whose description is missing or is the "No description" placeholder.

described = flipkart_data["description"].notna()
placeholder = flipkart_data["description"].eq('No description')
flipkart_data = flipkart_data.loc[described & ~placeholder]
flipkart_data.isnull().sum()



In [None]:
# Show the most common brand, used to fill the missing brand values

brand_modes = flipkart_data['brand'].mode()
most_common = brand_modes[0]
print(most_common)

In [None]:
# Fill the missing brand values with the most common brand computed in the
# previous cell (`most_common`; "REEB" for the data this notebook was written against).
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment, which pandas warns about
# and which does not modify flipkart_data when Copy-on-Write is enabled.

flipkart_data = flipkart_data.assign(brand=flipkart_data["brand"].fillna(most_common))

In [None]:
# Keep the shared columns, in the same order as in all other datasets;
# title becomes the product_name column.

flipkart_columns = ["title", "brand", "category", "sub_category", "description"]
flipkart_data = flipkart_data[flipkart_columns].rename(columns={"title": "product_name"})
print(flipkart_data.columns)
flipkart_data.isnull().sum()

In [None]:
# DataFrame 5
# "df5" is the fifth CSV member of the archive, as keyed by the ZIP-loading cell.

adidas2_data =dataframes["df5"] 
data_details(adidas2_data)

In [None]:
# The adidas2 dataset has no null values.
# Its breadcrumbs column holds the sub_category, and name holds the product name.

adidas2_columns = ["name", "brand", "category", "breadcrumbs", "description"]
adidas2_renames = {"name": "product_name", "breadcrumbs": "sub_category"}
adidas2_data = adidas2_data[adidas2_columns].rename(columns=adidas2_renames)
print(adidas2_data.columns)
data_details(adidas2_data)

In [None]:
# DataFrame 6
# "df6" is the sixth CSV member of the archive, as keyed by the ZIP-loading cell.

elec_data = dataframes["df6"]
data_details(elec_data)


In [None]:
# The elec_data dataset is already clean; only the product-name column is renamed.
# NOTE(review): unlike the other sources, no column subset is selected here, so every
# column of elec_data is carried into the final concat — confirm that it holds exactly
# product_name, brand, category, sub_category and description.

elec_data= elec_data.rename(columns={"Product_name": "product_name"})
print(elec_data.columns)

In [None]:
# DataFrame 7
# "df7" is the seventh CSV member of the archive, as keyed by the ZIP-loading cell.

Bigbasket2 = dataframes["df7"]
data_details(Bigbasket2)

In [None]:
# Keep only the five columns that map onto the shared schema
bigbasket_columns = ["SKU Name", "Brand", "Category", "Sub-Category", "About the Product"]
Bigbasket2 = Bigbasket2.loc[:, bigbasket_columns]
data_details(Bigbasket2)

In [None]:
# Rename the columns to the shared schema, then drop rows with a null or placeholder description

bigbasket_renames = {
    "SKU Name": "product_name",
    "Brand": "brand",
    "Category": "category",
    "Sub-Category": "sub_category",
    "About the Product": "description",
}
Bigbasket2 = Bigbasket2.rename(columns=bigbasket_renames)
keep_rows = Bigbasket2["description"].notna() & Bigbasket2["description"].ne('No description')
Bigbasket2 = Bigbasket2[keep_rows]
Bigbasket2.isnull().sum()

In [None]:
# The brand column has only a few null values: drop those rows

Bigbasket2 = Bigbasket2[Bigbasket2["brand"].notna()]

In [None]:
# Overview of Bigbasket2 after the rename and the null / placeholder filtering above
data_details(Bigbasket2)

In [None]:
# Stack all prepared datasets into one frame with a fresh 0..n-1 index

prepared_datasets = [
    basket_data,
    adidas_data,
    amazon_data,
    adidas2_data,
    flipkart_data,
    elec_data,
    Bigbasket2,
]
Final_data = pd.concat(prepared_datasets, ignore_index=True)

In [None]:
# Combining all datasets
# NOTE(review): this cell repeats the concat statement of the previous cell verbatim;
# running it rebuilds the same Final_data, so the cell is redundant.

Final_data = pd.concat([basket_data, adidas_data, amazon_data ,adidas2_data , flipkart_data , elec_data  ,Bigbasket2], ignore_index=True)

In [None]:
# Overview of the combined dataset (shape, head, info, null counts, categorical summary)
data_details(Final_data)

In [None]:
# Save the combined (uncleaned) dataset as CSV in the user's Documents folder
save_path = os.path.expanduser("~/Documents/Final_data.csv")
# to_csv does not create missing folders, so make sure the target directory exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
Final_data.to_csv(save_path, index=False, encoding="utf-8")
print("Dataset Saved to : ", save_path)

In [None]:
# Cleaning only (ready for fine-tuning a pretrained model):
# for_model=False keeps the clean_description column as text, without tokenization.
clean_Final_data = preprocess_dataset_clean_only(Final_data, for_model=False)
data_details(clean_Final_data)

In [None]:
# Save the cleaned dataset as CSV in the user's Documents folder
save_path = os.path.expanduser("~/Documents/clean_Final_data.csv")
# to_csv does not create missing folders, so make sure the target directory exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
clean_Final_data.to_csv(save_path, index=False, encoding="utf-8")
print("Dataset Saved to : ", save_path)

In [None]:
# Clean + tokens (ready for a model trained from scratch):
# for_model=True replaces clean_description with the description_tokens column
# produced by tokenize_lemmatize.

clean_Final_data_model = preprocess_dataset_clean_only(Final_data, for_model=True)

In [None]:
# Preview the first 10 rows of the tokenized dataset
clean_Final_data_model.head(10)