In [None]:
!pip install openpyxl


In [None]:
import requests
import zipfile
import io
import os
import numpy as np
import pandas as pd

In [None]:
import requests
import io
import zipfile
import pandas as pd     

def data_download(url, timeout=60):
    """Download a tabular file from ``url`` and load it into a DataFrame.

    The format is chosen from the extension of the URL *path* (query string
    and fragment are ignored, matching is case-insensitive): ``.csv``,
    ``.json`` or ``.xlsx``. Anything else is inspected as a ZIP archive and
    the first ``.csv`` / ``.json`` member found is loaded.

    Parameters
    ----------
    url : str
        Address of the file to download.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout`` (default 60).

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (e.g. an expired link).
    ValueError
        If the payload is a ZIP without a CSV/JSON member, or is neither a
        recognised extension nor a ZIP archive.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as data.
    response.raise_for_status()
    payload = io.BytesIO(response.content)

    # Look only at the path: signed URLs carry a long query string after it.
    path = url.split("?", 1)[0].split("#", 1)[0].lower()
    if path.endswith(".csv"):
        return pd.read_csv(payload)
    if path.endswith(".json"):
        return pd.read_json(payload)
    if path.endswith(".xlsx"):
        return pd.read_excel(payload)

    # Otherwise it may be a ZIP: load the first CSV/JSON member.
    if zipfile.is_zipfile(payload):
        with zipfile.ZipFile(payload, "r") as bundle:
            members = bundle.namelist()
            print("Files in zip:", members)
            for member in members:
                lowered = member.lower()
                if lowered.endswith(".csv"):
                    reader = pd.read_csv
                elif lowered.endswith(".json"):
                    reader = pd.read_json
                else:
                    continue
                with bundle.open(member) as handle:
                    return reader(handle)
        raise ValueError("No CSV or JSON file found in the zip archive.")

    # Nothing matched.
    raise ValueError("Unsupported file format or not recognized.")

# Creating function for data details
def data_details(df, n=5):
    """Print a quick overview of ``df``: shape, head, info and categorical summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    n : int, optional
        Number of rows shown in the head preview (default 5).

    Returns
    -------
    None
        Everything is written to the cell output. ``display`` is used without
        an import, so this presumably relies on the notebook kernel providing it.
    """
    print("\n")
    print("Shape:")
    print(df.shape)
    print("\n")

    print("\n The Head")
    display(df.head(n))
    print("\n")

    print("\n Info:")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line.
    df.info(memory_usage="deep")
    print("\n")

    print("\nSummary statistics (categorical):")
    categorical_cols = df.select_dtypes(include=[object]).columns
    if len(categorical_cols) > 0:
        display(df.describe(include=[object]))
    else:
        print("No categorical columns found.")
    # Closing spacer for the section, emitted for both branches.
    print("\n")
    


def normalize_all(df):
    """Normalize column names and the text of object columns, in place.

    Column names are stripped, lower-cased, spaces/hyphens become ``_`` and
    remaining punctuation is removed. Values of object columns are
    lower-cased, stripped of punctuation, and have whitespace collapsed to
    single spaces. Missing values stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalize. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for chaining.
    """
    # normalize column names
    df.columns = (
        df.columns
        .astype(str)                                # tolerate non-string labels
        .str.strip()
        .str.lower()
        .str.replace(r"[\s\-]+", "_", regex=True)   # spaces/hyphens -> underscore
        .str.replace(r"[^\w]", "", regex=True)      # remove punctuation
    )

    # normalize values inside object (string) columns
    for col in df.select_dtypes(include=["object"]).columns:
        values = df[col]
        # Stringify only the present values so NaN/None do not become the
        # literal text "nan"/"none".
        text = values.where(values.isna(), values.astype(str))
        df[col] = (
            text
            .str.lower()
            .str.replace(r"[^\w\s]", "", regex=True)  # remove punctuation, keep spaces
            .str.replace(r"\s+", " ", regex=True)     # collapse runs of whitespace
            .str.strip()
        )
    return df

In [None]:
## 1- Loading Basket Dataset from kaggle
# NOTE(review): the URL carries X-Goog-Date / X-Goog-Expires / X-Goog-Signature
# query parameters, so it looks like a time-limited signed link -- confirm it is
# still valid (or regenerate it) before re-running this cell.
basket_data=data_download("https://storage.googleapis.com/kaggle-data-sets/2288739/3846442/compressed/BigBasket%20Products.csv.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20250912%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20250912T120535Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=6e5a7be6ab6f1fa07424e8df3fa8d147f7350d8085eb529b7b08ed41542dac4ce5028d272c371691bd34ffd98efea8b1925110fd4c313816656bdf99a052d825b10fd60dba41e31544bccd7046cec58e39867c1d60f409c51f9346f9f0155eec58c6bfd98372213cfab070befb3ae6f8b3ce2d98889b5bfe0c78fc8f800e599c82bfa21763d89f4bdc7fc8dc313e7b4ac6e7f4d19e1de0607c1ee084cfd9c770d37749cd7262364c9f8fab3dc12755238efd9f20707840bd3cb185aaddd646b3d8c9097ee8a75edba3fe1a3615fd0a4361d5e704909b378e1609d594722ed7bdd8c1318963e865d9902c95be4f0ffeff92072a6970f8cdf188951118749d4867")
# Print shape, head, info and categorical summary of the loaded frame.
data_details(basket_data)

In [None]:
# target = [description]
# features = [product_name, brand, category, sub_category]

# Rename first, then select: on a re-run "product" is already gone, the rename
# is a no-op and the selection still succeeds, so the cell can be run twice.
basket_data = basket_data.rename(columns={"product": "product_name"})[
    ["product_name", "brand", "category", "sub_category", "description"]
]

print(basket_data.columns)
basket_data.info()


In [None]:
# product_name, brand and description contain missing values: keep only the
# rows where all three are present, then re-check the frame.
basket_data = basket_data.dropna(
    axis="index",
    how="any",
    subset=["description", "product_name", "brand"],
)
basket_data.info()
basket_data.isna().sum()

In [None]:
# 2-Loading Adidas Dataset from Opendatabay
# NOTE(review): the URL points at a kaggle-data-sets bucket although the header
# says Opendatabay -- confirm the provenance. It also carries X-Goog-Date /
# X-Goog-Expires / X-Goog-Signature parameters, so it looks like a time-limited
# signed link; confirm it is still valid before re-running this cell.
adidas_data=data_download(url = "https://storage.googleapis.com/kaggle-data-sets/926008/1567101/compressed/Adidas%20Vs%20Nike.csv.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20250912%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20250912T164849Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=378cdb431c332dca93c920cbfb664a1f62485d820c228b8c741560e3dbc745ec8b7d9c15563c68d8d726c453ca1c9591efb8daff8f533bbc7e8f263541442f9e3a25ac2dfe3d7d10cb106895f3df726e125f1eb1e7185c50b4950ca1f740d9ec3f76aaea77d9c963aaa117917dc15992a70f01e93d7b74b35d2f80e5a5e975f91b59452ff6d356f1bc9da0982beb626da6ecee09113eb205cead59651412988dbd4aa7c744c93895d1c4571f21cfd71c3948206a7eb8011568f98458331954356fcf9dd5f1418fbd21a64585fa1c5c2f47ecaa8f564cba3a06132ce240c237e793bf35c9fd8f37575842109e1a26afc3069a01b4d2416b5810e85016a17854b2")
# Print shape, head, info and categorical summary of the loaded frame.
data_details(adidas_data)

In [None]:
# Rename to the shared column names first, then select: a second run of this
# cell then finds the renamed columns instead of failing with a KeyError.
adidas_data = adidas_data.rename(
    columns={
        "Product Name": "product_name",
        "Brand": "brand",
        "Description": "description",
    }
)[["product_name", "brand", "description"]]
adidas_data.info()

In [None]:
# The description column has a few nulls (only 3, per the original note):
# drop them together with the 'No description' placeholder rows.
adidas_data = adidas_data.dropna(subset=["description"]).loc[
    lambda frame: frame["description"].ne("No description")
]

adidas_data.info()

In [None]:
# The dataset has good descriptions for our model but no category / sub_category
# columns, so both are derived from keywords found in product_name.
#
# Maps keyword -> (category, sub_category). Entry order matters: the lookup
# returns the first keyword contained in the name, so more specific phrases
# must stay ahead of the generic words they contain.
# All labels are lower-case so the same sub-category is not split into two
# spellings (e.g. "running shoes" vs "Running Shoes").
category_map = {
    # category = footwear
    "shoe": ("footwear", "shoes"),
    "sneaker": ("footwear", "shoes"),
    "running": ("footwear", "running shoes"),
    "trainer": ("footwear", "trainers"),
    "cleat": ("footwear", "cleats"),
    "slipper": ("footwear", "slippers"),
    "flip flop": ("footwear", "flip flops"),
    "jordan": ("footwear", "basketball shoes"),
    "retro": ("footwear", "shoes"),
    "phantom": ("footwear", "cleats"),
    "venom": ("footwear", "cleats"),
    "mercurial": ("footwear", "soccer shoes"),
    "superfly": ("footwear", "soccer shoes"),
    "tf": ("footwear", "turf soccer shoes"),
    "air max": ("footwear", "sneakers"),
    "p-6000": ("footwear", "running shoes"),
    "sandal": ("footwear", "sandals"),
    "slide": ("footwear", "slides"),
    "adilette": ("footwear", "slides"),
    "flipflop": ("footwear", "flip flops"),
    "sb": ("footwear", "skate shoes"),
    "skate": ("footwear", "skate shoes"),
    "chron": ("footwear", "skate shoes"),
    "kd": ("footwear", "basketball shoes"),
    "kyrie": ("footwear", "basketball shoes"),
    "iconclash": ("footwear", "running shoes"),
    "daybreak": ("footwear", "sneakers"),
    "blazer": ("footwear", "sneakers"),
    "prelove": ("footwear", "sneakers"),
    "pegasus": ("footwear", "running shoes"),
    "vaporfly": ("footwear", "running shoes"),
    "zoomx": ("footwear", "running shoes"),
    "slipon": ("footwear", "slip-ons"),
    "airforce": ("footwear", "sneakers"),
    "airmax": ("footwear", "sneakers"),
    "metcon": ("footwear", "training shoes"),
    "court": ("footwear", "tennis shoes"),
    "pg": ("footwear", "basketball shoes"),
    "m2k": ("footwear", "sneakers"),
    "winflo": ("footwear", "running shoes"),
    "vomero": ("footwear", "running shoes"),
    "vapormax": ("footwear", "lifestyle sneakers"),
    "flip-flop": ("footwear", "flip flops"),
    "flip-flops": ("footwear", "flip flops"),
    "slip-on": ("footwear", "slip-ons"),
    "slip-ons": ("footwear", "slip-ons"),
    "odyssey react": ("footwear", "running shoes"),
    "legend react": ("footwear", "running shoes"),
    "pre-love": ("footwear", "sneakers"),
    "air force": ("footwear", "sneakers"),
    "drop-type": ("footwear", "running shoes"),
    "zoom rival fly": ("footwear", "running shoes"),
    "mx-720-818": ("footwear", "running shoes"),
    "tanjun": ("footwear", "running shoes"),
    "superstar": ("footwear", "sneakers"),
    "slip on": ("footwear", "slip-ons"),
    "lebron soldier": ("footwear", "basketball shoes"),
    "react element": ("footwear", "running shoes"),
    "free rn": ("footwear", "running shoes"),
    "zoom fly": ("footwear", "running shoes"),
    "zoom rise": ("footwear", "running shoes"),
    "tiempo legend": ("footwear", "soccer shoes"),
    "flex rn": ("footwear", "running shoes"),
    "air zoom structure": ("footwear", "running shoes"),
    "sfb gen 2": ("footwear", "boots"),
    "air huarache": ("footwear", "sneakers"),
    "wildhorse": ("footwear", "running shoes"),
    "benassi": ("footwear", "slides"),
    "terra kiger": ("footwear", "running shoes"),
    "classic cortez": ("footwear", "sneakers"),
    "renew run": ("footwear", "running shoes"),
    "free tr": ("footwear", "training shoes"),
    "lebron": ("footwear", "basketball shoes"),
    "mowabb": ("footwear", "sneakers"),
    "revolution": ("footwear", "running shoes"),
    "precision": ("footwear", "basketball shoes"),
    "shox": ("footwear", "running shoes"),
    "potential": ("footwear", "basketball shoes"),
    "epic react": ("footwear", "running shoes"),
    "react city": ("footwear", "running shoes"),
    "kawa": ("footwear", "slides"),
    "joyride run": ("footwear", "running shoes"),
    "joyride optik": ("footwear", "running shoes"),
    "flex contact": ("footwear", "running shoes"),
    "football": ("footwear", "football shoes"),
    "predator": ("footwear", "football shoes"),
    "vandalised": ("footwear", "casual shoes"),
    "canyon": ("footwear", "casual shoes"),
    "react": ("footwear", "running shoes"),
    "acg": ("footwear", "outdoor shoes"),
    "flex": ("footwear", "training shoes"),
    "signal": ("footwear", "running shoes"),
    "joyride": ("footwear", "running shoes"),
    "cortez": ("footwear", "casual shoes"),
    "hawkins": ("footwear", "casual shoes"),
    "nemeziz": ("footwear", "football shoes"),
    "indoor": ("footwear", "indoor shoes"),
    "outdoor": ("footwear", "outdoor shoes"),
    "trail": ("footwear", "outdoor shoes"),
    "superrep": ("footwear", "training shoes"),
    "zoom": ("footwear", "running shoes"),
    "tr": ("footwear", "training shoes"),
    "renew": ("footwear", "running shoes"),
    "ghost": ("footwear", "running shoes"),
    "racer": ("footwear", "running shoes"),
    "alphadunk": ("footwear", "basketball shoes"),
    "monarch": ("footwear", "walking shoes"),
    "af-1": ("footwear", "casual shoes"),
    "bella": ("footwear", "casual shoes"),
    "huarache": ("footwear", "lifestyle shoes"),
    "solarsoft": ("footwear", "training shoes"),
    "exp-x14": ("footwear", "running shoes"),
    "fly.by": ("footwear", "basketball shoes"),
    "xarr": ("footwear", "training shoes"),
    "skarn": ("footwear", "casual shoes"),
    "tailwind": ("footwear", "running shoes"),
    "air dsvm": ("footwear", "running shoes"),
    # category = accessories
    "sock": ("accessories", "socks"),
    "cap": ("accessories", "cap"),
    "hat": ("accessories", "cap"),
    "bag": ("accessories", "bag"),
    "backpack": ("accessories", "bag"),
    "watch": ("accessories", "watch"),
}
    
def categorize_product(name):
    """Return ``(category, sub_category)`` for a product name.

    The name is stringified and lower-cased, then the entries of
    ``category_map`` are scanned in insertion order; the labels of the first
    keyword contained in the name are returned. ``("Other", "Other")`` is the
    fallback when no keyword matches.

    NOTE(review): matching is plain substring containment, so a short keyword
    also matches inside longer words -- confirm this is intended.
    """
    lowered = str(name).lower()
    matches = (
        (category, sub_category)
        for keyword, (category, sub_category) in category_map.items()
        if keyword in lowered
    )
    # First hit wins; fall back to the generic bucket.
    return next(matches, ("Other", "Other"))

# Label every product in one pass and build a single two-column frame, instead
# of creating one pd.Series per row. Working on an explicit copy keeps pandas
# from treating adidas_data as a slice of the earlier filtered frame (which can
# trigger SettingWithCopyWarning on older pandas versions).
adidas_data = adidas_data.copy()
adidas_data[["category", "sub_category"]] = pd.DataFrame(
    [categorize_product(name) for name in adidas_data["product_name"]],
    index=adidas_data.index,
    columns=["category", "sub_category"],
)

In [None]:
# Put the columns in the shared order used by every dataset.
adidas_data = adidas_data.loc[
    :, ["product_name", "brand", "category", "sub_category", "description"]
]
print(adidas_data.columns)
data_details(adidas_data)

In [None]:
# 3- Loading Amazon Dataset from Opendatabay
# NOTE(review): the URL points at a kaggle-data-sets bucket although the header
# says Opendatabay -- confirm the provenance. It also carries X-Goog-Date /
# X-Goog-Expires / X-Goog-Signature parameters, so it looks like a time-limited
# signed link; confirm it is still valid before re-running this cell.
amazon_data=data_download('https://storage.googleapis.com/kaggle-data-sets/2818963/4862520/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20250912%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20250912T150355Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=810f49074b0fc7b58e3eba124e0be1901631c22dd208c608fe434b27f1e86300966901b0d0bd18b5922c871b8c1ba0724f33bbb4a3c9dc8e793903e39a9bd33790b14c5ad26c9d22efba4cd159d69b9d281b61119e72d434a6635db83726b57646ea54090ee85862529e1fe5509a4dd59eeb09f45d9918e9d3faf2c181f5d9d90be38a2008908fb7eb2d38f59ec307baa526eeb1ab95776d8e1a6bd8d1b54590b81a7e2af59924213c67c9acd7b6b76fc01429d5a5f171abff7ecb59db94de62a2bff5c2f18b5f9bdebe5348fcca71f29be993192d1244941e948de9a012234610138ed316a29067f703da46fccd1a46dd4c81097fc3bb60b81e94e2615edd17')
# Print shape, head, info and categorical summary of the loaded frame.
data_details(amazon_data)


In [None]:
# Missing-value count per column.
amazon_data.isna().sum()

In [None]:
# There are no null values in the columns we need from this dataset.
# The Amazon dataset has no brand column; the first word of product_name appears to be the brand,
# so we create a function that derives the brand column from the product_name column.

def map_brand(name):
    """Return the first whitespace-separated word of ``name`` as the brand.

    Parameters
    ----------
    name : str
        Product name.

    Returns
    -------
    str or None
        The first word, or ``None`` when ``name`` is not a string (e.g. NaN)
        or contains no word at all, instead of raising.
    """
    if not isinstance(name, str):
        return None
    words = name.split()
    return words[0] if words else None

# Derive brand from the first word of each product name.
amazon_data['brand'] = amazon_data['product_name'].apply(map_brand)

# The Amazon dataset has no sub_category column; the category values are
# '|'-separated paths, so take the last (most specific) level. The .str
# accessor leaves a missing category as NaN instead of raising.
amazon_data['sub_category'] = amazon_data['category'].str.split('|').str[-1]

amazon_data.head()

In [None]:
# Arrange the columns to be the same in all datasets.
# Rename first, then select: on a re-run "about_product" is already gone, the
# rename is a no-op and the selection still succeeds.
amazon_data = amazon_data.rename(columns={"about_product": "description"})[
    ["product_name", "brand", "category", "sub_category", "description"]
]
print(amazon_data.columns)
data_details(amazon_data)

In [None]:
# 4- Loading Flipkart Dataset from Opendatabay
# NOTE(review): the URL points at a kaggle-data-sets bucket although the header
# says Opendatabay -- confirm the provenance. It also carries X-Goog-Date /
# X-Goog-Expires / X-Goog-Signature parameters, so it looks like a time-limited
# signed link; confirm it is still valid before re-running this cell.
flipkart_data=data_download("https://storage.googleapis.com/kaggle-data-sets/3477596/6075698/compressed/flipkart_fashion_products_dataset.json.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20250912%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20250912T142240Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=945702975af9d84228e964ca3fd1dc22ea0b07fe67f1488fdd9a735de3c2159cceb18dd7c3e1cbaeecfc20712984dfee767cc6c91cac2ebad9d3bf7cf09f99b88bc0560fdcdba843313a360191054f20e5f1f79d2a3ce89ecf1cceac55ac724d9ca6083d0e5251297cd4c15fc6dd63109bf75113ee89fddfdc29142b11ca7f3ea290ac4dd7b973a8c53e2e4827f3531afef1b38056a42fa2f71140b94f6e3dfc41dd899c9ceae191123eb5016b77a68fef2208c342c8df5026970b9b768fbdc7c113357b7dede06fab536232fc645c30b7aa84771547a2b6ad0a34880d501aa1e4d466eb6244969b21bc55d68586cf43adadd2fe1aa2eb52acbd53e2087d3c42")
# Print shape, head, info and categorical summary of the loaded frame.
data_details(flipkart_data)

In [None]:
# No null values were found in the Flipkart dataset.
# Arrange the columns to be the same in all datasets.
# Rename first, then select: on a re-run "title" is already gone, the rename is
# a no-op and the selection still succeeds.
flipkart_data = flipkart_data.rename(columns={"title": "product_name"})[
    ["product_name", "brand", "category", "sub_category", "description"]
]
print(flipkart_data.columns)
data_details(flipkart_data)

In [None]:
# 5- Loading adidas2 Dataset from Opendatabay
# NOTE(review): the URL points at a kaggle-data-sets bucket although the header
# says Opendatabay -- confirm the provenance. It also carries X-Goog-Date /
# X-Goog-Expires / X-Goog-Signature parameters, so it looks like a time-limited
# signed link; confirm it is still valid before re-running this cell.
adidas2_data=data_download('https://storage.googleapis.com/kaggle-data-sets/2570428/4373026/compressed/adidas_usa.csv.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20250912%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20250912T170301Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=a1624a4b88f51f5343e607ef3d9f1e396edf7198a885749ab5fdb52a12eba0f0a9f65a25b375d12f951db868e01ae7b168513a1720fbc99d48ec80a3ec64f76f8a90cdb0e88fc9be7dde3dc27649f3be9b4c5f337a2a7956e0f76607a86345dd26f75e68ab5b0548272f20b3f94188f79864d2acaa66ffdce124cc635c392d5e39ff0c6d5f0d9f80207e4089716fca6ab705aaa3706bc4a29b2bc9d31edd4bc71cf57af5a5431698f691c7dde8f1ae052f484d0423090504c1b7ac4fc0eab9b6152d31d558bab58d51e3287ea4f3e9d542e35a5a8a39c40e49b9320d90102819383e42b8470bedea9e641c4db2fb2007551ca8388af76624bbc1ed25e8ba381f')
# Print shape, head, info and categorical summary of the loaded frame.
data_details(adidas2_data)

In [None]:
# The adidas2 dataset has no null values.
# The breadcrumbs column is used as sub_category.
# Rename first, then select: on a re-run "name" / "breadcrumbs" are already
# gone, the rename is a no-op and the selection still succeeds.
adidas2_data = adidas2_data.rename(
    columns={"name": "product_name", "breadcrumbs": "sub_category"}
)[["product_name", "brand", "category", "sub_category", "description"]]
print(adidas2_data.columns)
data_details(adidas2_data)

In [None]:
# 6- Download elec data from github
elec_data=data_download('https://raw.githubusercontent.com/Eng-Shady-Hub/Generative_AI_Project_Round3/refs/heads/main/electronics_products_full_edit.xlsx')
# data_details already shows shape, head and info, so the whole frame is not
# printed separately.
data_details(elec_data)


In [None]:
# The original author considers elec_data already clean; only the
# product-name header is aligned with the other datasets.
# NOTE(review): no column subset is taken here -- confirm elec_data holds
# exactly the five shared columns before it is concatenated.
elec_data = elec_data.rename({"Product_name": "product_name"}, axis="columns")
print(elec_data.columns)

In [None]:
# Stack the six prepared frames into one, with a fresh 0..n-1 row index.
data_all = pd.concat(
    objs=[
        basket_data,
        adidas_data,
        amazon_data,
        adidas2_data,
        flipkart_data,
        elec_data,
    ],
    axis=0,
    ignore_index=True,
)

In [None]:
# Preview the first five rows of the combined frame.
data_all.head(n=5)

In [None]:
# (rows, columns) of the combined frame.
data_all.shape

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line.
data_all.info()

In [None]:
# Fixed seed so the same five rows are shown on every Restart & Run All.
data_all.sample(5, random_state=42)

In [None]:
save_path = os.path.expanduser("~/Documents/data_all.csv")
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
data_all.to_csv(save_path, index=False, encoding="utf-8")
print("Dataset Saved to : ", save_path)
