In [None]:
#!pip install openpyxl
#!pip install spacy
#!pip install requests
#!pip install pandas
#!pip install torch torchvision torchaudio
#!pip install scikit-learn
#!pip install openpyxl
#!pip install gensim



In [None]:
import requests
import zipfile
import io
import os
import re
import spacy
import numpy as np
import pandas as pd
import random
import torch.nn.functional as F
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from gensim.models import Word2Vec
from torch.nn.utils.rnn import pad_sequence



# Load spaCy model
nlp = spacy.load("en_core_web_sm")

In [None]:
# Reproducibility & device
SEED = 42  # single seed shared by every random number generator used below
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
####    General Functions   ####

# 1- Function for showing the dataframe characteristics

def data_details(df, n=5):
    """Print a quick overview of a DataFrame.

    Shows, in order: the shape, the first ``n`` rows, ``df.info()``, the
    per-column null counts, and summary statistics of the object-dtype
    (categorical/text) columns.

    Args:
        df: The pandas DataFrame to inspect.
        n: Number of leading rows to display (default 5).

    Returns:
        None. Everything is printed or sent to the notebook's ``display``.
    """
    print("\n")
    print("Shape:")
    print(df.shape)
    print("\n")

    print("\n The Head")
    display(df.head(n))
    print("\n")

    print("\n Info:")
    # df.info() writes its own report and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    df.info(memory_usage="deep")
    print("\n")

    print("\n The Null Values:")
    print(df.isnull().sum())
    print("\n")

    print("\nSummary statistics (categorical):")  # the features we need are categorical
    categorical_cols = df.select_dtypes(include=[object]).columns
    if len(categorical_cols) > 0:
        display(df.describe(include=[object]))
    else:
        print("No categorical columns found.")
        print("\n")

# 2- Function for cleaning the DataFrame (Normalization)

def clean_text(text, mode="input", lowercase=True):
    """Normalise one raw text field.

    Args:
        text: Raw value; NaN/None yields ``""``; other non-string values are
            converted with ``str()``.
        mode: ``"description"`` for the target description, anything else for
            the input features (product_name, brand, category, sub_category).
        lowercase: When True the result is lowercased; when False the original
            casing is kept (used for product names and brands).

    Returns:
        The cleaned string. HTML tags and URLs are removed in both modes.
        Description mode keeps only letters, digits, spaces, ``&`` and ``-``.
        Input mode additionally splits camelCase/PascalCase words, pads ``&``
        with spaces, turns the separators ``|``, ``/`` and ``>`` into ``" > "``
        and keeps ``'``.
    """
    if pd.isna(text):
        return ""
    text = str(text)  # numeric or other non-string cells would otherwise fail on .lower()

    text = re.sub(r"<.*?>", " ", text)  # Remove HTML tags
    # Case-insensitive because lowercasing now happens after URL removal
    text = re.sub(r"(https?://\S+|www\.\S+|ftp://\S+)", " ", text, flags=re.IGNORECASE)  # Remove URLs

    if mode == "description":
        if lowercase:
            text = text.lower()
        # Keep only letters, numbers, spaces, & and - (uppercase survives only when lowercase=False)
        keep_pattern = r"[^a-z0-9\s&-]" if lowercase else r"[^a-zA-Z0-9\s&-]"
        text = re.sub(keep_pattern, " ", text)
    else:  # input features
        # Split camelCase/PascalCase BEFORE lowercasing; once the text is
        # lowercased there are no upper-case letters left for this to match.
        text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
        if lowercase:
            text = text.lower()

        text = re.sub(r"&", " & ", text)  # Add spaces around &
        text = re.sub(r"(\||/|>)", " > ", text)  # Normalise separators (|, /, >) to " > "

        if lowercase:
            text = re.sub(r"[^a-z0-9\s&'\->]", " ", text)  # category & sub_category
        else:
            text = re.sub(r"[^a-zA-Z0-9\s&'\->]", " ", text)  # product_name & brand

    text = re.sub(r"\s+", " ", text).strip()  # Collapse runs of whitespace
    # Collapse any run of separators ("a > > b", produced by "a||b") into one
    text = re.sub(r"(?:\s*>\s*)+", " > ", text)
    text = text.strip(" >")  # Remove leading/trailing separators

    return text

# 3- function to apply tokenization - lemmatization - stopword/punctuation removal
# and keep the original casing for product names and brand

def tokenize_lemmatize(text, product_name=None, brand=None):
    """Tokenise ``text`` with the module-level spaCy pipeline ``nlp``.

    Tokens that literally appear in ``product_name`` or ``brand`` (split on
    whitespace) are emitted unchanged so their casing survives. Every other
    token is kept only if it is alphabetic and not a stop word, and is
    replaced by its lemma.

    Returns:
        A list of strings; an empty list when ``text`` is empty or None.
    """
    if not text:
        return []

    # Every whitespace-separated part of the product name and the brand
    preserve_tokens = {
        part
        for source in (product_name, brand)
        if source
        for part in source.split()
    }

    return [
        token.text if token.text in preserve_tokens else token.lemma_
        for token in nlp(text)
        if token.text in preserve_tokens or (not token.is_stop and token.is_alpha)
    ]



# 4- function to apply the clean_text and tokenize_lemmatize on our dataframe 
def _restore_original_casing(description, *cased_sources):
    """Re-apply the casing of product/brand tokens inside a lowercased description."""
    for source in cased_sources:
        if not source:
            continue
        for token in source.split():
            pattern = r"\b" + re.escape(token.lower()) + r"\b"
            # A callable replacement inserts the token verbatim, with no
            # backslash or group-reference expansion.
            description = re.sub(pattern, lambda _match, _tok=token: _tok, description)
    return description


def preprocess_dataset_clean_only(df, for_model=False):
    """Clean the five text columns of ``df`` and return a new DataFrame.

    Args:
        df: DataFrame with the columns ``product_name``, ``brand``,
            ``category``, ``sub_category`` and ``description``.
        for_model: When False, return the cleaned text; when True, return the
            tokenised/lemmatised version.

    Returns:
        A new DataFrame (index reset) with columns
        ``["combined_input", "clean_description"]`` when ``for_model`` is
        False, or ``["input_tokens", "description_tokens"]`` when it is True.
        Rows with a duplicate or empty cleaned description are dropped.
    """
    clean_df = pd.DataFrame()  # only cleaned columns are returned

    # Product name and brand keep their original casing
    for col in ("product_name", "brand"):
        clean_df[f"clean_{col}"] = df[col].apply(
            lambda x: clean_text(x, mode="input", lowercase=False)
        )

    # Category columns are lowercased
    for col in ("category", "sub_category"):
        clean_df[f"clean_{col}"] = df[col].apply(
            lambda x: clean_text(x, mode="input", lowercase=True)
        )

    clean_df["clean_description"] = df["description"].apply(
        lambda x: clean_text(x, mode="description", lowercase=True)
    )

    # Replace lowercase product/brand mentions in the description with the
    # casing used in the product name, then the brand.
    clean_df["clean_description"] = [
        _restore_original_casing(desc, name, brand)
        for desc, name, brand in zip(
            clean_df["clean_description"],
            clean_df["clean_product_name"],
            clean_df["clean_brand"],
        )
    ]

    # Drop duplicate and empty descriptions
    clean_df = clean_df.drop_duplicates(subset=["clean_description"])
    clean_df = clean_df[clean_df["clean_description"] != ""].reset_index(drop=True)

    # Merge the 4 input features into one text
    clean_df["combined_input"] = (
        clean_df["clean_product_name"] + " " +
        clean_df["clean_brand"] + " " +
        clean_df["clean_category"] + " " +
        clean_df["clean_sub_category"]
    ).str.strip()

    if not for_model:
        return clean_df[["combined_input", "clean_description"]].copy()

    # The product/brand columns must still exist here: they tell the tokenizer
    # which tokens to keep verbatim. Dropping them before this point made the
    # for_model=True path raise KeyError.
    names = clean_df["clean_product_name"]
    brands = clean_df["clean_brand"]

    clean_df["input_tokens"] = [
        tokenize_lemmatize(text, name, brand)
        for text, name, brand in zip(clean_df["combined_input"], names, brands)
    ]
    clean_df["description_tokens"] = [
        tokenize_lemmatize(text, name, brand)
        for text, name, brand in zip(clean_df["clean_description"], names, brands)
    ]

    return clean_df[["input_tokens", "description_tokens"]].copy()
    

# 5- Function acts as a data preparation pipeline that transforms raw, messy text data into the structured, numerical format required for training the GAN model.  

def preprocess_and_split_data(df, preprocessor, test_size=0.2, val_size=0.1, random_state=42, max_len_target=200):
    """Clean, tokenise, encode, split and pad ``df`` for training.

    Args:
        df: Raw DataFrame accepted by ``preprocess_dataset_clean_only``.
        preprocessor: Object providing ``build_vocab``, ``encode``,
            ``pad_sequences`` and a ``word2idx`` mapping.
        test_size: Fraction of all rows held out for the test split.
        val_size: Fraction of all rows held out for validation (rescaled
            internally because it is taken from the non-test remainder).
        random_state: Seed passed to both ``train_test_split`` calls.
        max_len_target: Length every input and description sequence is
            padded/truncated to.

    Returns:
        dict with keys ``train``, ``val``, ``test`` (TensorDatasets of
        (input, description) pairs), ``vocab_size``, ``max_len`` and
        ``clean_df`` (the tokenised DataFrame).
    """
    print("\n Starting preprocessing and splitting...\n")

    # Step 1: clean and tokenise
    clean_df = preprocess_dataset_clean_only(df, for_model=True)
    print(f"Cleaning and tokenization complete. Rows: {len(clean_df)}")

    # Step 2: one shared vocabulary over inputs and descriptions
    input_token_lists = clean_df["input_tokens"].tolist()
    description_token_lists = clean_df["description_tokens"].tolist()
    preprocessor.build_vocab(input_token_lists + description_token_lists)
    print(f"Vocabulary built. Vocab size: {len(preprocessor.word2idx)}")

    # Step 3: tokens -> ids
    input_ids = list(map(preprocessor.encode, clean_df["input_tokens"]))
    desc_ids = list(map(preprocessor.encode, clean_df["description_tokens"]))

    # Step 4: split before padding; first carve off the test set...
    fit_inputs, test_inputs, fit_desc, test_desc = train_test_split(
        input_ids, desc_ids, test_size=test_size, random_state=random_state
    )
    # ...then take validation out of what is left, rescaling the fraction
    train_inputs, val_inputs, train_desc, val_desc = train_test_split(
        fit_inputs,
        fit_desc,
        test_size=val_size / (1 - test_size),
        random_state=random_state,
    )

    # Step 5: pad every split to the same fixed length
    def pad(sequences):
        return preprocessor.pad_sequences(sequences, max_len=max_len_target)

    train_inputs, val_inputs, test_inputs = pad(train_inputs), pad(val_inputs), pad(test_inputs)
    train_desc, val_desc, test_desc = pad(train_desc), pad(val_desc), pad(test_desc)

    # Step 6: wrap as TensorDatasets
    make_dataset = torch.utils.data.TensorDataset
    train_ds = make_dataset(train_inputs, train_desc)
    val_ds = make_dataset(val_inputs, val_desc)
    test_ds = make_dataset(test_inputs, test_desc)

    print(f"\n Data split complete:")
    print(f"\n Train set: {len(train_ds)} samples")
    print(f"\n Validation set: {len(val_ds)} samples")
    print(f"\n Test set: {len(test_ds)} samples")

    # Step 7: everything the training code needs
    result = {"train": train_ds, "val": val_ds, "test": test_ds}
    result["vocab_size"] = len(preprocessor.word2idx)
    result["max_len"] = train_inputs.size(1)
    result["clean_df"] = clean_df
    return result



In [None]:
#  Word2Vec Training Separately

def train_word2vec(clean_df, vector_size=128, window=5, min_count=1, epochs=10):
    """Train a gensim Word2Vec model on the input and description tokens.

    Args:
        clean_df: DataFrame with ``input_tokens`` and ``description_tokens``
            columns, each cell holding a list of tokens.
        vector_size, window, min_count, epochs: Forwarded to ``Word2Vec``.

    Returns:
        The trained ``Word2Vec`` model (``sg=1``, 4 workers).
    """
    corpus = [
        *clean_df["input_tokens"].tolist(),
        *clean_df["description_tokens"].tolist(),
    ]
    hyperparams = dict(
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=4,
        sg=1,
        epochs=epochs,
    )
    w2v_model = Word2Vec(sentences=corpus, **hyperparams)

    print("Word2Vec training complete. Vocab size:", len(w2v_model.wv))
    print(w2v_model.wv)
    return w2v_model

In [None]:
## This class serves as the vocabulary manager and sequence formatter for the GAN

class TextGANPreprocessor:
    """Vocabulary manager and sequence formatter for the text GAN.

    Holds the token <-> id mappings (ids 0-3 are reserved for ``<PAD>``,
    ``<START>``, ``<END>`` and ``<UNK>``) and converts token lists to padded
    id tensors and back.
    """

    def __init__(self, min_freq=1):
        """Create an empty vocabulary.

        Args:
            min_freq: Minimum corpus frequency a token needs to get its own id.
        """
        self.word2idx = {"<PAD>": 0, "<START>": 1, "<END>": 2, "<UNK>": 3}
        self.idx2word = {}
        self.min_freq = min_freq
        self.word_freq = {}

    def build_vocab(self, token_lists):
        """Count tokens in ``token_lists`` and assign ids to the frequent ones.

        Counts accumulate across calls; ids already assigned are never changed.
        """
        for tokens in token_lists:
            for t in tokens:
                self.word_freq[t] = self.word_freq.get(t, 0) + 1
        for word, freq in self.word_freq.items():
            if freq >= self.min_freq and word not in self.word2idx:
                self.word2idx[word] = len(self.word2idx)
        self.idx2word = {i: w for w, i in self.word2idx.items()}

    def encode(self, tokens):
        """Map tokens to ids; unknown tokens become the ``<UNK>`` id."""
        unk_id = self.word2idx["<UNK>"]
        return [self.word2idx.get(t, unk_id) for t in tokens]

    def pad_sequences(self, sequences, max_len=200):
        """Pad/truncate id lists to ``max_len`` and stack them into a tensor.

        Args:
            sequences: Iterable of id lists.
            max_len: Target length. ``None`` means "length of the longest
                sequence" (0 when there are no sequences).

        Returns:
            A ``torch.long`` tensor of shape ``(len(sequences), max_len)``.
            An empty input yields shape ``(0, max_len)`` so callers can still
            read ``.size(1)``.
        """
        sequences = list(sequences)
        if max_len is None:
            # default=0 keeps an empty batch from raising ValueError
            max_len = max((len(s) for s in sequences), default=0)

        if not sequences:
            # torch.tensor([]) would be 1-D; keep the (rows, max_len) layout
            return torch.empty((0, max_len), dtype=torch.long)

        pad_id = self.word2idx["<PAD>"]
        # Longer sequences are truncated, shorter ones right-padded
        padded = [list(s[:max_len]) + [pad_id] * max(0, max_len - len(s)) for s in sequences]
        return torch.tensor(padded, dtype=torch.long)

    # helper for generation & prompts
    def encode_text(self, text):
        """Clean, tokenise and encode a raw prompt string (lowercased input mode)."""
        toks = tokenize_lemmatize(clean_text(text, mode="input", lowercase=True), None, None)
        return self.encode(toks)

    def decode_text(self, ids):
        """Turn ids back into a space-joined string.

        ``<PAD>``, ``<START>`` and ``<END>`` are skipped; ids missing from the
        vocabulary are rendered as ``<UNK>``.
        """
        words = []
        for idx in ids:
            w = self.idx2word.get(int(idx), "<UNK>")
            if w in ("<PAD>", "<START>", "<END>"):
                continue
            words.append(w)
        return " ".join(words)

## Generator class to generate realistic text sequences (product descriptions) conditioned on specific features  and random noise.        

class Generator(nn.Module):
     """Conditional LSTM generator producing per-step vocabulary logits.

     The condition sequence is embedded and encoded by a bidirectional LSTM.
     Its final hidden states are concatenated with the noise vector and
     projected to the decoder's initial hidden state. A 2-layer LSTM decoder
     then runs for ``max_len`` steps starting from the ``<START>`` token (id 1).
     """
     def __init__(self, vocab_size, embed_dim, hidden_dim, cond_dim, max_len, embedding_matrix=None, freeze_embeddings=False, dropout=0.3):
        """Build the embedding, condition encoder and decoder.

        Args:
            vocab_size: Size of the token vocabulary (embedding rows, output logits).
            embed_dim: Embedding dimension.
            hidden_dim: Decoder hidden size; the encoder uses ``hidden_dim // 2``
                per direction.
            cond_dim: Width of the noise vector concatenated with the encoded
                condition before ``cond_fc``.
            max_len: Number of decoding steps performed by ``forward``.
            embedding_matrix: Optional tensor copied into the embedding weights.
            freeze_embeddings: Only honoured when ``embedding_matrix`` is given;
                then the embedding is trainable iff this is False.
            dropout: Dropout between the decoder LSTM layers.
        """
        super().__init__()
        self.max_len = max_len
        self.hidden_dim = hidden_dim
        self.cond_dim = cond_dim

        # One embedding table shared by the condition encoder and the decoder
        self.embed = nn.Embedding(vocab_size, embed_dim)
        if embedding_matrix is not None:
            self.embed.weight.data.copy_(embedding_matrix)
            self.embed.weight.requires_grad = not freeze_embeddings

        # LSTM Encoder for the conditional sequence
        # NOTE(review): hidden_dim // 2 per direction means the concatenated state is
        # hidden_dim wide only when hidden_dim is even — cond_fc below assumes that.
        self.cond_encoder = nn.LSTM(embed_dim, hidden_dim // 2,batch_first=True, bidirectional=True)

        # Linear projection to initial hidden state
        self.cond_fc = nn.Linear(hidden_dim + cond_dim, hidden_dim)

        # Decoder LSTM for text generation
        self.num_layers = 2
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, dropout=dropout, num_layers=self.num_layers)
        self.fc_out = nn.Linear(hidden_dim, vocab_size)

     def forward(self, noise, cond_seq, teacher_forcing_ratio=0.0, targets=None):
        """Generate logits for ``max_len`` steps.

        Args:
            noise: Tensor concatenated with the encoded condition along dim 1;
                it must have ``cond_seq.size(0)`` rows and ``cond_dim`` columns
                for ``cond_fc`` to accept it.
            cond_seq: Batch-first tensor of condition token ids.
            teacher_forcing_ratio: Per-step probability of feeding the target
                token instead of the model's own soft prediction.
            targets: Token-id tensor indexed as ``targets[:, t]``; teacher
                forcing is disabled when this is None.

        Returns:
            Logits stacked along dim 1: ``(batch, max_len, vocab_size)``.
        """
        device = cond_seq.device
        batch_size = cond_seq.size(0)

        # --- Encode condition sequence ---
        cond_emb = self.embed(cond_seq)
        _, (h, c) = self.cond_encoder(cond_emb)
        # Concatenate the last two entries of the final hidden state (the two directions)
        cond_vec = torch.cat((h[-2], h[-1]), dim=1)

        combined = torch.cat((cond_vec, noise), dim=1)
        h0_single = torch.tanh(self.cond_fc(combined))

        # Every decoder layer starts from the same projected state; the cell state starts at zero
        h0 = h0_single.unsqueeze(0).repeat(self.num_layers, 1, 1)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim, device=device)

        # --- Start token ---
        start_token_idx = torch.full((batch_size, 1), 1, dtype=torch.long, device=device) # <START>

        # We will feed the *embedding* in the loop, not the index
        next_input_emb = self.embed(start_token_idx)
        h, c = h0, c0

        outputs = []

        for t in range(self.max_len):
            # Use the embedding from the *previous* step
            out, (h, c) = self.lstm(next_input_emb, (h, c))
            logits = self.fc_out(out[:, -1, :]) # Get logits [B, V]
            outputs.append(logits)

            # Decide on the *next* input
            # (the random draw is skipped entirely when no targets are given)
            use_teacher_force = targets is not None and torch.rand(1).item() < teacher_forcing_ratio

            if use_teacher_force:
                # Teacher Forcing: feed the ground-truth token for this step
                next_token_idx = targets[:, t].unsqueeze(1)
                next_input_emb = self.embed(next_token_idx)
            else:
                # Get soft probabilities and feed the probability-weighted mix of embeddings,
                # which keeps the step differentiable.
                # NOTE(review): gumbel_softmax samples noise on every call, so decoding is
                # stochastic even under eval()/no_grad — confirm this is intended at inference.
                soft_probs = F.gumbel_softmax(logits, tau=1.0, hard=False, dim=1) 
                next_input_emb = torch.matmul(soft_probs, self.embed.weight).unsqueeze(1) 

        logits_stack = torch.stack(outputs, dim=1) 
        return logits_stack


# Discriminator class acts as the adversary to the Generator, trained to distinguish between real (authentic product descriptions) and fake (Generator-produced) text sequences.

class Discriminator(nn.Module):
    """Bidirectional-LSTM classifier scoring a sequence as real or generated.

    Accepts either token ids (2-D) or per-step distributions over the
    vocabulary (3-D, e.g. one-hot / Gumbel-softmax output) and returns one
    raw logit per sequence.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, embedding_matrix=None, freeze_embeddings=False):
        """Build the embedding, encoder and classification head.

        ``freeze_embeddings`` is only honoured when ``embedding_matrix`` is given.
        """
        super().__init__()
        half_hidden = hidden_dim // 2

        self.embed = nn.Embedding(vocab_size, embed_dim)
        if embedding_matrix is not None:
            self.embed.weight.data.copy_(embedding_matrix)
            self.embed.weight.requires_grad = not freeze_embeddings

        # Single-layer bidirectional encoder: half_hidden units per direction
        self.lstm = nn.LSTM(
            embed_dim,
            half_hidden,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )

        # Classification head; layer order fixes the state_dict keys (fc.0 ... fc.4)
        head_layers = [
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, half_hidden),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Dropout(0.3),
            nn.Linear(half_hidden, 1),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, seq):
        """Return a ``(batch, 1)`` tensor of logits.

        Raises:
            ValueError: if ``seq`` is neither 2-D (ids) nor 3-D (distributions).
        """
        rank = seq.dim()
        if rank not in (2, 3):
            raise ValueError(f"Discriminator input has wrong dimension: {seq.dim()}")

        if rank == 2:
            emb = self.embed(seq)  # integer ids -> embedding lookup
        else:
            emb = torch.matmul(seq, self.embed.weight)  # distributions -> weighted embeddings

        _, (final_hidden, _) = self.lstm(emb)
        both_directions = torch.cat((final_hidden[-2], final_hidden[-1]), dim=1)
        return self.fc(both_directions)


# Training Loop of GAN

def train_gan(generator, discriminator, dataloader, num_epochs, vocab_size, cond_dim, preprocessor, lr_G=2e-4, lr_D=1e-4):
    """Adversarially train ``generator`` against ``discriminator``.

    For every batch the discriminator takes one update (real descriptions vs.
    detached generator samples), then the generator takes ``gen_steps`` (2)
    updates. After each epoch the average losses are printed, both models are
    checkpointed when the average generator loss reaches a new minimum, and
    one sample description is decoded and printed.

    Args:
        generator: Called as ``generator(noise, input_seq)``; its output is
            passed to ``F.gumbel_softmax(..., dim=2)``.
        discriminator: Called on real id tensors and on one-hot samples.
        dataloader: Yields ``(input_seq, real_desc)`` batches.
        num_epochs: Number of passes over ``dataloader``.
        vocab_size: Not referenced in the body.
        cond_dim: Width of the noise vectors drawn with ``torch.randn``.
        preprocessor: Provides ``decode_text`` for the per-epoch sample.
        lr_G: Adam learning rate for the generator.
        lr_D: Adam learning rate for the discriminator.

    Returns:
        None. Side effects: trains both models in place and writes
        ``best_generator.pth`` / ``best_discriminator.pth`` to the working
        directory.
    """

    device = next(generator.parameters()).device
    criterion = nn.BCEWithLogitsLoss()  # Handles sigmoid internally


    optim_G = torch.optim.Adam(generator.parameters(), lr=lr_G, betas=(0.5, 0.999)) #  Faster
    optim_D = torch.optim.Adam(discriminator.parameters(), lr=lr_D, betas=(0.5, 0.999)) # Slower

    best_g_loss = float('inf')  # Track best generator loss
    gen_steps = 2  #  train generator more often

    for epoch in range(num_epochs):
        generator.train()
        discriminator.train()

        G_loss_total, D_loss_total = 0.0, 0.0  # reset every epoch

        for i, (input_seq, real_desc) in enumerate(dataloader):
            input_seq = input_seq.to(device)
            real_desc = real_desc.to(device)
            batch_size = input_seq.size(0)

            # Label smoothing: real targets are 0.9 instead of 1.0, fake targets stay 0.0
            real_labels = torch.full((batch_size, 1), 0.9, device=device)
            fake_labels = torch.full((batch_size, 1), 0.0, device=device)

            #  Train Discriminator 
            optim_D.zero_grad()

            # Real samples
            real_output = discriminator(real_desc)
            loss_real = criterion(real_output, real_labels)

            # Generate fake samples
            noise = torch.randn(batch_size, cond_dim, device=device)
            with torch.no_grad(): # Don't need G grads here
                fake_logits = generator(noise, input_seq)

            # hard=True yields one-hot samples, which the discriminator consumes via its 3-D path
            fake_one_hot = F.gumbel_softmax(fake_logits, tau=1.0, hard=True, dim=2)

            # Fake samples (detached, as we're training D)
            fake_output = discriminator(fake_one_hot.detach())
            loss_fake = criterion(fake_output, fake_labels)

            # Total discriminator loss
            loss_D = (loss_real + loss_fake) / 2
            loss_D.backward()
            torch.nn.utils.clip_grad_norm_(discriminator.parameters(), 5.0)  # gradient clipping
            optim_D.step()

            # Train Generator 
            for _ in range(gen_steps):
                optim_G.zero_grad()
                noise = torch.randn(batch_size, cond_dim, device=device)

                # Generate fake logits
                fake_logits = generator(noise, input_seq)

                # Use Gumbel-Softmax for a differentiable discrete sample.
                # We need gradients to flow back, so no .detach()
                fake_one_hot = F.gumbel_softmax(fake_logits, tau=1.0, hard=True, dim=2)

                # Get discriminator's opinion
                fake_output = discriminator(fake_one_hot)

                # Calculate loss (how well G fooled D)
                loss_G = criterion(fake_output, real_labels) # G wants D to think these are real
                loss_G.backward()
                torch.nn.utils.clip_grad_norm_(generator.parameters(), 5.0)  # clip gradients
                optim_G.step()

            # Accumulate for logging (loss_G here is the loss of the last generator step only)
            G_loss_total += loss_G.item()
            D_loss_total += loss_D.item()

            # Optional: print every 100 batches
            if (i + 1) % 100 == 0:
                print(f"   [Batch {i+1}/{len(dataloader)}] D Loss: {loss_D.item():.4f} | G Loss: {loss_G.item():.4f}")

        # Save best generator & discriminator model ---
        # NOTE(review): divides by len(dataloader); an empty dataloader raises ZeroDivisionError here.
        avg_G = G_loss_total / len(dataloader)
        avg_D = D_loss_total / len(dataloader)

        # "Best" means the lowest average generator loss seen so far
        if avg_G < best_g_loss:
            best_g_loss = avg_G
            torch.save(generator.state_dict(), "best_generator.pth")
            torch.save(discriminator.state_dict(), "best_discriminator.pth")
            # Print average loss
            print(f"Saved Best Models at Epoch {epoch+1} (G Loss: {avg_G:.4f})")

        # Epoch summary 
        print(f"Epoch [{epoch+1}/{num_epochs}] | D Loss: {avg_D:.4f} | G Loss: {avg_G:.4f}")

        # Generate a sample after each epoch 
        generator.eval()
        with torch.no_grad():
            try:
                # Get a sample from the dataloader (use first batch)
                sample_input, _ = next(iter(dataloader))
                sample_input = sample_input.to(device)
                noise = torch.randn(1, cond_dim, device=device) # Use 1 sample

                fake_logits = generator(noise, sample_input[:1])
                fake_ids = torch.argmax(fake_logits, dim=2)

                generated_text = preprocessor.decode_text(fake_ids[0].tolist())
                print(f"Example generated description after Epoch {epoch+1}:")
                print(generated_text)
            except Exception as e:
                # Sampling is best-effort: a failure is reported but does not stop training
                print(f"Could not generate sample: {e}")
        generator.train()


# Generate Descriptions After Training

def generate_description(generator, preprocessor, input_seq, cond_dim, device, max_len=50):
    """
    Generate a product description from an input sequence using the trained Generator.

    Args:
        generator: Trained generator, called as ``generator(noise, input_seq)``;
            it is switched to eval mode and left there.
        preprocessor: Object whose ``decode_text(ids)`` turns ids into text.
        input_seq: Condition token ids: a tensor or a plain list of ids,
            either 1-D (one sequence) or 2-D (batch-first).
        cond_dim: Width of the noise vector fed to the generator.
        device: Device the ids and the noise are placed on.
        max_len: Maximum number of words kept in the returned text.

    Returns:
        The decoded description of the FIRST sequence in the batch, truncated
        to ``max_len`` whitespace-separated words.
    """
    generator.eval()
    with torch.no_grad():
        # Accept lists / tensors on another device, and a single un-batched sequence
        input_seq = torch.as_tensor(input_seq, dtype=torch.long, device=device)
        if input_seq.dim() == 1:
            input_seq = input_seq.unsqueeze(0)

        # One noise row per input sequence (was hard-coded to a batch of 1)
        noise = torch.randn(input_seq.size(0), cond_dim, device=device)

        fake_logits = generator(noise, input_seq)

        # We use argmax here because we are in inference, no gradients needed
        fake_ids = torch.argmax(fake_logits, dim=2)[0].tolist()

        # Decode and truncate
        generated_text = preprocessor.decode_text(fake_ids)
        return " ".join(generated_text.split()[:max_len])  # Truncate to max_len words

In [None]:
### Download Data From Github

zip_url = 'https://raw.githubusercontent.com/Eng-Shady-Hub/Generative_AI_Project_Round3/refs/heads/main/All_Datasets2.zip'
# Without a timeout requests waits indefinitely, so a stalled connection would
# hang the whole notebook run; fail after 60 s of no response instead.
response = requests.get(zip_url, timeout=60)
response.raise_for_status()  # stop here on any HTTP error status

In [None]:
dataframes = {}

# Open the ZIP file from memory
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    # Collect only CSV files
    csv_files = [member for member in z.namelist() if member.lower().endswith(".csv")]

    if not csv_files:
        print("No CSV files found in the ZIP.")

    # Frames are keyed df1, df2, ... in archive order
    for i, file_name in enumerate(csv_files, start=1):
        key = f"df{i}"
        try:
            with z.open(file_name) as f:
                # Try UTF-8 first; fall back to latin1 if decoding fails
                try:
                    frame = pd.read_csv(f, encoding='utf-8')
                except UnicodeDecodeError:
                    f.seek(0)
                    frame = pd.read_csv(f, encoding='latin1')

                dataframes[key] = frame
                print(f'DataFrame "{key}" created from file: {file_name} (shape: {dataframes[key].shape})')

        except Exception as e:
            # A file that cannot be read is reported and skipped; its key stays absent
            print(f"Error reading {file_name}: {e}")

In [None]:
# DataFrame 1: the frame stored under key "df1"; print its overview
basket_data = dataframes["df1"]
data_details(basket_data)

In [None]:
# target   = [description]
# features = [product_name, brand, category, sub_category]

# Keep only the needed columns and rename "product" to the shared name "product_name"
basket_data = (
    basket_data
    .loc[:, ["product", "brand", "category", "sub_category", "description"]]
    .rename(columns={"product": "product_name"})
)

print(basket_data.columns)

In [None]:
# There are null values in the product_name, brand and description columns;
# drop every row missing any of them, then print the overview again

basket_data=basket_data.dropna(subset=["description","product_name","brand"])
data_details(basket_data)

In [None]:
# DataFrame 2: the frame stored under key "df2"; print its overview

adidas_data = dataframes["df2"]
data_details(adidas_data)

In [None]:
# Unify column names with the other datasets
adidas_column_names = {"Product Name": "product_name", "Brand": "brand", "Description": "description"}
adidas_data = adidas_data[list(adidas_column_names)].rename(columns=adidas_column_names)
adidas_data.info()

# The description column has a few null values: drop those rows,
# together with rows whose description is the placeholder text 'No description'

adidas_data = adidas_data.dropna(subset=["description"])
adidas_data = adidas_data.loc[adidas_data["description"].ne("No description")]
adidas_data.isnull().sum()

In [None]:
# This dataset has good descriptions for our model but has no category & sub_category columns,
# so we derive category & sub_category from keywords found in the product_name column

category_map = {
    # category = footwear
  "shoe": ("footwear", "shoes"),"sneaker": ("footwear", "shoes"),"running": ("footwear", "running shoes"),"trainer": ("footwear", "trainers"),"cleat": ("footwear", "cleats"),
    "slipper": ("footwear", "slippers"),"flip flop": ("footwear", "flip flops"),"jordan": ("footwear", "basketball shoes"),"retro": ("footwear", "shoes"),
    "phantom": ("footwear", "cleats"),"venom": ("footwear", "cleats"),"mercurial": ("footwear", "soccer shoes"),"superfly": ("footwear", "soccer shoes"),
    "tf": ("footwear", "turf soccer shoes"),"air max": ("footwear", "sneakers"),"p-6000": ("footwear", "running shoes"),"sandal": ("footwear", "sandals"),
    "slide": ("footwear", "slides"),"adilette": ("footwear", "slides"),"flipflop": ("footwear", "flip flops"),"sb": ("footwear", "skate shoes"),"skate": ("footwear", "skate shoes"),
    "chron": ("footwear", "skate shoes"),"kd": ("footwear", "basketball shoes"),"kyrie": ("footwear", "basketball shoes"),"iconclash": ("footwear", "running shoes"),
    "daybreak": ("footwear", "sneakers"),"blazer": ("footwear", "sneakers"),"prelove": ("footwear", "sneakers"),"pegasus": ("footwear", "running shoes"),
    "vaporfly": ("footwear", "running shoes"),"zoomx": ("footwear", "running shoes"),"slipon": ("footwear", "slip-ons"),"airforce": ("footwear", "sneakers"),
    "airmax": ("footwear", "sneakers"),"metcon": ("footwear", "training shoes"),"court": ("footwear", "tennis shoes"),"pg": ("footwear", "basketball shoes"),
    "m2k": ("footwear", "sneakers"),"winflo": ("footwear", "running shoes"),"vomero": ("footwear", "running shoes"),"vapormax": ("footwear", "lifestyle sneakers"),
    "flip-flop": ("footwear", "flip flops"),"flip-flops": ("footwear", "flip flops"),"slip-on": ("footwear", "slip-ons"), "slip-ons": ("footwear", "slip-ons"),
    "odyssey react": ("footwear", "running shoes"),"legend react": ("footwear", "running shoes"),"pre-love": ("footwear", "sneakers"),"air force": ("footwear", "sneakers"),
    "drop-type": ("footwear", "running shoes"),"zoom rival fly": ("footwear", "running shoes"),"mx-720-818": ("footwear", "running shoes"),"tanjun": ("footwear", "running shoes"),
    "superstar": ("footwear", "sneakers"),"slip on": ("footwear", "slip-ons"),"lebron soldier": ("footwear", "basketball shoes"),"react element": ("footwear", "running shoes"),
    "free rn": ("footwear", "running shoes"),"zoom fly": ("footwear", "running shoes"),"zoom rise": ("footwear", "running shoes"),"tiempo legend": ("footwear", "soccer shoes"),
    "flex rn": ("footwear", "running shoes"),"air zoom structure": ("footwear", "running shoes"),"sfb gen 2": ("footwear", "boots"),"air huarache": ("footwear", "sneakers"),
    "wildhorse": ("footwear", "running shoes"),"benassi": ("footwear", "slides"),"terra kiger": ("footwear", "running shoes"),"classic cortez": ("footwear", "sneakers"),
    "renew run": ("footwear", "running shoes"),"free tr": ("footwear", "training shoes"),"lebron": ("footwear", "basketball shoes"),"mowabb": ("footwear", "sneakers"),
    "revolution": ("footwear", "running shoes"),"precision": ("footwear", "basketball shoes"),"shox": ("footwear", "running shoes"),"potential": ("footwear", "basketball shoes"),
    "epic react": ("footwear", "running shoes"), "react city": ("footwear", "running shoes"),"kawa": ("footwear", "slides"),"joyride run": ("footwear", "running shoes"),
    "joyride optik": ("footwear", "running shoes"),"flex contact": ("footwear", "running shoes"),"football": ("footwear", "Football Shoes"),"predator": ("footwear", "Football Shoes"),
    "vandalised": ("footwear", "Casual Shoes"),"canyon": ("footwear", "Casual Shoes"),"react": ("footwear", "Running Shoes"),"acg": ("footwear", "Outdoor Shoes"),
    "flex": ("footwear", "Training Shoes"),"signal": ("footwear", "Running Shoes"),"joyride": ("footwear", "Running Shoes"),"cortez": ("footwear", "Casual Shoes"),
    "hawkins": ("footwear", "Casual Shoes"),"nemeziz": ("footwear", "Football Shoes"),"indoor": ("footwear", "Indoor Shoes"),"outdoor": ("footwear", "Outdoor Shoes"),
    "trail": ("footwear", "Outdoor Shoes"),"superrep": ("footwear", "Training Shoes"),"zoom": ("footwear", "Running Shoes"),"tr": ("footwear", "Training Shoes"),
    "renew": ("footwear", "Running Shoes"),"ghost": ("footwear", "Running Shoes"),"racer": ("footwear", "Running Shoes"),"alphadunk": ("footwear", "Basketball Shoes"),
    "monarch": ("footwear", "Walking Shoes"),"af-1": ("footwear", "Casual Shoes"),"bella": ("footwear", "Casual Shoes"), "huarache": ("footwear", "Lifestyle Shoes"),
    "solarsoft": ("footwear", "Training Shoes"),"exp-x14": ("footwear", "Running Shoes"),"fly.by": ("footwear", "Basketball Shoes"),"xarr": ("footwear", "Training Shoes"),
    "skarn": ("footwear", "Casual Shoes"),"tailwind": ("footwear", "Running Shoes"), "air dsvm": ("footwear", "Running Shoes"),
    # category = accessories
    "sock": ("accessories", "socks"), "cap": ("accessories", "cap"),"hat": ("accessories", "cap"),"bag": ("accessories", "bag"),"backpack": ("accessories", "bag"),
    "watch": ("accessories", "watch")
    }

def categorize_product(name):
    """Map a product name to a ``(category, sub_category)`` pair.

    The lowercased name is checked against the keys of ``category_map`` in
    dictionary order; the first keyword found wins. A keyword only counts
    when it starts a word (it is not preceded by a letter), so short
    keywords such as "tr", "cap" or "hat" no longer match inside unrelated
    words like "ultraboost", "escape" or "that". Suffixes are still allowed,
    so "shoe" matches "shoes".

    Args:
        name: Product name; any value is converted with ``str()``.

    Returns:
        The mapped ``(category, sub_category)`` tuple, or
        ``("Other", "Other")`` when no keyword matches.
    """
    name = str(name).lower()
    for keyword, (cat, subcat) in category_map.items():
        # (?<![a-z]) rejects matches that begin in the middle of a word
        if re.search(r"(?<![a-z])" + re.escape(keyword), name):
            return cat, subcat
    return "Other", "Other"  # fallback if no keyword found

# Categorise every product once, then attach both columns in a single assign().
# This avoids building a pd.Series per row, and rebinding to a new frame avoids
# writing into the filtered adidas_data slice (SettingWithCopyWarning).
_adidas_categories = [categorize_product(product) for product in adidas_data["product_name"]]
adidas_data = adidas_data.assign(
    category=[category for category, _ in _adidas_categories],
    sub_category=[sub_category for _, sub_category in _adidas_categories],
)

In [None]:
# Arrange the columns in the same order as in the other datasets,
# then print the overview with the first 20 rows

adidas_data = adidas_data[["product_name", "brand","category", "sub_category", "description"]]
print(adidas_data.columns)
data_details(adidas_data , n=20)

In [None]:
# DataFrame 3: the frame stored under key "df3"; print its overview

amazon_data =dataframes["df3"]
data_details(amazon_data)

In [None]:
# There are no null values in the features we need from this dataset.
# The Amazon dataset has no brand column; the first word of product_name is the brand,
# so we create a function that derives the brand column from the product_name column

def map_brand(name):
    return name.split()[0]

# Derive the brand of every Amazon product from its name.
amazon_data["brand"] = amazon_data["product_name"].apply(map_brand)

# The Amazon dataset has no sub_category column, but its category values are
# "|"-separated paths: take the last (most specific) segment as sub_category.
amazon_data["sub_category"] = amazon_data["category"].apply(
    lambda category_path: category_path.rsplit("|", 1)[-1]
)

amazon_data.head()

In [None]:
# Keep only the shared columns (same order as the other datasets) and expose
# the Amazon "about_product" text under the common name "description".
amazon_data = amazon_data[
    ["product_name", "brand", "category", "sub_category", "about_product"]
].rename(columns={"about_product": "description"})
print(amazon_data.columns)
data_details(amazon_data)

In [None]:
# DataFrame 4: the Flipkart dataset.

flipkart_data = dataframes["df4"]
data_details(flipkart_data)

In [None]:
# There are nulls in the description and brand columns.
# Drop the rows whose description is missing or is the "No description" placeholder.

flipkart_data = flipkart_data.dropna(subset=["description"]).loc[
    lambda frame: frame["description"] != "No description"
]
flipkart_data.isnull().sum()



In [None]:
# Find and show the most common brand; it is used to fill the missing brands.

most_common = flipkart_data["brand"].mode()[0]
print(most_common)

In [None]:
# Fill the missing brand values with the most common brand of the dataset
# (`most_common`, computed in the previous cell; "REEB" when this was written).
#
# The result is assigned back instead of calling
# `flipkart_data['brand'].fillna(..., inplace=True)`: that form modifies an
# intermediate column object, triggers chained-assignment warnings and leaves
# the DataFrame unchanged when pandas Copy-on-Write is active.

flipkart_data = flipkart_data.assign(brand=flipkart_data["brand"].fillna(most_common))

In [None]:
# Keep only the shared columns (same order as the other datasets) and expose
# the Flipkart "title" under the common name "product_name".
flipkart_data = flipkart_data[
    ["title", "brand", "category", "sub_category", "description"]
].rename(columns={"title": "product_name"})
print(flipkart_data.columns)
flipkart_data.isnull().sum()

In [None]:
# DataFrame 5: the second Adidas dataset.

adidas2_data = dataframes["df5"]
data_details(adidas2_data)

In [None]:
# Author's note: the adidas2 dataset has no null values.
# Its "breadcrumbs" column holds the sub-category, so it is renamed accordingly.

adidas2_data = adidas2_data[
    ["name", "brand", "category", "breadcrumbs", "description"]
].rename(columns={"name": "product_name", "breadcrumbs": "sub_category"})
print(adidas2_data.columns)
data_details(adidas2_data)

In [None]:
# DataFrame 6: the electronics dataset.

elec_data = dataframes["df6"]
data_details(elec_data)


In [None]:
# Author's note: the elec_data dataset is already clean; only the product-name
# column needs renaming to match the other datasets.

elec_data = elec_data.rename(
    columns={"Product_name": "product_name"},
)
print(elec_data.columns)

In [None]:
# Print the current working directory of the kernel process.
print(os.getcwd())


In [None]:
# DataFrame 7: the second BigBasket dataset.

Bigbasket2 = dataframes["df7"]
data_details(Bigbasket2)

In [None]:
# Keep only the five columns that correspond to the shared schema.
Bigbasket2 = Bigbasket2[
    ["SKU Name", "Brand", "Category", "Sub-Category", "About the Product"]
]
data_details(Bigbasket2)

In [None]:
# Rename the columns to the shared schema, then drop the rows whose description
# is missing or is the "No description" placeholder.

Bigbasket2 = (
    Bigbasket2.rename(
        columns={
            "SKU Name": "product_name",
            "Brand": "brand",
            "Category": "category",
            "Sub-Category": "sub_category",
            "About the Product": "description",
        }
    )
    .dropna(subset=["description"])
    .loc[lambda frame: frame["description"] != "No description"]
)
Bigbasket2.isnull().sum()

In [None]:
# Author's note: the brand column has only 3 null values, so those rows are
# simply removed.

Bigbasket2 = Bigbasket2[Bigbasket2["brand"].notna()]

In [None]:
# Show the summary of Bigbasket2 after the cleaning steps above.
data_details(Bigbasket2)

In [None]:
# Stack all prepared datasets into a single frame with a fresh 0..n-1 index.
# NOTE(review): elec_data is only renamed above, not reduced to the five shared
# columns; confirm it carries no extra columns, otherwise they end up in
# Final_data filled with NaN for the other datasets.

Final_data = pd.concat(
    [
        basket_data,
        adidas_data,
        amazon_data,
        adidas2_data,
        flipkart_data,
        elec_data,
        Bigbasket2,
    ],
    ignore_index=True,
)

In [None]:
# Show the summary of the combined dataset.
data_details(Final_data)

In [None]:
# Preprocess the combined data and split it; the result is a dict read below
# through the keys "train", "val", "test", "vocab_size" and "max_len".
preproc = TextGANPreprocessor()
data_splits = preprocess_and_split_data(Final_data, preproc, max_len_target=200)

# Wrap each split in a DataLoader (only the training split is shuffled).
# Using a small batch size for demonstration.
BATCH_SIZE = 16
train_loader = DataLoader(
    data_splits["train"],
    batch_size=BATCH_SIZE,
    shuffle=True,
)
val_loader = DataLoader(data_splits["val"], batch_size=BATCH_SIZE)
test_loader = DataLoader(data_splits["test"], batch_size=BATCH_SIZE)

vocab_size = data_splits["vocab_size"]
max_len = data_splits["max_len"]

# One print call, one line per item: the output matches separate print calls.
print(
    "\n--- Model Hyperparameters ---",
    f"Device: {device}",
    f"Vocab Size = {vocab_size}",
    f"Max Sequence Length = {max_len}",
    f"Batch Size = {BATCH_SIZE}",
    "-----------------------------",
    sep="\n",
)


# Train Word2Vec on the cleaned data, using the same dimensionality as the
# embedding matrix built below.
EMBED_DIM = 128
w2v_model = train_word2vec(
    data_splits["clean_df"],
    vector_size=EMBED_DIM,
    window=5,
    min_count=1,
    epochs=10,
)

# Build the embedding matrix: every row starts as standard-normal noise and is
# overwritten with the Word2Vec vector when the word is known to the model.
print("Building embedding matrix...")
embedding_matrix = np.random.randn(vocab_size, EMBED_DIM)
for word, idx in preproc.word2idx.items():
    if word not in w2v_model.wv:
        continue  # keep the random row for words Word2Vec has no vector for
    embedding_matrix[idx] = w2v_model.wv[word]
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float)
print("Embedding matrix built.")

# Model sizes shared by the generator, the discriminator and the training call.
HIDDEN_DIM = 256
COND_DIM = 128

# Build both networks from the same embedding matrix and move them to `device`.
print("Initializing models...")
generator = Generator(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    cond_dim=COND_DIM,
    max_len=max_len,
    embedding_matrix=embedding_matrix,
).to(device)

discriminator = Discriminator(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    embedding_matrix=embedding_matrix,
).to(device)
print("Models initialized.")

# Run the adversarial training loop on the training DataLoader.
print("\n--- Starting GAN Training ---")
train_gan(
    generator=generator,
    discriminator=discriminator,
    dataloader=train_loader,
    num_epochs=20,
    vocab_size=vocab_size,
    cond_dim=COND_DIM,
    preprocessor=preproc,
)
print("--- GAN Training Complete ---")

# Generate one description for a sample taken from the validation set.
print("\nGenerating a sample description from the validation set...")

# Load the saved generator checkpoint if one exists in the working directory
# (presumably written by train_gan; confirm), otherwise keep the weights
# left by the last training epoch.
try:
    generator.load_state_dict(torch.load("best_generator.pth", map_location=device))
    print("Loaded best generator weights.")
except FileNotFoundError:
    print("No saved generator found. Using last epoch's weights.")

# Switch to inference mode so that dropout / batch-norm layers (if the
# generator has any) are disabled while generating. This is a no-op if
# generate_description already does it.
generator.eval()

# Take the first input sequence of the first validation batch and keep a
# leading batch dimension of size 1.
sample_input, _ = next(iter(val_loader))
sample_input_single = sample_input[0].unsqueeze(0).to(device)

generated_description = generate_description(
    generator=generator,
    preprocessor=preproc,
    input_seq=sample_input_single,
    cond_dim=COND_DIM,
    device=device,
    max_len=max_len,  # Use max_len for output truncation
)

original_input_text = preproc.decode_text(sample_input_single[0].tolist())
print(f"\n Source Input Text:\n{original_input_text}")
print(f"\n Generated Description:\n{generated_description}")

