In [1]:
import pandas as pd
import os
import zipfile
from pathlib import Path
from google.colab import files, drive
from sentence_transformers import SentenceTransformer

In [2]:
# --- Configuration ---
# Kaggle competition slug; also used as the base name of the cached ZIP archive.
COMPETITION_NAME = "h-and-m-personalized-fashion-recommendations"

# This is where we want the data to be available for our Spark code
# (local Colab disk).
LOCAL_DATA_DIR = Path("/content/hm_data")

# This is where the persistent data lives in your Google Drive
# NOTE: Ensure this path matches where you saved it last time!
DRIVE_MOUNT_POINT = "/content/drive"
DRIVE_PROJECT_PATH = Path(DRIVE_MOUNT_POINT) / "MyDrive/ScaleStyle_Project/data"

# Full path of the dataset archive inside Drive.
ZIP_FILE_PATH = DRIVE_PROJECT_PATH / f"{COMPETITION_NAME}.zip"

In [3]:
def setup_data_from_drive():
    """
    Mount Google Drive and stage the dataset on the local Colab disk.

    Steps:
      1. Mount Drive at DRIVE_MOUNT_POINT unless it is already mounted.
      2. Check that the dataset archive ZIP_FILE_PATH exists in Drive.
      3. Extract the required CSVs into LOCAL_DATA_DIR, unless all of them
         are already present there.

    Returns:
        None. Progress is reported with print(). If the archive is missing,
        an error is printed and the function returns without extracting.

    Raises:
        subprocess.CalledProcessError: if `unzip` exits with a non-zero status.
        FileNotFoundError: if the `unzip` executable is not available.
    """
    import subprocess  # local import: only needed by this function

    required_csvs = ("articles.csv", "customers.csv", "transactions_train.csv")

    # 1. Mount Google Drive
    # The mount-point directory can exist without Drive being attached, so
    # look for the "MyDrive" folder (which DRIVE_PROJECT_PATH relies on)
    # instead of the bare directory.
    if not os.path.isdir(os.path.join(DRIVE_MOUNT_POINT, "MyDrive")):
        print("üîπ Mounting Google Drive...")
        drive.mount(DRIVE_MOUNT_POINT)
    else:
        print("‚úÖ Google Drive is already mounted.")

    # 2. Check for the dataset in Drive
    if not ZIP_FILE_PATH.exists():
        print(f"‚ùå Error: Dataset not found at {ZIP_FILE_PATH}")
        print("   Did you save it to a different folder last time?")
        print("   If this is a fresh start, you may need to run the download script once.")
        return

    print(f"‚úÖ Found cached dataset in Drive: {ZIP_FILE_PATH}")

    # 3. Extract to local environment (Faster IO than reading from Drive directly)
    # Extract whenever any required CSV is missing. Checking only that the
    # directory exists would skip extraction forever after an interrupted run.
    missing_csvs = [
        name for name in required_csvs if not (LOCAL_DATA_DIR / name).is_file()
    ]
    if missing_csvs:
        print("üîπ Extracting data to local Colab environment (this improves speed)...")
        LOCAL_DATA_DIR.mkdir(parents=True, exist_ok=True)

        # Extract specific CSVs to save space/time. An argument list avoids
        # shell quoting issues, and check=True makes a failed unzip raise
        # instead of being reported as success below.
        subprocess.run(
            [
                "unzip", "-q", "-o", str(ZIP_FILE_PATH),
                *required_csvs,
                "-d", str(LOCAL_DATA_DIR),
            ],
            check=True,
        )

        print(f"‚úÖ Extraction complete! Data is ready at: {LOCAL_DATA_DIR}")
    else:
        print(f"‚úÖ Data already extracted at {LOCAL_DATA_DIR}. Skipping extraction.")

    print(f"üìÇ Available files: {os.listdir(LOCAL_DATA_DIR)}")


# --- Execution ---
if __name__ == "__main__":
    setup_data_from_drive()

‚úÖ Google Drive is already mounted.
‚úÖ Found cached dataset in Drive: /content/drive/MyDrive/ScaleStyle_Project/data/h-and-m-personalized-fashion-recommendations.zip
‚úÖ Data already extracted at /content/hm_data. Skipping extraction.
üìÇ Available files: ['customers.csv', 'articles.csv', 'transactions_train.csv']


In [4]:
from google.colab import userdata

# 1. Securely retrieve Token (will not be displayed on screen)
try:
    token = userdata.get("GITHUB_TOKEN")
    print("‚úÖ Token retrieved successfully")
except Exception:
    print("‚ùå Token not found. Please check the Secrets panel settings on the left")

# 2. Configure repository information
username = "EthanGaoZhiyuan"
repo = "ScaleStyle"

# 3. Construct HTTPS URL with Token (Token used for authentication)
# Format: https://token@github.com/username/repo.git
clone_url = f"https://{token}@github.com/{username}/{repo}.git"

# 4. Execute clone command
# Use the -b flag to directly clone a specific branch
branch = "feat/phase2-week2-reranker-pipeline"
!git clone -b {branch} {clone_url}

# 5. Verification
if os.path.exists(repo):
    print(f"\nüéâ Code downloaded to: {repo}")
    !ls {repo}
else:
    print("\n‚ùå Clone failed. Please check Token permissions or repository existence")

‚úÖ Token retrieved successfully
fatal: destination path 'ScaleStyle' already exists and is not an empty directory.

üéâ Code downloaded to: ScaleStyle
data-pipeline	    docs	     inference-service	README.md
docker-compose.yml  gateway-service  infrastructure


In [5]:
import sys
import os

# 1. Change directory to the data-pipeline folder to access requirements.txt
# This ensures pip installs the exact versions specified in your repo.
repo_path = "/content/ScaleStyle/data-pipeline"
os.chdir(repo_path)

print(f"üìÇ Current Working Directory: {os.getcwd()}")

# 2. Install dependencies
# Using -q to keep the output clean.
print("‚¨áÔ∏è Installing dependencies from requirements.txt...")
!pip install -q -r requirements.txt

# 3. Add the project root to system path
# This allows Python to recognize 'src' as a module so we can do:
# "from src.feature_engineering import ..."
if repo_path not in sys.path:
    sys.path.append(repo_path)
    print(f"‚úÖ Added {repo_path} to system path.")

print("‚úÖ Environment setup complete.")

üìÇ Current Working Directory: /content/ScaleStyle/data-pipeline
‚¨áÔ∏è Installing dependencies from requirements.txt...
‚úÖ Added /content/ScaleStyle/data-pipeline to system path.
‚úÖ Environment setup complete.


# GENERATE EMBEDDINGS WITH BAAI/BGE-LARGE (INDUSTRY STANDARD)

**MODEL SELECTION RATIONALE:**
I switched to **"BAAI/bge-large-en-v1.5"** for the following reasons, targeting a more production-ready architecture:

1.  **Industry Standard (Efficiency vs. Performance)**: Unlike the 7B-parameter Qwen2, BGE-Large has only **~335M parameters**. This represents the "Gold Standard" in the industry, allowing for ultra-low-latency inference while maintaining top-tier retrieval quality.
2.  **MTEB Leaderboard**: It consistently ranks high on the Massive Text Embedding Benchmark (MTEB), specifically for English retrieval tasks.
3.  **Encoder Architecture (BERT-based)**: This uses a standard BERT-like architecture with a **512-token limit**, which is sufficient for product names and descriptions. It uses `[CLS]` pooling, and its bidirectional encoder design is structurally better suited to vector search than decoder-based LLMs.
4.  **Hardware Optimization**: Due to its smaller size, we can significantly increase the **Batch Size (e.g., to 256)**, making the data processing pipeline 10x faster than before.

In [6]:
!pip install -U transformers



In [7]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import numpy as np
from tqdm.auto import tqdm

# --- Model / inference settings ---
MODEL_NAME = "BAAI/bge-large-en-v1.5"
BATCH_SIZE = 256  # number of texts sent through the model per step
MAX_LENGTH = 512  # tokenizer truncation limit, in tokens

# --- Input/Output locations (Google Drive) ---
# All processed artefacts share one directory; build the paths from it.
_PROCESSED_DIR = "/content/drive/MyDrive/ScaleStyle_Project/data/processed"
INPUT_PATH = f"{_PROCESSED_DIR}/articles_parquet"
TRANSACTIONS_PATH = f"{_PROCESSED_DIR}/transactions_parquet"
OUTPUT_PATH = f"{_PROCESSED_DIR}/article_embeddings_bge_v2.parquet"

# def last_token_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
#     """
#     Extracts the embedding from the last token.
#     For Causal LLMs (like Qwen) used as embedding models,
#     pooling the last token (EOS) is the standard practice, unlike BERT which uses [CLS].
#     """
#     left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
#     if left_padding:
#         return last_hidden_states[:, -1]
#     else:
#         sequence_lengths = attention_mask.sum(dim=1) - 1
#         batch_size = last_hidden_states.shape[0]
#         # Gather the hidden state corresponding to the last real token
#         return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

def cls_pooling(model_output):
    """
    Pool a model output by selecting the first sequence position.

    This is the BGE/BERT-style pooling, where the first token is [CLS].

    Args:
        model_output: object exposing a `last_hidden_state` attribute that
            supports `[:, 0]` indexing.

    Returns:
        The slice `last_hidden_state[:, 0]`, i.e. position 0 of every row.
    """
    hidden_states = model_output.last_hidden_state
    first_token_states = hidden_states[:, 0]
    return first_token_states

# Embedding pipeline: load the model, attach a mean price to every article,
# build one text per article, embed the texts in batches on the GPU and write
# the result to OUTPUT_PATH.
if __name__ == "__main__":
    print(f"Initializing SOTA model: {MODEL_NAME} on GPU...")

    # 1. Load Model & Tokenizer
    # The model loads with trust_remote_code=False, so no custom remote code
    # is needed; keep it disabled for the tokenizer as well so that nothing
    # downloaded from the Hub is executed.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=False)
    model = AutoModel.from_pretrained(
        MODEL_NAME,
        trust_remote_code=False,
        # Use FP16 to save VRAM and accelerate on A100.
        # NOTE(review): recent transformers releases warn that `torch_dtype`
        # is deprecated in favour of `dtype`; switch once the minimum
        # supported version is settled.
        torch_dtype=torch.float16,
        device_map="cuda"
    )
    model.eval()  # explicit inference mode (no dropout)

    # 2. Load Data
    print(f"Reading transaction prices from: {TRANSACTIONS_PATH}")
    df_tx = pd.read_parquet(TRANSACTIONS_PATH, columns=["article_id", "price"])
    print("transactions rows:", len(df_tx), "cols:", df_tx.columns.tolist())

    # Mean transaction price per article (one row per article_id).
    df_price = df_tx.groupby("article_id", as_index=False)["price"].mean()

    print(f"Reading article data from: {INPUT_PATH}")
    df_articles = pd.read_parquet(INPUT_PATH)
    n_articles = len(df_articles)

    # Left merge: articles without any transaction keep a NaN price.
    df_articles = df_articles.merge(df_price, on="article_id", how="left")
    assert len(df_articles) == n_articles, "price merge changed the number of article rows"

    # 3. Construct Input Text
    # Since I'm embedding documents (products), I use the raw text.
    # Instruction prompts (e.g., "Given a query...") are only needed during the retrieval phase.

    # Preprocessing: string concatenation below would yield NaN for missing values.
    df_articles['prod_name'] = df_articles['prod_name'].fillna("Unknown Product")
    df_articles['detail_desc'] = df_articles['detail_desc'].fillna("")
    df_articles['colour_group_name'] = df_articles['colour_group_name'].fillna("")

    # Concatenate fields to form a rich semantic representation
    raw_texts = (
        "Product: " + df_articles['prod_name'] + ". " +
        "Description: " + df_articles['detail_desc'] + ". " +
        "Attributes: " + df_articles['colour_group_name']
    ).tolist()

    print(f"Total items to process: {len(raw_texts)}")
    if not raw_texts:
        # np.concatenate below would fail with an unhelpful message on an empty list.
        raise ValueError(f"No articles found in {INPUT_PATH}; nothing to embed.")

    # 4. Batch Inference Loop
    all_embeddings = []

    print(f"Start Batch Inference with {MODEL_NAME}...")
    for i in tqdm(range(0, len(raw_texts), BATCH_SIZE)):
        batch_texts = raw_texts[i : i + BATCH_SIZE]

        # Tokenize inputs (pad to the longest text in the batch, cut at MAX_LENGTH)
        batch_dict = tokenizer(
            batch_texts,
            max_length=MAX_LENGTH,
            padding=True,
            truncation=True,
            return_tensors='pt'
        ).to("cuda")

        with torch.no_grad():
            outputs = model(**batch_dict)

            # Extracting
            embeddings = cls_pooling(outputs)

            # Normalizing (unit L2 norm per row)
            embeddings = F.normalize(embeddings, p=2, dim=1)

            # Move to CPU to save GPU memory; cast FP16 -> FP32 for storage
            all_embeddings.append(embeddings.float().cpu().numpy())

    # 5. Save Results
    final_embeddings = np.concatenate(all_embeddings, axis=0)
    assert final_embeddings.shape[0] == len(df_articles), "embedding/article row count mismatch"

    # Save as a list column in the DataFrame (one vector per article row)
    df_articles['embedding'] = list(final_embeddings)

    # Keep the ID, the embedding and the two scalar fields (price, colour) for Milvus ingestion
    output_df = df_articles[['article_id', 'embedding', 'price', 'colour_group_name']]
    output_df.to_parquet(OUTPUT_PATH)

    print(f"Embeddings saved to: {OUTPUT_PATH}")
    print(f"CRITICAL NOTE: The vector dimension is {final_embeddings.shape[1]}.")
    print(f"(Please ensure your Milvus Collection is created with dim={final_embeddings.shape[1]})")

Initializing SOTA model: BAAI/bge-large-en-v1.5 on GPU...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Reading article data from: /content/drive/MyDrive/ScaleStyle_Project/data/processed/articles_parquet
transactions rows: 27306439 cols: ['article_id', 'price']
Total items to process: 105126
Start Batch Inference with Qwen2...


  0%|          | 0/411 [00:00<?, ?it/s]

Embeddings saved to: /content/drive/MyDrive/ScaleStyle_Project/data/processed/article_embeddings_bge_v2.parquet
CRITICAL NOTE: The vector dimension is 1024.
(Please ensure your Milvus Collection is created with dim=1024)


In [8]:
# Sanity check: read back the parquet file written to OUTPUT_PATH and show
# its first rows.
_df = pd.read_parquet(OUTPUT_PATH)
_df.head()

Unnamed: 0,article_id,embedding,price,colour_group_name
0,108775015,"[0.009101868, -0.031799316, 0.010383606, 0.007...",0.008133,Black
1,108775044,"[0.004814148, -0.022735596, 0.012435913, -0.00...",0.008108,White
2,108775051,"[-0.0036678314, -0.016036987, 0.009941101, -0....",0.004993,Off White
3,110065001,"[-0.021087646, -0.050109863, -0.0077171326, -0...",0.019989,Black
4,110065002,"[-0.022323608, -0.0463562, -0.010498047, -0.01...",0.018037,White


In [9]:
# Summary statistics for the numeric columns of the reloaded file.
# A `price` count below the `article_id` count means some articles have no
# price: the price is attached with a left merge, which leaves NaN for
# articles that have no transactions.
_df.describe()

Unnamed: 0,article_id,price
count,105126.0,104131.0
mean,698796200.0,0.028691
std,128496100.0,0.025727
min,108775000.0,0.000424
25%,617333000.0,0.01439
50%,702778000.0,0.022864
75%,796977800.0,0.03369
max,959461000.0,0.50678


In [10]:
# Preview the in-memory articles frame. This relies on `df_articles` still
# being defined in the kernel from the embedding cell.
df_articles.head()

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_name,department_name,...,index_group_name,section_name,garment_group_name,detail_desc,department,index_crossed,color,product,price,embedding
0,108775015,Strap top,Vest top,Garment Upper body,Solid,Black,4,Dark,Black,Jersey Basic,...,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.,Jersey Basic_Jersey Basic,Ladieswear_Ladieswear,Black_Dark_Black,Strap top_Vest top_Garment Upper body,0.008133,"[0.009101868, -0.031799316, 0.010383606, 0.007..."
1,108775044,Strap top,Vest top,Garment Upper body,Solid,White,3,Light,White,Jersey Basic,...,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.,Jersey Basic_Jersey Basic,Ladieswear_Ladieswear,White_Light_White,Strap top_Vest top_Garment Upper body,0.008108,"[0.004814148, -0.022735596, 0.012435913, -0.00..."
2,108775051,Strap top (1),Vest top,Garment Upper body,Stripe,Off White,1,Dusty Light,White,Jersey Basic,...,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.,Jersey Basic_Jersey Basic,Ladieswear_Ladieswear,Off White_Dusty Light_White,Strap top (1)_Vest top_Garment Upper body,0.004993,"[-0.0036678314, -0.016036987, 0.009941101, -0...."
3,110065001,OP T-shirt (Idro),Bra,Underwear,Solid,Black,4,Dark,Black,Clean Lingerie,...,Ladieswear,Womens Lingerie,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...","Clean Lingerie_Under-, Nightwear",Lingeries/Tights_Ladieswear,Black_Dark_Black,OP T-shirt (Idro)_Bra_Underwear,0.019989,"[-0.021087646, -0.050109863, -0.0077171326, -0..."
4,110065002,OP T-shirt (Idro),Bra,Underwear,Solid,White,3,Light,White,Clean Lingerie,...,Ladieswear,Womens Lingerie,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...","Clean Lingerie_Under-, Nightwear",Lingeries/Tights_Ladieswear,White_Light_White,OP T-shirt (Idro)_Bra_Underwear,0.018037,"[-0.022323608, -0.0463562, -0.010498047, -0.01..."


In [13]:
# Second export: same embeddings plus descriptive article fields.
OUTPUT_PATH_DETAIL = "/content/drive/MyDrive/ScaleStyle_Project/data/processed/article_embeddings_bge_detail.parquet"

# Columns written to the detail file, in output order.
detail_columns = [
    'article_id',
    'embedding',
    'price',
    'colour_group_name',
    "prod_name",
    "product_type_name",
    "department_name",
    "detail_desc",
]

# Relies on `df_articles` (with its `embedding` column) from the embedding cell.
output_df_detail = df_articles[detail_columns]
output_df_detail.to_parquet(OUTPUT_PATH_DETAIL)

print(f"Embeddings saved to: {OUTPUT_PATH_DETAIL}")

Embeddings saved to: /content/drive/MyDrive/ScaleStyle_Project/data/processed/article_embeddings_bge_detail.parquet


In [14]:
# Sanity check: read back the detail parquet written to OUTPUT_PATH_DETAIL
# and show its first rows.
_df_detail = pd.read_parquet(OUTPUT_PATH_DETAIL)
_df_detail.head()

Unnamed: 0,article_id,embedding,price,colour_group_name,prod_name,product_type_name,department_name,detail_desc
0,108775015,"[0.009101868, -0.031799316, 0.010383606, 0.007...",0.008133,Black,Strap top,Vest top,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,"[0.004814148, -0.022735596, 0.012435913, -0.00...",0.008108,White,Strap top,Vest top,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,"[-0.0036678314, -0.016036987, 0.009941101, -0....",0.004993,Off White,Strap top (1),Vest top,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,"[-0.021087646, -0.050109863, -0.0077171326, -0...",0.019989,Black,OP T-shirt (Idro),Bra,Clean Lingerie,"Microfibre T-shirt bra with underwired, moulde..."
4,110065002,"[-0.022323608, -0.0463562, -0.010498047, -0.01...",0.018037,White,OP T-shirt (Idro),Bra,Clean Lingerie,"Microfibre T-shirt bra with underwired, moulde..."
