In [1]:
import pandas as pd
import os
import zipfile
from pathlib import Path
from google.colab import files, drive
from sentence_transformers import SentenceTransformer
from pathlib import Path

In [2]:
# --- Configuration ---
# Every constant is defined exactly once; the previous copy-pasted duplicate of
# this block made it possible for the two copies to drift apart silently.
COMPETITION_NAME = "h-and-m-personalized-fashion-recommendations"

# This is where we want the data to be available for our Spark code
# (local Colab disk; the CSVs are extracted here from the Drive archive).
LOCAL_DATA_DIR = Path("/content/hm_data")

# This is where the persistent data lives in your Google Drive
# NOTE: Ensure this path matches where you saved it last time!
DRIVE_MOUNT_POINT = "/content/drive"
DRIVE_PROJECT_PATH = Path(DRIVE_MOUNT_POINT) / "MyDrive/ScaleStyle_Project/data"

# Competition archive stored inside the Drive project folder.
ZIP_FILE_PATH = DRIVE_PROJECT_PATH / f"{COMPETITION_NAME}.zip"

In [3]:
def setup_data_from_drive():
    """
    Mounts Google Drive and prepares the dataset.
    1. Mounts Drive.
    2. Checks if the ZIP file exists in Drive.
    3. Extracts CSVs to the local Colab environment for fast access.
    """

    # 1. Mount Google Drive
    if not os.path.exists(DRIVE_MOUNT_POINT):
        print("üîπ Mounting Google Drive...")
        drive.mount(DRIVE_MOUNT_POINT)
    else:
        print("‚úÖ Google Drive is already mounted.")

    # 2. Check for the dataset in Drive
    if not ZIP_FILE_PATH.exists():
        print(f"‚ùå Error: Dataset not found at {ZIP_FILE_PATH}")
        print("   Did you save it to a different folder last time?")
        print("   If this is a fresh start, you may need to run the download script once.")
        return

    print(f"‚úÖ Found cached dataset in Drive: {ZIP_FILE_PATH}")

    # 3. Extract to local environment (Faster IO than reading from Drive directly)
    # We only extract if the target directory is empty or missing
    if not LOCAL_DATA_DIR.exists():
        print("üîπ Extracting data to local Colab environment (this improves speed)...")
        LOCAL_DATA_DIR.mkdir(parents=True, exist_ok=True)

        # Unzip command is faster than Python zipfile
        zip_source = str(ZIP_FILE_PATH)
        target_dir = str(LOCAL_DATA_DIR)

        # Extract specific CSVs to save space/time
        !unzip -q -o "{zip_source}" "articles.csv" "customers.csv" "transactions_train.csv" -d "{target_dir}"

        print(f"‚úÖ Extraction complete! Data is ready at: {LOCAL_DATA_DIR}")
    else:
        print(f"‚úÖ Data already extracted at {LOCAL_DATA_DIR}. Skipping extraction.")

    print(f"üìÇ Available files: {os.listdir(LOCAL_DATA_DIR)}")


# --- Execution ---
# Run the Drive mount / dataset extraction step when this code is executed as
# the main module; all progress is reported through the function's own prints.
if __name__ == "__main__":
    setup_data_from_drive()

‚úÖ Google Drive is already mounted.
‚úÖ Found cached dataset in Drive: /content/drive/MyDrive/ScaleStyle_Project/data/h-and-m-personalized-fashion-recommendations.zip
‚úÖ Data already extracted at /content/hm_data. Skipping extraction.
üìÇ Available files: ['articles.csv', 'transactions_train.csv', 'customers.csv']


In [4]:
from google.colab import userdata

# 1. Securely retrieve Token (will not be displayed on screen)
try:
    token = userdata.get("GITHUB_TOKEN")
    print("‚úÖ Token retrieved successfully")
except Exception:
    print("‚ùå Token not found. Please check the Secrets panel settings on the left")

# 2. Configure repository information
username = "EthanGaoZhiyuan"
repo = "ScaleStyle"

# 3. Construct HTTPS URL with Token (Token used for authentication)
# Format: https://token@github.com/username/repo.git
clone_url = f"https://{token}@github.com/{username}/{repo}.git"

# 4. Execute clone command
# Use the -b flag to directly clone a specific branch
branch = "feat/phase2-embedding"
!git clone -b {branch} {clone_url}

# 5. Verification
if os.path.exists(repo):
    print(f"\nüéâ Code downloaded to: {repo}")
    !ls {repo}
else:
    print("\n‚ùå Clone failed. Please check Token permissions or repository existence")

‚úÖ Token retrieved successfully
fatal: destination path 'ScaleStyle' already exists and is not an empty directory.

üéâ Code downloaded to: ScaleStyle
data-pipeline	    docs	     inference-service	README.md
docker-compose.yml  gateway-service  infrastructure


In [5]:
import sys
import os

# 1. Change directory to the data-pipeline folder to access requirements.txt
# This ensures pip installs the exact versions specified in your repo.
repo_path = "/content/ScaleStyle/data-pipeline"
os.chdir(repo_path)

print(f"üìÇ Current Working Directory: {os.getcwd()}")

# 2. Install dependencies
# Using -q to keep the output clean.
print("‚¨áÔ∏è Installing dependencies from requirements.txt...")
!pip install -q -r requirements.txt

# 3. Add the project root to system path
# This allows Python to recognize 'src' as a module so we can do:
# "from src.feature_engineering import ..."
if repo_path not in sys.path:
    sys.path.append(repo_path)
    print(f"‚úÖ Added {repo_path} to system path.")

print("‚úÖ Environment setup complete.")

üìÇ Current Working Directory: /content/ScaleStyle/data-pipeline
‚¨áÔ∏è Installing dependencies from requirements.txt...
‚úÖ Added /content/ScaleStyle/data-pipeline to system path.
‚úÖ Environment setup complete.


GENERATE EMBEDDINGS WITH GTE-QWEN2-7B (SOTA) ON A100 GPU

MODEL SELECTION RATIONALE:
I selected "Alibaba-NLP/gte-Qwen2-7B-instruct" for the following reasons:
1. SOTA Performance: It currently holds a top-tier position on the MTEB (Massive Text Embedding Benchmark) leaderboard.
2. LLM-Based Architecture: Unlike traditional BERT models (approx. 100M params), this model is based on Qwen2-7B (7 Billion params). It possesses deep semantic understanding of complex fashion descriptions rather than just keyword matching.
3. Long Context Support: It supports up to 32k context window, allowing me to embed very long product details without truncation (unlike BERT's 512 limit).
4. Hardware Utilization: I'm utilizing an NVIDIA A100 (80GB), which allows me to run this heavy model efficiently in FP16 precision.

In [7]:
!pip install -U transformers



In [7]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# --- Configuration ---
# Google Drive locations: the article table that is read, and the parquet file
# the article_id/embedding pairs are written to.
INPUT_PATH = "/content/drive/MyDrive/ScaleStyle_Project/data/processed/articles_parquet"
OUTPUT_PATH = "/content/drive/MyDrive/ScaleStyle_Project/data/processed/article_embeddings_qwen2.parquet"

# Hugging Face identifier of the embedding model.
MODEL_NAME = "Alibaba-NLP/gte-Qwen2-7B-instruct"

# Inference settings: texts per forward pass, and the token limit handed to
# the tokenizer (longer inputs are truncated).
MAX_LENGTH = 8192
BATCH_SIZE = 32

def last_token_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """
    Pool each sequence down to the hidden state of its last attended token.

    Args:
        last_hidden_states: Hidden states indexed as [row, position, ...].
        attention_mask: Mask indexed as [row, position]; assumed to hold 1 for
            real tokens and 0 for padding (TODO confirm against the tokenizer).

    Returns:
        One hidden-state vector per row. When the final mask column sums to the
        number of rows (every row ends on a real token, e.g. left padding or no
        padding at all) this is simply the last position. Otherwise each row is
        indexed at (number of attended tokens - 1).
    """
    row_count = attention_mask.shape[0]
    ends_on_real_token = attention_mask[:, -1].sum() == row_count

    if not ends_on_real_token:
        # Right-padded batch: locate the last real token of every row.
        last_positions = attention_mask.sum(dim=1) - 1
        row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
        return last_hidden_states[row_index, last_positions]

    return last_hidden_states[:, -1]


if __name__ == "__main__":
    # End-to-end embedding job: load the model, read the article table, build
    # one text per article, embed in batches, and write article_id/embedding
    # pairs to OUTPUT_PATH.
    print(f"Initializing SOTA model: {MODEL_NAME} on GPU...")

    # 1. Load Model & Tokenizer
    # The tokenizer is loaded with trust_remote_code=True, while the model is
    # loaded with trust_remote_code=False, i.e. without executing modelling
    # code shipped in the model repository.
    # NOTE(review): the published usage example for this model appears to load
    # it with trust_remote_code=True. Confirm that loading without the remote
    # code yields the intended embeddings before relying on these vectors.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        MODEL_NAME,
        trust_remote_code=False,
        torch_dtype=torch.float16,  # Use FP16 to save VRAM and accelerate on A100
        device_map="cuda"
    )

    # 2. Load Data
    print(f"Reading article data from: {INPUT_PATH}")
    df_articles = pd.read_parquet(INPUT_PATH)

    # 3. Construct Input Text
    # Since I'm embedding documents (products), I use the raw text.
    # Instruction prompts (e.g., "Given a query...") are only needed during the retrieval phase.

    # Preprocessing: replace missing values so the string concatenation below
    # never yields NaN for a whole row.
    df_articles['prod_name'] = df_articles['prod_name'].fillna("Unknown Product")
    df_articles['detail_desc'] = df_articles['detail_desc'].fillna("")
    df_articles['colour_group_name'] = df_articles['colour_group_name'].fillna("")

    # Concatenate fields to form a rich semantic representation
    raw_texts = (
        "Product: " + df_articles['prod_name'] + ". " +
        "Description: " + df_articles['detail_desc'] + ". " +
        "Attributes: " + df_articles['colour_group_name']
    ).tolist()

    print(f"Total items to process: {len(raw_texts)}")

    # Fail early with a clear message; otherwise np.concatenate below raises an
    # opaque "need at least one array" error on an empty input table.
    if not raw_texts:
        raise ValueError(f"No articles found in {INPUT_PATH}; nothing to embed.")

    # 4. Batch Inference Loop
    all_embeddings = []

    print("Start Batch Inference with Qwen2...")
    for i in tqdm(range(0, len(raw_texts), BATCH_SIZE)):
        batch_texts = raw_texts[i : i + BATCH_SIZE]

        # Tokenize inputs and place them on the same device as the model
        batch_dict = tokenizer(
            batch_texts,
            max_length=MAX_LENGTH,
            padding=True,
            truncation=True,
            return_tensors='pt'
        ).to(model.device)

        with torch.no_grad():
            outputs = model(**batch_dict)

            # Extracting
            embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

            # Normalizing
            # Cast to float32 first: the norm is then computed in full
            # precision, so the stored vectors are unit-length in the dtype
            # they are saved in rather than only to FP16 accuracy.
            embeddings = F.normalize(embeddings.float(), p=2, dim=1)

            # Move to CPU to save GPU memory
            all_embeddings.append(embeddings.cpu().numpy())

    # 5. Save Results
    final_embeddings = np.concatenate(all_embeddings, axis=0)

    # Every article must have exactly one vector before they are paired by position.
    if final_embeddings.shape[0] != len(df_articles):
        raise RuntimeError(
            f"Embedding count {final_embeddings.shape[0]} does not match "
            f"article count {len(df_articles)}."
        )

    # Save as a list column in the DataFrame
    df_articles['embedding'] = list(final_embeddings)

    # Keep only ID and Embedding columns for Milvus ingestion
    output_df = df_articles[['article_id', 'embedding']]
    output_df.to_parquet(OUTPUT_PATH)

    print(f"Embeddings saved to: {OUTPUT_PATH}")
    print(f"CRITICAL NOTE: The vector dimension is {final_embeddings.shape[1]}.")
    print(f"(Please ensure your Milvus Collection is created with dim={final_embeddings.shape[1]})")

Initializing SOTA model: Alibaba-NLP/gte-Qwen2-7B-instruct on GPU...


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Reading article data from: /content/drive/MyDrive/ScaleStyle_Project/data/processed/articles_parquet
Total items to process: 105126
Start Batch Inference with Qwen2...


  0%|          | 0/3286 [00:00<?, ?it/s]

Embeddings saved to: /content/drive/MyDrive/ScaleStyle_Project/data/processed/article_embeddings_qwen2.parquet
CRITICAL NOTE: The vector dimension is 3584.
(Please ensure your Milvus Collection is created with dim=3584)
