In [17]:
import torch
import pandas as pd
from pathlib import Path
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import os

In [18]:
# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the pretrained sentence-transformers checkpoint onto the chosen device.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

Using device: cuda


In [25]:
# Read the cleaned games metadata table and display how many rows it has.
csv_path = Path("../data/steam_games/meta_data/games_march2025_cleaned.csv")
df = pd.read_csv(csv_path)
df.shape[0]

89618

In [26]:
# Get descriptions
def get_description(row):
    """
    Select the best available game description from prioritized fields.

    Checks 'detailed_description', 'about_the_game', and 'short_description'
    in that order and returns the first value that is neither null nor
    blank once surrounding whitespace is ignored.

    Args:
        row: A row from a pandas DataFrame (a Series), or any mapping that
            provides ``.get``, representing a game. A column that is absent
            is treated the same as a null value.

    Returns:
        str or None: The first usable description, always as a ``str``
        (non-string values are converted; string values are returned
        unchanged and unstripped), or None if every field is missing,
        null, or blank.
    """
    for col in ("detailed_description", "about_the_game", "short_description"):
        # .get() yields None for an absent column instead of raising KeyError,
        # so a table lacking one of the fields still falls through to the next.
        value = row.get(col)
        if pd.isna(value):
            continue
        # Convert before testing so the caller always receives text, matching
        # the documented return type even when the cell holds a number.
        text = str(value)
        if text.strip():
            return text
    return None

In [27]:
# Attach the chosen description to every game, then discard games without one.
df["description"] = df.apply(get_description, axis=1)
df = df.dropna(subset=["description"]).reset_index(drop=True)
df.shape[0]

89500

In [28]:
# Folder for the generated feature files; create it (and any missing parents)
# without failing when it already exists.
output_dir = Path("description_feats")
os.makedirs(output_dir, exist_ok=True)

In [30]:
# Development cap: keep only the first MAX_GAMES rows so the cells below run
# quickly. Set MAX_GAMES to None to process every remaining game.
MAX_GAMES = 10
if MAX_GAMES is not None:
    df = df.head(MAX_GAMES)

In [31]:
# Embed each description individually and write it to
# <output_dir>/<appid>_description.pt.
for appid, desc in tqdm(
    zip(df["appid"], df["description"]),
    total=len(df),
    desc="Encoding descriptions",
):
    embedding = model.encode(desc, convert_to_tensor=True)
    # Copy the tensor to host memory before serializing it.
    target_path = output_dir / f"{appid}_description.pt"
    torch.save(embedding.cpu(), target_path)

Encoding descriptions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  3.90it/s]
