In [7]:
import html
import os
import re

import numpy as np
import pandas as pd
import tensorflow as tf

import tensorflow_hub as hub
import tensorflow_text as text

from tensorflow.keras.applications import ResNet50
from sklearn.preprocessing import normalize

# Record the TensorFlow version alongside the results for reproducibility.
print("TF version:", tf.version.VERSION)


TF version: 2.10.1


In [10]:
# Input locations, relative to the notebook's working directory.
CSV_PATH = "games.csv"
IMAGE_DIR = "images"  # expected to hold one "<appid>.jpg" file per game

# Upper bound on the number of games kept once the image filter has run.
MAX_GAMES = 10000

# Size (in pixels) every image is resized to before it is encoded.
IMG_SIZE = (224, 224)
BATCH_SIZE = 32   # change to 16 if GPU memory is low


In [11]:
# Read the full games table; later cells narrow it to the columns they need.
df = pd.read_csv(filepath_or_buffer=CSV_PATH)
n_rows, n_cols = df.shape
print("Original shape:", (n_rows, n_cols))


Original shape: (71716, 39)


In [12]:
# Source column -> short name used by the rest of the notebook.
# The dict order is also the column order of the resulting frame.
column_aliases = {
    "AppID": "appid",
    "Name": "name",
    "About the game": "about",
    "Genres": "genres",
}

# Keep only those four columns, then rename them in one step.
df = df[list(column_aliases)].rename(columns=column_aliases)


In [13]:
# A game needs both a description and a genre list to be embedded;
# discard every row where either one is missing.
has_about_and_genres = df[["about", "genres"]].notna().all(axis=1)
df = df[has_about_and_genres]

def clean_text(text_):
    """Return ``text_`` as plain, single-spaced text.

    Markup tags are replaced by a space so that the words on either side of a
    tag such as ``<br>`` stay separate, HTML entities such as ``&amp;`` are
    decoded, and every run of whitespace is collapsed to a single space.

    Args:
        text_: The raw description string.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    # Substitute a space (not ""): "foo<br>bar" must become "foo bar", not "foobar".
    text_ = re.sub(r"<.*?>", " ", text_)
    # Decode entities only after the tags are gone, so an escaped "&lt;" is
    # never mistaken for the start of a tag.
    text_ = html.unescape(text_)
    # Also folds the extra spaces introduced above and any decoded "&nbsp;".
    text_ = re.sub(r"\s+", " ", text_)
    return text_.strip()

# Strip markup and normalise whitespace in every description.
df["about"] = df["about"].apply(clean_text)

# Build the string that is fed to the text encoder. The genres go FIRST:
# the BERT preprocessing layer used by this notebook is created with its
# default settings, which keep only the first 128 tokens of each input, so a
# genre list placed after a long description would be truncated away.
df["text"] = "Genres: " + df["genres"] + ". " + df["about"]

print("After text cleaning:", df.shape)


After text cleaning: (69198, 5)


In [14]:
def has_image(appid):
    """Report whether ``<IMAGE_DIR>/<appid>.jpg`` exists on disk."""
    image_path = "/".join((IMAGE_DIR, f"{appid}.jpg"))
    return os.path.exists(image_path)

# Keep only the games whose image file is present on disk.
image_present = df["appid"].apply(has_image)

# Cap the result at MAX_GAMES rows and renumber them 0..n-1, so positions in
# the frame line up with positions in the datasets built from it.
df = df.loc[image_present].head(MAX_GAMES).reset_index(drop=True)

print("After image filtering:", df.shape)


After image filtering: (9736, 5)


In [15]:
def load_image(appid):
    """Load one game's image and prepare it for ResNet50.

    Reads ``<IMAGE_DIR>/<appid>.jpg``, decodes it as a 3-channel JPEG, resizes
    it to ``IMG_SIZE`` and applies ResNet50's ``preprocess_input``.

    Args:
        appid: Scalar tensor holding the game's id (one element of the
            dataset built from ``df["appid"]``).

    Returns:
        The preprocessed image tensor.
    """
    filename = tf.strings.join([tf.strings.as_string(appid), ".jpg"])
    path = tf.strings.join([IMAGE_DIR, filename], separator="/")

    raw_bytes = tf.io.read_file(path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(pixels, IMG_SIZE)
    return tf.keras.applications.resnet50.preprocess_input(resized)

# Input pipeline: app ids -> decoded, preprocessed images -> batches.
image_ds = (
    tf.data.Dataset.from_tensor_slices(df["appid"].values)
    .map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)


In [16]:
# Preprocessing model (loaded from TF Hub) that converts raw strings into the
# inputs the BERT encoder expects.
bert_preprocess = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)

# Batches of raw strings, built from the same frame as the image dataset.
text_ds = (
    tf.data.Dataset.from_tensor_slices(df["text"].values)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)


In [17]:
# Frozen ImageNet ResNet50 backbone; global average pooling makes it return
# one feature vector per image.
image_base = ResNet50(
    weights="imagenet",
    include_top=False,
    pooling="avg"
)

image_base.trainable = False

# Take the input size from IMG_SIZE so the model always matches what
# load_image produces.
image_input = tf.keras.Input(shape=(*IMG_SIZE, 3))
x = image_base(image_input)

# Projection to 512 dimensions. The initializer is seeded so that every run of
# the notebook builds the same projection and therefore the same embeddings.
# NOTE(review): no training step for this layer appears in the notebook, so
# its weights stay at these initial values.
x = tf.keras.layers.Dense(
    512,
    activation="relu",
    kernel_initializer=tf.keras.initializers.GlorotUniform(seed=1),
)(x)

image_encoder = tf.keras.Model(image_input, x)

print("Image encoder output shape:", image_encoder.output_shape)


Image encoder output shape: (None, 512)


In [18]:
# Frozen small-BERT encoder loaded from TF Hub.
bert_encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/2",
    trainable=False
)

# Raw strings in -> preprocessing -> BERT -> pooled sentence vector.
text_input = tf.keras.Input(shape=(), dtype=tf.string)
text_pre = bert_preprocess(text_input)
text_out = bert_encoder(text_pre)["pooled_output"]

# Projection to 512 dimensions. The initializer is seeded so that every run of
# the notebook builds the same projection and therefore the same embeddings.
# NOTE(review): no training step for this layer appears in the notebook, so
# its weights stay at these initial values.
text_out = tf.keras.layers.Dense(
    512,
    activation="relu",
    kernel_initializer=tf.keras.initializers.GlorotUniform(seed=2),
)(text_out)

text_encoder = tf.keras.Model(text_input, text_out)

print("Text encoder output shape:", text_encoder.output_shape)


Text encoder output shape: (None, 512)


In [19]:
# Encode the images batch by batch in inference mode, then stack the
# per-batch results into one (n_games, dim) array.
image_batches = [
    image_encoder(image_batch, training=False).numpy()
    for image_batch in image_ds
]
image_embeddings = np.concatenate(image_batches, axis=0)

print("Image embeddings shape:", image_embeddings.shape)


Image embeddings shape: (9736, 512)


In [20]:
# Encode the texts batch by batch in inference mode, then stack the
# per-batch results into one (n_games, dim) array.
text_batches = [
    text_encoder(text_batch, training=False).numpy()
    for text_batch in text_ds
]
text_embeddings = np.concatenate(text_batches, axis=0)

print("Text embeddings shape:", text_embeddings.shape)


Text embeddings shape: (9736, 512)


In [21]:
# Put each game's text vector and image vector side by side.
fused_inputs = np.concatenate(
    [text_embeddings, image_embeddings],
    axis=1
)

# Two-layer projection down to 256 dimensions. Both initializers are seeded so
# that re-running the notebook reproduces the same final embeddings.
# NOTE(review): no training step for these layers appears in the notebook, so
# their weights stay at these initial values.
fusion_layer = tf.keras.Sequential([
    tf.keras.layers.Dense(
        512,
        activation="relu",
        kernel_initializer=tf.keras.initializers.GlorotUniform(seed=3),
    ),
    tf.keras.layers.Dense(
        256,
        kernel_initializer=tf.keras.initializers.GlorotUniform(seed=4),
    ),
])

fused_outputs = fusion_layer(fused_inputs).numpy()

# Row-wise normalisation (scikit-learn's default norm is L2).
multimodal_embeddings = normalize(fused_outputs)

print("Final embeddings shape:", multimodal_embeddings.shape)


Final embeddings shape: (9736, 256)


In [22]:
# Persist the three artefacts: the embedding matrix, the app ids taken from
# the same frame the embeddings were computed from, and the metadata table.
appids = df["appid"].to_numpy()

np.save(file="game_embeddings.npy", arr=multimodal_embeddings)
np.save(file="game_appids.npy", arr=appids)
df.to_csv(path_or_buf="game_metadata.csv", index=False)

print("Saved: game_embeddings.npy, game_appids.npy, game_metadata.csv")


Saved: game_embeddings.npy, game_appids.npy, game_metadata.csv
