## Temporal landing

In [1]:
# Uncomment on a fresh environment to install the Hugging Face `datasets` package
# %pip install datasets

In [2]:
# Importing useful dependencies
import ast
import io
import mimetypes
import os
import posixpath
from urllib.parse import urlsplit

import boto3
import requests
from datasets import load_dataset

In [3]:
# Setup S3 client for MinIO (MinIO implements Amazon S3 API).
# Connection settings are read from environment variables so that real credentials
# never have to be written into the notebook; when a variable is unset, the previous
# hard-coded local values are used as fallbacks.
s3 = boto3.client(
    "s3",
    endpoint_url=os.environ.get("MINIO_ENDPOINT", "http://127.0.0.1:9000"),  # MinIO API endpoint
    aws_access_key_id=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"),  # User name
    aws_secret_access_key=os.environ.get("MINIO_SECRET_KEY", "minioadmin"),  # Password
)

In [4]:
# Two Hugging Face datasets are used.
# Dataset 1 (123 MB): https://huggingface.co/datasets/FronkonGames/steam-games-dataset
ds1_raw = load_dataset("FronkonGames/steam-games-dataset")
# Dataset 2 (998 MB): https://huggingface.co/datasets/atalaydenknalbant/rawg-games-dataset
ds2_raw = load_dataset("atalaydenknalbant/rawg-games-dataset")

# Report how many rows the 'train' split of each dataset holds
ds1_total_rows = ds1_raw['train'].num_rows
ds2_total_rows = ds2_raw['train'].num_rows
print(f"The dataset 1 contains {ds1_total_rows} rows")
print(f"The dataset 2 contains {ds2_total_rows} rows")

The dataset 1 contains 83560 rows
The dataset 2 contains 889793 rows


In [5]:
# For testing purposes, only the first 100 rows of each 'train' split are kept
ds1 = ds1_raw['train'][:100]
ds2 = ds2_raw['train'][:100]

# Report the size of each subdataset, measured as the length of one of its columns
ds1_sample_rows = len(ds1['About the game'])
ds2_sample_rows = len(ds2['description'])
print(f"The subdataset 1 contains {ds1_sample_rows} rows")
print(f"The subdataset 2 contains {ds2_sample_rows} rows")

The subdataset 1 contains 100 rows
The subdataset 2 contains 100 rows


In [6]:
# We are interested in Text, Image and Video data.
# Each kind of data can be found in the following columns:
# ds1: "About the game" (Text), "Header image" (Image), "Screenshots" (Image), "Movies" (Video)
# ds2: "description" (Text), "background_image" (Image), "background_image_additional" (Image), "short_screenshots" (Image)
# By combining both datasets, we assume the same game may appear more than once (duplicates).

**Uploading Texts**

In [7]:
def upload_strings_separately(bucket_name, client, strings, path="temporal-landing/", prefix="text"):
    """Upload every non-empty string in ``strings`` as its own UTF-8 text object.

    Objects are named ``{path}{prefix}_{n}.txt`` (temporal-landing/text_1.txt,
    temporal-landing/text_2.txt, ...), where ``n`` is the 1-based position of the
    string in ``strings``. Empty strings and ``None`` are skipped, so their
    numbers are simply left unused.

    Args:
        bucket_name: Name of the destination bucket.
        client: S3 client exposing ``put_object``.
        strings: Iterable of texts to upload.
        path: Key prefix prepended to every object name.
        prefix: Base name of each object, placed before the running number.
    """
    position = 0
    for text in strings:
        position += 1
        if text:  # empty strings and None are not uploaded
            payload = io.BytesIO(text.encode("utf-8"))
            client.put_object(
                Bucket=bucket_name,
                Key=f"{path}{prefix}_{position}.txt",
                Body=payload,
                ContentType="text/plain",
            )

# Uploading text files (combining both datasets)
game_descriptions = ds1['About the game'] + ds2['description']
upload_strings_separately("landing-zone", s3, strings=game_descriptions, path="temporal-landing/")

**Uploading Images**

In [8]:
# The fields "Screenshots" from ds1 and "short_screenshots" from ds2 need to be cleaned first.

# ds1: every non-empty entry is split on "," and flattened into one clean list of URLs
ds1ss = [
    url.strip()
    for item in ds1['Screenshots'] if item
    for url in item.split(",") if url.strip()
]

# ds2: every non-empty entry is split on '|' into records; each record is parsed as a
# Python literal and its 'image' value (the URL) is collected
ds2ss = []
for elem in ds2['short_screenshots']:
    if not elem:
        continue
    for rec in elem.split('|'):
        rec = rec.strip()
        if not rec:
            # A leading, trailing or doubled '|' produces a blank fragment,
            # which ast.literal_eval would reject with a SyntaxError
            continue
        # Convert string to dict safely (literal_eval only accepts literals, never runs code)
        d = ast.literal_eval(rec)
        ds2ss.append(d['image'])

In [9]:
def _url_extension(url):
    """Return the file extension (without the dot) of the URL's path, or '' if it has none."""
    # Only the path component is inspected, so dots in the host name or in the
    # query string (e.g. "?v=1.2") can no longer be mistaken for the extension
    return posixpath.splitext(urlsplit(url).path)[1].lstrip(".")


def upload_media_from_links(bucket_name, client, links, path="temporal-landing/", prefix="image"):
    """Download every URL in ``links`` as a stream and upload it directly to the bucket.

    Objects are named ``{path}{prefix}_{n}.{ext}`` (image_1.jpg, image_2.png, ...),
    where ``n`` is the 1-based position of the URL in ``links`` and ``ext`` is the
    extension found in the URL's path. Falsy entries (``None``, ``""``) are skipped.
    A URL whose path has no extension is stored without a suffix.

    The stored Content-Type is taken, in order of preference, from the file
    extension (standard MIME table), from the server's ``Content-Type`` response
    header, and finally from ``{prefix}/{ext}``.

    Args:
        bucket_name: Name of the destination bucket.
        client: S3 client exposing ``upload_fileobj``.
        links: Iterable of media URLs.
        path: Key prefix prepended to every object name.
        prefix: Base name of each object, placed before the running number.

    Raises:
        requests.HTTPError: If a download answers with an HTTP error status
            (raised by ``raise_for_status``); remaining links are not processed.
    """
    for i, url in enumerate(links, start=1):
        if not url:
            continue

        ext = _url_extension(url)
        object_name = f"{path}{prefix}_{i}.{ext}" if ext else f"{path}{prefix}_{i}"

        # Stream download to avoid loading full file in memory
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()  # check for HTTP errors

            # r.raw is the undecoded socket stream: ask urllib3 to undo any gzip/deflate
            # Content-Encoding so the stored object holds the real media bytes
            r.raw.decode_content = True

            content_type = (
                mimetypes.guess_type(f"x.{ext}")[0]  # e.g. jpg -> image/jpeg
                or r.headers.get("Content-Type")
                or (f"{prefix}/{ext}" if ext else "application/octet-stream")
            )

            # This streams the request directly to MinIO without creating a full BytesIO object
            client.upload_fileobj(
                Fileobj=r.raw,
                Bucket=bucket_name,
                Key=object_name,
                ExtraArgs={"ContentType": content_type},
            )

In [10]:
# Uploading image files (combining both datasets).
# Only the "Header image" (ds1) and "background_image" (ds2) columns are uploaded for now;
# ds2['background_image_additional'], ds1ss and ds2ss can be appended to image_links
# to include the remaining images.
image_links = ds1['Header image'] + ds2['background_image']
upload_media_from_links("landing-zone", s3, links=image_links, path="temporal-landing/")

**Uploading Videos**

In [11]:
# Uploading video files: only the first 5 entries of ds1['Movies'] are sent, stored under
# the "video" prefix (video_1.<ext>, video_2.<ext>, ...).
# NOTE(review): 'Screenshots' entries are split on "," before use; confirm whether
# 'Movies' entries can also hold several comma-separated URLs and need the same treatment.
upload_media_from_links("landing-zone", s3, links = 
                         ds1['Movies'][0:5], # We can only upload a few videos due to MinIO storage size
                         path="temporal-landing/", prefix = "video")