## Landing Zone

In [68]:
# Importing useful dependencies
import ast
import io
import mimetypes
import os
import posixpath
import time
from io import BytesIO
from urllib.parse import urlsplit

import boto3
import requests
from botocore.client import Config
from datasets import load_dataset

In [2]:
# Setup S3 client for MinIO (MinIO implements Amazon S3 API).
# Connection settings are read from environment variables so that real
# credentials never need to be written into the notebook. When a variable is
# not set, the value that used to be hard-coded here is used instead, so a
# local MinIO started with its default settings keeps working unchanged.
s3 = boto3.client(
    "s3",
    endpoint_url=os.environ.get("MINIO_ENDPOINT", "http://127.0.0.1:9000"), # MinIO API endpoint
    aws_access_key_id=os.environ.get("MINIO_ROOT_USER", "minioadmin"), # User name
    aws_secret_access_key=os.environ.get("MINIO_ROOT_PASSWORD", "minioadmin"), # Password
)

In [27]:
# Ask the server which buckets exist and keep only their names
listing = s3.list_buckets()
buckets = [entry["Name"] for entry in listing["Buckets"]]

def createBucket(name, list_buckets):
    """
    Create the bucket `name` through the module-level `s3` client, unless
    `name` is already contained in `list_buckets`.

    A one-line message describing what happened is printed in both cases.
    Nothing is returned.
    """
    # Guard clause: nothing to create when the caller already knows the bucket
    if name in list_buckets:
        print(f"Bucket '{name}' already exists!")
        return

    s3.create_bucket(Bucket=name)
    print(f"Created bucket: {name}")

# Make sure the "landing-zone" bucket exists
createBucket(name="landing-zone", list_buckets=buckets)

Bucket 'landing-zone' already exists!


In [88]:
# Create two "sub-buckets" inside the landing-zone bucket.
# Strictly speaking, a bucket cannot be created inside another one, so each
# sub-bucket is created as an empty object whose key ends in "/" (a folder-like prefix).
s3.put_object(Bucket="landing-zone", Key="temporal-landing/") # Sub-bucket Temporal Landing
s3.put_object(Bucket="landing-zone", Key="persistent-landing/") # Sub-bucket Persistent Landing

{'ResponseMetadata': {'RequestId': '186836F9B95E57AD',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"d41d8cd98f00b204e9800998ecf8427e"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-checksum-crc32': 'AAAAAA==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '186836F9B95E57AD',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '1698',
   'x-ratelimit-remaining': '1698',
   'x-xss-protection': '1; mode=block',
   'date': 'Wed, 24 Sep 2025 12:15:00 GMT'},
  'RetryAttempts': 0},
 'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
 'ChecksumCRC32': 'AAAAAA==',
 'ChecksumType': 'FULL_OBJECT'}

### Temporal Landing

In [5]:
# Two datasets are used:
#   1. https://huggingface.co/datasets/FronkonGames/steam-games-dataset (123 MB)
#   2. https://huggingface.co/datasets/atalaydenknalbant/rawg-games-dataset (998 MB)
ds1_raw, ds2_raw = (
    load_dataset(repo_id)
    for repo_id in (
        "FronkonGames/steam-games-dataset",
        "atalaydenknalbant/rawg-games-dataset",
    )
)

# Report how many rows the "train" split of each dataset holds
print(f"The dataset 1 contains {ds1_raw['train'].num_rows} rows")
print(f"The dataset 2 contains {ds2_raw['train'].num_rows} rows")

The dataset 1 contains 83560 rows
The dataset 2 contains 889793 rows


  table = cls._concat_blocks(blocks, axis=0)


In [6]:
# For testing purposes only the first 100 rows of each dataset are kept
first_rows = slice(0, 100)
ds1 = ds1_raw['train'][first_rows]
ds2 = ds2_raw['train'][first_rows]

# Sanity check: count the entries of one column per subdataset
print(f"The subdataset 1 contains {len(ds1['About the game'])} rows")
print(f"The subdataset 2 contains {len(ds2['description'])} rows")

The subdataset 1 contains 100 rows
The subdataset 2 contains 100 rows


In [7]:
# We are interested in Text, Image and Video data
# We can find each of these data in the following columns
# ds1: "About the game" (Text), "Header image" (Image), "Screenshots" (Image), "Movies" (Video)
# ds2: "description" (Text), "background_image" (Image), "background_image_additional" (Image), "short_screenshots" (Image)
# By combining both datasets, we assume there will be duplicates of games

**Uploading Texts**

In [89]:
def upload_strings_separately(bucket_name, client, strings, path="temporal-landing/", prefix="text"):
    """
    Store every non-empty string of `strings` as its own UTF-8 text object.

    The object key is `{path}{prefix}_{n}.txt`, where `n` is the 1-based
    position of the string in `strings`. Falsy entries (empty string, None)
    are not uploaded, but they still occupy a position, so the numbering of
    the uploaded objects can have gaps.
    """
    # Pair each string with its position first, then drop the falsy ones
    numbered = ((n, text) for n, text in enumerate(strings, start=1) if text)

    for n, text in numbered:
        payload = io.BytesIO(text.encode("utf-8"))
        key = f"{path}{prefix}_{n}.txt" # temporal-landing/text_1.txt, temporal-landing/text_2.txt ...
        client.put_object(Bucket=bucket_name, Key=key, Body=payload, ContentType="text/plain")

# Text upload: the descriptions of both datasets, concatenated into one list
all_descriptions = ds1['About the game'] + ds2['description']
upload_strings_separately(
    "landing-zone",
    s3,
    strings=all_descriptions,
    path="temporal-landing/",
)

**Uploading Images**

In [54]:
# "Screenshots" (ds1) and "short_screenshots" (ds2) hold several URLs per row
# and have to be flattened into plain lists of URLs first.

# ds1: each non-empty entry is split on "," and every non-blank piece is kept, stripped
ds1ss = [
    piece.strip()
    for entry in ds1['Screenshots'] if entry
    for piece in entry.split(",") if piece.strip()
]

# ds2: each non-empty entry is split on "|"; every piece is parsed as a Python
# literal with ast.literal_eval and its 'image' value is collected
ds2ss = [
    ast.literal_eval(piece)['image']
    for entry in ds2['short_screenshots'] if entry
    for piece in entry.split('|')
]

In [90]:
def upload_media_from_links(bucket_name, client, links, path="temporal-landing/", prefix="image"):
    """
    Download every URL in `links` and stream it into `bucket_name`.

    Each object is stored under `{path}{prefix}_{i}.{ext}`, where `i` is the
    1-based position of the URL in `links` (falsy entries are skipped but
    still consume a position) and `ext` is the file extension of the URL path.

    Parameters:
        bucket_name: name of the target bucket.
        client: S3 client; only its `upload_fileobj` method is used.
        links: iterable of URL strings (None / empty entries are ignored).
        path: key prefix ("folder") the objects are written under.
        prefix: base name of the objects; it is also the fallback media type
            (`{prefix}/{ext}`) for extensions `mimetypes` does not know.

    Raises:
        Whatever `requests.get` / `raise_for_status` raise for a failed
        download; the remaining links are then not processed.
    """
    for i, url in enumerate(links, start=1):
        if not url:
            continue

        # Look at the URL *path* only: a dot inside the query string
        # (e.g. "...header.jpg?t=1.5") must not be mistaken for the extension.
        filename = posixpath.basename(urlsplit(url).path)
        ext = posixpath.splitext(filename)[1].lstrip(".")

        # Stream download to avoid loading full file in memory
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status() # check for HTTP errors

            # Prefer a registered MIME type ("jpg" -> "image/jpeg"); otherwise
            # keep the previous "{prefix}/{ext}" convention, and only when the
            # URL has no extension at all trust the server's Content-Type.
            header_type = r.headers.get("Content-Type", "").split(";")[0].strip()
            content_type = (
                mimetypes.guess_type(filename)[0]
                or (f"{prefix}/{ext}" if ext else header_type)
                or "application/octet-stream"
            )
            if not ext:
                ext = (mimetypes.guess_extension(content_type) or "").lstrip(".")
            object_name = f"{path}{prefix}_{i}.{ext}" if ext else f"{path}{prefix}_{i}"

            # r.raw yields the bytes exactly as sent on the wire; ask it to undo
            # gzip/deflate transfer encoding so the stored object is the real file.
            r.raw.decode_content = True

            # This streams the request directly to MinIO without creating a full BytesIO object
            client.upload_fileobj(
                Fileobj=r.raw,
                Bucket=bucket_name,
                Key=object_name,
                ExtraArgs={"ContentType": content_type}
            )

In [91]:
# Image upload: the cover images of both datasets, concatenated into one list.
# ds2['background_image_additional'], ds1ss and ds2ss are currently not included.
image_links = ds1['Header image'] + ds2['background_image']
upload_media_from_links(
    "landing-zone",
    s3,
    links=image_links,
    path="temporal-landing/",
)

**Uploading Videos**

In [92]:
# Video upload: only the first five entries, because of the MinIO storage size
video_links = ds1['Movies'][0:5]
upload_media_from_links(
    "landing-zone",
    s3,
    links=video_links,
    path="temporal-landing/",
    prefix="video",
)

### Persistent Landing

In [93]:
# Create three "sub-buckets" inside persistent-landing, one per format.
# Each one is an empty object whose key ends in "/" (a folder-like prefix).
s3.put_object(Bucket="landing-zone", Key="persistent-landing/texts/") # Sub-bucket Text
s3.put_object(Bucket="landing-zone", Key="persistent-landing/images/") # Sub-bucket Image
s3.put_object(Bucket="landing-zone", Key="persistent-landing/videos/") # Sub-bucket Video

{'ResponseMetadata': {'RequestId': '186837157552E308',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"d41d8cd98f00b204e9800998ecf8427e"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-checksum-crc32': 'AAAAAA==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '186837157552E308',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '1698',
   'x-ratelimit-remaining': '1698',
   'x-xss-protection': '1; mode=block',
   'date': 'Wed, 24 Sep 2025 12:16:59 GMT'},
  'RetryAttempts': 0},
 'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
 'ChecksumCRC32': 'AAAAAA==',
 'ChecksumType': 'FULL_OBJECT'}

In [94]:
def classify_object_by_head(client, bucket, key):
    """
    Return 'texts', 'images' or 'videos' by inspecting the object's ContentType.

    The ContentType is read with `client.head_object`. None is returned when
    it does not start with "text/", "image/" or "video/".
    """
    # ask S3 for ContentType
    head = client.head_object(Bucket=bucket, Key=key)
    ct = head.get("ContentType") or ""
    if ct.startswith("text/"):
        return "texts"
    elif ct.startswith("image/"):
        return "images"
    elif ct.startswith("video/"):
        return "videos"
    return None

def move_files(client, bucket,
               source_prefix="temporal-landing/",
               dest_prefix="persistent-landing/"):
    """
    Move (copy then delete) all objects under source_prefix to dest_prefix,
    routing text -> {dest_prefix}texts/, image -> {dest_prefix}images/ and
    video -> {dest_prefix}videos/.

    Every moved object is renamed to `{kind}_{timestamp}.{ext}`, where `kind`
    is "text", "image" or "video", `timestamp` is a millisecond timestamp and
    `ext` is the extension of the source key.

    Zero-byte objects (such as the folder markers) are ignored. Objects whose
    ContentType is none of text/image/video are left where they are and
    reported, instead of aborting the whole run.
    """
    paginator = client.get_paginator("list_objects_v2") # It returns objects in pages and not all at once.
    last_ts = 0 # timestamp given to the previously moved object

    for page in paginator.paginate(Bucket=bucket, Prefix=source_prefix):
        for obj in page.get("Contents", []):

            if obj['Size'] == 0:
                continue

            src_key = obj["Key"]

            # classify
            category = classify_object_by_head(client, bucket, src_key)
            if category is None:
                # Unknown kind of file: there is no destination folder for it
                print(f"Skipped (unsupported content type): {src_key}")
                continue

            # get file extension from the last path component only, so a key
            # without any "." does not turn its whole path into the extension
            filename = src_key.rsplit('/', 1)[-1]
            ext = filename.rsplit('.', 1)[-1].split('?')[0] if '.' in filename else ''

            # new filename = timestamp + original extension.
            # Two objects handled within the same millisecond would otherwise
            # get the same name, and the second copy would overwrite the first
            # one just before its source is deleted; force strictly increasing values.
            ts = max(int(time.time() * 1000), last_ts + 1)  # milliseconds
            last_ts = ts
            new_filename = f"{category[0:-1]}_{ts}.{ext}" if ext else f"{category[0:-1]}_{ts}"

            # build destination key
            dest_key = f"{dest_prefix}{category}/{new_filename}"

            # copy then delete (the delete is only reached when the copy did not raise)
            client.copy_object(Bucket=bucket, CopySource={"Bucket": bucket, "Key": src_key}, Key=dest_key)
            client.delete_object(Bucket=bucket, Key=src_key)

            print(f"Moved: {src_key} -> {dest_key}")

In [99]:
# Move everything from the temporal landing into the persistent landing
move_files(
    client=s3,
    bucket="landing-zone",
    source_prefix="temporal-landing/",
    dest_prefix="persistent-landing/",
)

Moved: temporal-landing/video_9.mp4 -> persistent-landing/videos/video_1758716388463.mp4
