In [2]:
# === steam_description_data.csv normalizálása ===
import os, csv, re
import pandas as pd
from bs4 import BeautifulSoup

# Folder layout: raw Steam CSVs live in "A", generated files go to "merge".
BASE_PATH   = r"C:\Users\zalma"
A_PATH      = os.path.join(BASE_PATH, "A")
OUTPUT_PATH = os.path.join(BASE_PATH, "merge")

# raw_path   : original description export (records may span several lines)
# fixed_path : intermediate file with one record per physical line; it is
#              referenced further down in this cell but was never defined,
#              which raised NameError on a fresh kernel
# clean_path : final, HTML-free, de-duplicated output
raw_path    = os.path.join(A_PATH, "steam_description_data.csv")
fixed_path  = os.path.join(OUTPUT_PATH, "steam_description_data_fixed.csv")
clean_path  = os.path.join(OUTPUT_PATH, "steam_description_data_cleaned.csv")

os.makedirs(OUTPUT_PATH, exist_ok=True)


def reassemble_multiline_csv(in_path:str, out_path:str):
    """
    Rebuild CSV records that were split over several physical lines by
    line breaks inside quoted fields.

    Lines are collected until the running number of double quotes (minus
    backslash-escaped ones) is even; the collected pieces are then written
    as a single line, with the former line breaks replaced by one space.

    Args:
        in_path: CSV file to read (UTF-8, undecodable bytes are ignored).
        out_path: file that receives the re-assembled records.
    """
    pending = []
    open_quotes = 0
    with open(in_path, "r", encoding="utf-8", errors="ignore") as fin, \
         open(out_path, "w", encoding="utf-8", newline="") as fout:

        for raw_line in fin:
            pending.append(raw_line.rstrip("\r\n"))
            open_quotes += raw_line.count('"') - raw_line.count(r'\"')

            if open_quotes % 2:
                # Odd count: we are still inside a quoted field.
                continue

            fout.write(" ".join(pending) + "\n")
            pending = []
            open_quotes = 0

        # The input ended inside an unterminated quote: flush the remainder.
        if pending:
            fout.write(" ".join(pending) + "\n")

def strip_html_cols(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """
    Remove HTML markup from the given text columns of ``df``.

    The frame is modified in place and also returned. Columns listed in
    ``cols`` that do not exist in ``df`` are skipped.

    Args:
        df: frame holding the text columns.
        cols: names of the columns to clean.

    Returns:
        The same ``df`` object, with the cleaned columns.
    """
    def clean_text(x):
        # Missing values become "". The column must NOT be cast with
        # astype(str) first: that turns NaN into the literal text "nan",
        # which then slips past this check.
        if x is None or x is pd.NA or (isinstance(x, float) and x != x):
            return ""
        soup = BeautifulSoup(str(x), "html.parser")
        text = soup.get_text(" ", strip=True)
        # Collapse runs of whitespace left behind by removed tags.
        return re.sub(r"\s+", " ", text).strip()

    for c in cols:
        if c in df.columns:
            df[c] = df[c].apply(clean_text)
    return df

# Step 1: glue together records broken up by line breaks inside quotes.
reassemble_multiline_csv(raw_path, fixed_path)

# Step 2: parse the repaired file.
df = pd.read_csv(fixed_path, engine="python")

# Step 3: use "appid" as the key column name when only "steam_appid" exists.
needs_key_rename = "steam_appid" in df.columns and "appid" not in df.columns
if needs_key_rename:
    df = df.rename(columns={"steam_appid": "appid"})

# Step 4: strip HTML from the three description columns.
desc_cols = ["detailed_description", "about_the_game", "short_description"]
df = strip_html_cols(df, desc_cols)

# Step 5: normalise the key to a trimmed string and keep one row per app.
if "appid" in df.columns:
    df["appid"] = df["appid"].astype(str).str.strip()
    df = df.drop_duplicates(subset="appid").reset_index(drop=True)

# Step 6: write the result; every field is quoted and ';' is the separator.
df.to_csv(clean_path, index=False, encoding="utf-8", sep=";", quoting=csv.QUOTE_ALL)

print(f"Normalizált leírás mentve ide: {clean_path}")
print(f"Rekordok száma: {len(df)} | Oszlopok száma: {len(df.columns)}")

Normalizált leírás mentve ide: C:\Users\zalma\merge\steam_description_data_cleaned.csv
Rekordok száma: 27334 | Oszlopok száma: 4


In [42]:
import os
import csv
import ast
import pandas as pd
import numpy as np
import logging
import json
import re
from bs4 import BeautifulSoup
from typing import Any
import matplotlib.pyplot as plt
import warnings
import sys

# ======== PATHS ========
# A, B and C are the three input sources; merged output goes to "merge".
BASE_PATH = r"C:\Users\zalma"
A_PATH = os.path.join(BASE_PATH, "A")
B_PATH = os.path.join(BASE_PATH, "B")
C_PATH = os.path.join(BASE_PATH, "C")
OUTPUT_PATH = os.path.join(BASE_PATH, "merge")

# ======== LOGGING CONFIGURATION ========
# The output folder must exist before the FileHandler opens the log file.
os.makedirs(OUTPUT_PATH, exist_ok=True)
LOG_FILE = os.path.join(OUTPUT_PATH, "merge_log.txt")

# force=True replaces any handlers already attached to the root logger.
# Without it basicConfig() silently does nothing when logging was configured
# earlier in the same kernel (cell re-run, or another cell ran first), and
# messages would keep going to the previous log file.
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(levelname)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.FileHandler(LOG_FILE, encoding="utf-8"), logging.StreamHandler()],
    force=True,
)

# ======== HELPER FUNCTIONS ========
def load_csv_safely(path: str, **kwargs: Any) -> pd.DataFrame:
    """
    Read a CSV file, falling back to an empty DataFrame on any error.

    Args:
        path: location of the CSV file.
        **kwargs: passed through unchanged to ``pd.read_csv``.

    Returns:
        The parsed frame, or an empty ``pd.DataFrame`` if loading failed
        (the failure is logged, not raised).
    """
    try:
        frame = pd.read_csv(path, **kwargs)
        file_name = os.path.basename(path)
        logging.info(f"Loaded: {file_name} ({len(frame)} rows)")
    except Exception as e:
        logging.error(f"Error loading {path}: {e}")
        frame = pd.DataFrame()
    return frame


def clean_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardise the column names of ``df`` in place.

    Names are trimmed, lower-cased, and spaces and hyphens are replaced by
    underscores. The same frame object is returned.
    """
    names = df.columns.str.strip()
    names = names.str.lower()
    for separator in (" ", "-"):
        names = names.str.replace(separator, "_")
    df.columns = names
    return df


def fill_missing_from_source(D: pd.DataFrame, src: pd.DataFrame) -> pd.DataFrame:
    """
    Fill the missing values of ``D`` with values from ``src``.

    The two frames are left-joined on ``appid`` (``src``'s key is compared as
    a string). For every column present in both frames, values already set in
    ``D`` are kept and only the gaps are taken from ``src``. ``src`` itself
    is not modified.

    Returns:
        A new frame with the columns of ``D``.
    """
    donor = src.copy()
    donor["appid"] = donor["appid"].astype(str)

    shared = [name for name in donor.columns if name in D.columns]
    value_cols = [name for name in shared if name != "appid"]

    merged = D.merge(donor[shared], on="appid", how="left", suffixes=("", "_src"))

    for name in value_cols:
        shadow = f"{name}_src"
        # Left-hand (existing) value wins; the donor only fills the gaps.
        merged[name] = merged[name].combine_first(merged[shadow])
        merged.drop(columns=[shadow], inplace=True)

    return merged

def clean_html_entities(text: str) -> str:
    """
    Strip HTML tags and decode entities (e.g. &reg; → ®).

    Missing values (``None``/NaN) yield an empty string. Whitespace in the
    result is collapsed to single spaces and trimmed.
    """
    if pd.isna(text):
        return ""
    plain = BeautifulSoup(str(text), "html.parser").get_text(" ", strip=True)
    return re.sub(r"\s+", " ", plain).strip()



# ======== SOURCE LOADING FUNCTIONS ========
def load_source_a(a_path: str) -> pd.DataFrame:
    """
    Load source A (the Steam CSV files) and merge them into one DataFrame.

    Every file is loaded with ``load_csv_safely``, its column names are
    standardised with ``clean_columns`` and its id column is renamed to
    ``appid``. ``steam.csv`` is the base table; the other files are
    left-joined onto it on ``appid``.

    A file that could not be loaded (empty frame) or has no id column is
    skipped with a warning instead of aborting the whole merge with a
    KeyError. If ``steam.csv`` itself is unusable, an empty DataFrame is
    returned.

    Args:
        a_path: folder that contains the source A CSV files.

    Returns:
        The merged frame, one row per row of ``steam.csv``.
    """
    # NOTE(review): the description-normalising cell writes
    # steam_description_data_cleaned.csv to the "merge" output folder with
    # sep=";", while it is read here from a_path with the default separator.
    # Confirm which copy of the file is meant to be used.
    file_names = [
        "steam.csv",
        "steam_description_data_cleaned.csv",
        "steam_media_data.csv",
        "steam_support_info.csv",
        "steamspy_tag_data.csv",
        "steam_requirements_data.csv",
    ]

    frames = []
    for file_name in file_names:
        frame = load_csv_safely(os.path.join(a_path, file_name))
        if not frame.empty:
            frame = clean_columns(frame)
            # Only rename when there is no "appid" column yet; renaming a
            # second id column would create a duplicate "appid" column.
            if "appid" not in frame.columns:
                possible_ids = [c for c in frame.columns if "appid" in c.lower()]
                if possible_ids:
                    frame = frame.rename(columns={possible_ids[0]: "appid"})
        frames.append((file_name, frame))

    (base_name, merged), *others = frames
    if merged.empty or "appid" not in merged.columns:
        logging.error(f"A source base table unusable: {base_name}")
        return pd.DataFrame()

    for file_name, frame in others:
        if frame.empty or "appid" not in frame.columns:
            logging.warning(f"Skipping {file_name}: not loaded or no appid column")
            continue
        merged = merged.merge(frame, on="appid", how="left")

    logging.info(f"A source merged: {len(merged)} rows")
    return merged


def load_source_b(base_path: str) -> pd.DataFrame:
    """
    Load source B from ``games.json`` into a DataFrame.

    The JSON file maps an app id to a game object. One row is produced per
    game: the scalar fields are copied as they are (missing keys become
    ``None``), list-valued fields default to ``[]`` and ``tags`` is kept only
    when it is a dict (otherwise ``{}``).

    Post-processing:
      - ``release_date`` is parsed and re-formatted as ``YYYY-MM-DD``;
        unparseable dates end up as NaN.
      - the numeric columns are converted with ``pd.to_numeric`` (invalid
        values become NaN).
      - ``windows``/``mac``/``linux`` are cast to ``bool``.

    Args:
        base_path: folder that contains ``games.json``.

    Returns:
        The games frame, or an empty DataFrame when the file does not exist.
    """
    file_path = os.path.join(base_path, "games.json")

    if not os.path.exists(file_path):
        logging.error(f"File not found: {file_path}")
        return pd.DataFrame()

    with open(file_path, "r", encoding="utf-8") as f:
        dataset = json.load(f)

    # Built once instead of once per game.
    scalar_fields = [
        "name",
        "release_date",
        "estimated_owners",
        "price",
        "required_age",
        "dlc_count",
        "detailed_description",
        "short_description",
        "about_the_game",
        "reviews",
        "header_image",
        "website",
        "support_url",
        "support_email",
        "windows",
        "mac",
        "linux",
        "metacritic_score",
        "metacritic_url",
        "user_score",
        "positive",
        "negative",
        "score_rank",
        "achievements",
        "recommendations",
        "notes",
        "average_playtime_forever",
        "average_playtime_2weeks",
        "median_playtime_forever",
        "median_playtime_2weeks",
        "peak_ccu",
    ]
    list_fields = [
        "packages",
        "developers",
        "publishers",
        "categories",
        "genres",
        "supported_languages",
        "full_audio_languages",
        "screenshots",
        "movies",
    ]

    records = []
    for appID, game in dataset.items():
        record = {key: game.get(key) for key in scalar_fields}
        record["appid"] = str(appID)
        for key in list_fields:
            record[key] = game.get(key, [])
        tags = game.get("tags", {})
        record["tags"] = tags if isinstance(tags, dict) else {}
        records.append(record)

    df_b = pd.DataFrame(records)

    if "release_date" in df_b.columns:
        parsed_dates = pd.to_datetime(df_b["release_date"], errors="coerce")
        # strftime already yields NaN for NaT, so no "NaT" string clean-up
        # is needed afterwards.
        df_b["release_date"] = parsed_dates.dt.strftime("%Y-%m-%d")

    # The previous explode()/json_normalize() of "packages" was removed: its
    # results were never used and it was costly on the full dataset.

    numeric_cols = [
        "metacritic_score",
        "user_score",
        "positive",
        "negative",
        "achievements",
        "recommendations",
        "price",
        "required_age",
        "dlc_count",
        "average_playtime_forever",
        "average_playtime_2weeks",
        "median_playtime_forever",
        "median_playtime_2weeks",
        "peak_ccu",
    ]
    for col in numeric_cols:
        if col in df_b.columns:
            df_b[col] = pd.to_numeric(df_b[col], errors="coerce")

    bool_cols = ["windows", "mac", "linux"]
    for col in bool_cols:
        if col in df_b.columns:
            df_b[col] = df_b[col].astype(bool)

    logging.info(f"B source loaded from JSON: {len(df_b)} rows")
    return df_b


def load_source_c(c_path: str) -> pd.DataFrame:
    """
    Load source C from several CSV files and stack them into one DataFrame.

    Each file is loaded with ``load_csv_safely`` and its column names are
    standardised with ``clean_columns``; files that could not be loaded are
    left out. The remaining frames are concatenated row-wise and ``appid`` is
    cast to string.

    Args:
        c_path: folder that contains the source C CSV files.

    Returns:
        The combined frame, or an empty DataFrame when no file could be
        loaded (``pd.concat`` raises on an empty list).
    """
    c_files = [
        "games_march2025_cleaned.csv",
        "games_march2025_full.csv",
        "games_may2024_cleaned.csv",
        "games_may2024_full.csv",
    ]

    c_dfs = []
    for file_name in c_files:
        frame = load_csv_safely(os.path.join(c_path, file_name))
        if not frame.empty:
            c_dfs.append(clean_columns(frame))

    if not c_dfs:
        logging.error(f"No C source files could be loaded from: {c_path}")
        return pd.DataFrame()

    df_c = pd.concat(c_dfs, ignore_index=True)
    df_c["appid"] = df_c["appid"].astype(str)
    logging.info(f"C source combined: {len(df_c)} rows")
    return df_c


# ======== MERGE FUNCTION ========
def merge_sources(a: pd.DataFrame, b: pd.DataFrame, c: pd.DataFrame, columns_to_merge: list[str] = None) -> pd.DataFrame:
    """
    Merge sources A, B and C on ``appid``.

    The result has one row per distinct ``appid`` found in any source and the
    union of the sources' columns. Values are filled with C → B → A priority:
    C is applied first, then B fills what is still missing, then A.

    Side effect: every non-empty input frame is normalised IN PLACE
    (``appid`` becomes a trimmed string and duplicate ``appid`` rows are
    dropped).

    Args:
        a, b, c: the source frames; an empty frame is ignored.
        columns_to_merge: if given, only these columns (plus ``appid``) are
            merged.

    Returns:
        The merged frame. When all sources are empty, an empty frame with a
        single ``appid`` column.
    """
    logging.info("Merging sources with C→B→A priority...")

    for df in [a, b, c]:
        if not df.empty:
            df["appid"] = df["appid"].astype(str).str.strip()
            df.drop_duplicates(subset="appid", inplace=True)

    if columns_to_merge:
        keep_cols = ["appid"] + [col for col in columns_to_merge if col in a.columns or col in b.columns or col in c.columns]
        a = a[[col for col in keep_cols if col in a.columns]]
        b = b[[col for col in keep_cols if col in b.columns]]
        c = c[[col for col in keep_cols if col in c.columns]]
        logging.info(f"Using subset of columns for merge: {keep_cols}")

    # Empty sources have no "appid" column, so they must be left out of the
    # key union as well (indexing them with ["appid"] raises KeyError).
    available = [df for df in [a, b, c] if not df.empty]
    if not available:
        logging.info("Merge complete (0 rows, 1 columns)")
        return pd.DataFrame(columns=["appid"])

    # Union of the column names, first occurrence wins (A, then B, then C).
    columns = list(dict.fromkeys(col for df in available for col in df.columns))

    all_appids = pd.concat([df[["appid"]] for df in available], ignore_index=True).drop_duplicates()

    D = pd.DataFrame(columns=columns)
    D["appid"] = all_appids["appid"]

    # Fill order implements the priority: the first source to provide a
    # value keeps it.
    for src in [c, b, a]:
        if not src.empty:
            D = fill_missing_from_source(D, src)

    logging.info(f"Merge complete ({len(D)} rows, {len(columns)} columns)")
    return D



def finalize_sources(D, a, b, c):
    """
    Add a ``sources`` column to ``D`` (in place) and return ``D``.

    For each row the column lists, comma separated and in the order C, B, A,
    the source frames whose ``appid`` column contains the row's ``appid``.
    An empty source frame never matches.
    """
    labelled_ids = [
        (label, set(frame["appid"]) if not frame.empty else set())
        for label, frame in (("C", c), ("B", b), ("A", a))
    ]

    D["sources"] = [
        ",".join(label for label, ids in labelled_ids if appid in ids)
        for appid in D["appid"]
    ]
    return D

# ======== Segédfüggvények a normalizáláshoz ========
def normalize_screenshots_column(df: pd.DataFrame, source_name: str):
    """
    Extract the thumbnail URLs from the ``screenshots`` column.

    A cell may hold a list or the string representation of one. Only dict
    items are used: their ``path_thumbnail`` (or, failing that, ``thumb``)
    value is taken. URLs are trimmed, must start with "http" and are
    de-duplicated in order. Missing or unparseable cells are skipped.

    Args:
        df: frame with ``appid`` and (optionally) ``screenshots`` columns.
        source_name: label used in the log message only.

    Returns:
        dict mapping ``str(appid)`` to the list of thumbnail URLs; empty if
        the frame has no ``screenshots`` column.
    """
    thumb_dict = {}

    if "screenshots" not in df.columns:
        return thumb_dict

    for appid, raw in df[["appid", "screenshots"]].itertuples(index=False):
        is_missing = raw is None or (isinstance(raw, float) and np.isnan(raw))
        if is_missing:
            continue

        if isinstance(raw, str):
            try:
                parsed = ast.literal_eval(raw)
            except Exception:
                continue
        else:
            parsed = raw

        candidates = []
        if isinstance(parsed, list):
            for entry in parsed:
                # Plain string items are deliberately ignored.
                if not isinstance(entry, dict):
                    continue
                url = entry.get("path_thumbnail") or entry.get("thumb")
                if url:
                    candidates.append(url.strip())

        http_only = (u for u in candidates if isinstance(u, str) and u.startswith("http"))
        thumb_dict[str(appid)] = list(dict.fromkeys(http_only))

    logging.info(f"Normalized thumbnail screenshots for source {source_name} ({len(thumb_dict)} items)")
    return thumb_dict

def process_screenshots(a, b, c):
    """
    Normalise the ``screenshots`` column of the three sources.

    Returns:
        A 3-tuple of thumbnail dicts, in the order A, B, C.
    """
    labelled = ((a, "A"), (b, "B"), (c, "C"))
    return tuple(normalize_screenshots_column(frame, label) for frame, label in labelled)


def normalize_movies_column(df: pd.DataFrame, source_name: str):
    '''
    Extract URLs from the ``movies`` column.

    A cell may hold a list or the string representation of one. From every
    dict item the following are collected (trimmed):
      - ``thumbnail``          -> first returned dict
      - ``webm["480"]``        -> second returned dict
      - ``webm["max"]``        -> third returned dict
    Missing or unparseable cells are skipped.

    Args:
        df: frame with ``appid`` and (optionally) ``movies`` columns.
        source_name: label used in the log message only.

    Returns:
        ``(thumb_dict, m480_dict, mmax_dict)``, each mapping ``str(appid)``
        to a list of URLs; three empty dicts if there is no ``movies``
        column.
    '''
    thumb_dict = {}
    m480_dict = {}
    mmax_dict = {}

    if "movies" not in df.columns:
        return thumb_dict, m480_dict, mmax_dict

    for appid, val in df[["appid", "movies"]].itertuples(index=False):
        if val is None:
            continue
        if isinstance(val, float) and np.isnan(val):
            continue

        try:
            data = ast.literal_eval(val) if isinstance(val, str) else val
        except Exception:
            # Best effort: a cell that is not a Python literal is skipped.
            continue

        thumbs = []
        webm_480 = []
        webm_max = []

        if isinstance(data, list):
            for item in data:
                if not isinstance(item, dict):
                    continue
                t = item.get("thumbnail")
                if t:
                    thumbs.append(t.strip())
                # "webm" may be absent or null; only a dict carries URLs.
                webm = item.get("webm")
                if not isinstance(webm, dict):
                    continue
                w480 = webm.get("480")
                if w480:
                    webm_480.append(w480.strip())
                wmax = webm.get("max")
                if wmax:
                    webm_max.append(wmax.strip())

        key = str(appid)
        thumb_dict[key] = thumbs
        m480_dict[key] = webm_480
        # Previously this stored mmax_dict.get(key) + mmax_dict.get(key),
        # i.e. always an empty list, and the collected URLs were lost.
        mmax_dict[key] = webm_max

    logging.info(f"Normalized movies for source {source_name} ({len(thumb_dict)} items)")
    return thumb_dict, m480_dict, mmax_dict

def dedup_join(urls):
    '''
    Join a list or tuple of URLs into one ", "-separated string.

    Duplicates are dropped, keeping the first occurrence. Anything that is
    empty or not a list/tuple gives an empty string.
    '''
    usable = bool(urls) and isinstance(urls, (list, tuple))
    if not usable:
        return ""
    unique_in_order = dict.fromkeys(urls)
    return ", ".join(unique_in_order)

def merge_and_finalize(a: pd.DataFrame, b: pd.DataFrame, c: pd.DataFrame, columns_to_merge: list[str] = None) -> pd.DataFrame:
    '''
    Merge the three source frames (A, B, C) and finalise the master table.

    Steps:
      - merge the sources on ``appid`` (``merge_sources``);
      - keep each source's own ``categories`` as ``categories_a/b/c`` and
        combine them into one ``categories`` column;
      - rename ``screenshots`` -> ``screenshots_full`` and ``movies`` ->
        ``movies_max``; add ``screenshots_thumb``, ``movies_thumbnail`` and
        ``movies_480`` as de-duplicated, ", "-joined URL strings;
      - add the ``sources`` column;
      - combine developer/publisher columns;
      - attach the per-app tag list;
      - strip HTML from the description columns;
      - fold ``owners`` into ``estimated_owners``.

    Args:
        a, b, c: the source frames (normalised in place by ``merge_sources``).
        columns_to_merge: optional column subset forwarded to
            ``merge_sources``.

    Returns:
        The finalised master frame.
    '''
    D = merge_sources(a, b, c, columns_to_merge=columns_to_merge)

    # Per-source categories, so merge_categories() can combine all three
    # instead of only the highest-priority one.
    if 'categories' in a.columns:
        D['categories_a'] = D['appid'].map(a.set_index('appid')['categories'])
    if 'categories' in b.columns:
        D['categories_b'] = D['appid'].map(b.set_index('appid')['categories'])
    if 'categories' in c.columns:
        D['categories_c'] = D['appid'].map(c.set_index('appid')['categories'])

    if "screenshots" in D.columns:
        D.rename(columns={"screenshots": "screenshots_full"}, inplace=True)
    a_thumb, b_thumb, c_thumb = process_screenshots(a, b, c)
    D["screenshots_thumb"] = D["appid"].map(
        lambda x: c_thumb.get(x, []) + b_thumb.get(x, []) + a_thumb.get(x, [])
    )

    if "movies" in D.columns:
        D.rename(columns={"movies": "movies_max"}, inplace=True)

    # The third value (max-resolution URLs) is not used here: "movies_max"
    # holds the raw merged "movies" column.
    a_thumb_m, a_480, _ = normalize_movies_column(a, "A")
    b_thumb_m, b_480, _ = normalize_movies_column(b, "B")
    c_thumb_m, c_480, _ = normalize_movies_column(c, "C")

    D["movies_thumbnail"] = D["appid"].map(lambda x: c_thumb_m.get(x, []) + b_thumb_m.get(x, []) + a_thumb_m.get(x, []))
    D["movies_480"] = D["appid"].map(lambda x: c_480.get(x, []) + b_480.get(x, []) + a_480.get(x, []))

    for col in ["screenshots_thumb", "movies_thumbnail", "movies_480"]:
        D[col] = D[col].apply(dedup_join)

    D = finalize_sources(D, a, b, c)

    D = merge_developers_publishers(D)
    D = merge_categories(D)

    tags_df = merge_tags_column(D, a, b, c)

    if tags_df.empty:
        # No tags at all: groupby("appid") would fail on a frame without
        # columns, so merge an empty, correctly shaped table instead.
        tags_collapsed = pd.DataFrame({
            "appid": pd.Series(dtype=object),
            "tags": pd.Series(dtype=object),
        })
    else:
        tags_collapsed = (
            tags_df.groupby("appid")[["tag_name", "weight"]]
            .apply(lambda x: [{"tag_name": t, "weight": w} for t, w in zip(x["tag_name"], x["weight"])])
            .reset_index(name="tags")
        )

    # NOTE(review): if D already has a "tags" column from the source merge,
    # pandas suffixes the two as "tags_x"/"tags_y" here — confirm which one
    # downstream code expects.
    D = D.merge(tags_collapsed, on="appid", how="left")

    # --- strip HTML / entities from the description fields ---
    # No astype(str) here: it would turn NaN into the literal text "nan"
    # before clean_html_entities() can map missing values to "".
    for col in ["detailed_description", "about_the_game", "short_description"]:
        if col in D.columns:
            D[col] = D[col].apply(clean_html_entities)

    if "owners" in D.columns and "estimated_owners" in D.columns:
        D["estimated_owners"] = D["estimated_owners"].combine_first(D["owners"])
        D.drop(columns=["owners"], inplace=True)
    elif "owners" in D.columns:
        D.rename(columns={"owners": "estimated_owners"}, inplace=True)

    return D



def flatten_values(vals):
    """
    Flatten values (and lists stored as strings) into one de-duplicated list
    of trimmed strings.

    - A string of the form "[...]" that parses as a Python list is expanded
      into its items.
    - Every other value is converted with ``str`` and trimmed.
    - Missing values (None, NaN, pd.NA) and empty strings are skipped, so
      they no longer show up as "nan"/"None"/"" entries.
    - Order of first occurrence is preserved.

    Args:
        vals: iterable of values.

    Returns:
        list[str] without duplicates.
    """
    def _is_missing(v):
        # Scalar-only check: pd.isna() on a list returns an array, which
        # cannot be used in a boolean test.
        if v is None or v is pd.NA:
            return True
        return isinstance(v, (float, np.floating)) and bool(np.isnan(v))

    flat = []
    for v in vals:
        if _is_missing(v):
            continue
        if isinstance(v, str):
            v = v.strip()
            if v.startswith("[") and v.endswith("]"):
                try:
                    sublist = ast.literal_eval(v)
                except Exception:
                    # Not a Python literal: fall through, keep the raw text.
                    sublist = None
                if isinstance(sublist, list):
                    for s in sublist:
                        if _is_missing(s):
                            continue
                        item = str(s).strip()
                        if item:
                            flat.append(item)
                    continue
        text = str(v).strip()
        if text:
            flat.append(text)
    return list(dict.fromkeys(flat))

def combine_cols(row: pd.Series, cols: list[str]) -> str:
    """
    Combine the values of several columns of one row into a single string.

    For each column in ``cols`` the row's value is read (a missing key or
    ``None`` is skipped), lists and NumPy arrays are passed to
    ``flatten_values`` as they are and scalars as a one-element list. The
    resulting items are de-duplicated in order and joined with ", ".
    """
    collected = []
    for name in cols:
        value = row.get(name, None)
        if value is None:
            continue
        as_sequence = value if isinstance(value, (list, np.ndarray)) else [value]
        collected.extend(flatten_values(as_sequence))
    return ", ".join(dict.fromkeys(collected))


def merge_developers_publishers(D: pd.DataFrame) -> pd.DataFrame:
    """
    Combine the developer and publisher columns of ``D`` (in place).

    - ``developers`` is built from ``developer`` + ``developers``.
    - ``publishers`` is built from ``publisher`` + ``publishers``.
    - The singular columns (``developer``, ``publisher``) are then dropped.

    Returns:
        The same ``D`` object.
    """
    combined_from = {
        "developers": ["developer", "developers"],
        "publishers": ["publisher", "publishers"],
    }
    for target, source_cols in combined_from.items():
        D[target] = D.apply(lambda row, source_cols=source_cols: combine_cols(row, source_cols), axis=1)

    singular_cols = [name for name in ("developer", "publisher") if name in D.columns]
    for name in singular_cols:
        D.drop(columns=[name], inplace=True)

    return D

def parse_categories(val) -> list[str]:
    """
    Turn a category value into a list of trimmed, non-empty strings.

    - ``None`` / NaN                -> []
    - list or NumPy array           -> its non-empty string items
    - string "[...]" that parses as a list -> its non-empty string items
    - string containing ";"         -> the non-empty parts
    - any other non-blank string    -> a one-element list
    - anything else                 -> []
    """
    def _clean_strings(items):
        return [str(v).strip() for v in items if isinstance(v, str) and v.strip()]

    if val is None:
        return []
    if isinstance(val, (float, np.floating)) and np.isnan(val):
        return []
    if isinstance(val, (list, np.ndarray)):
        return _clean_strings(val)
    if not isinstance(val, str):
        return []

    text = val.strip()
    if not text:
        return []

    if text.startswith("[") and text.endswith("]"):
        try:
            parsed = ast.literal_eval(text)
            if isinstance(parsed, list):
                return _clean_strings(parsed)
        except Exception:
            # Not a Python literal: treat it as plain text below.
            pass

    if ";" in text:
        return [part.strip() for part in text.split(";") if part.strip()]
    return [text]

def combine_categories(row: pd.Series) -> str:
    """
    Combine the category columns (A, B, C) of one row into a single string.

    Source A is read from ``categories_a`` (falling back to ``categories``),
    B from ``categories_b`` and C from ``categories_c``. Items are
    de-duplicated case-insensitively, keeping the first spelling seen, and
    joined with ", ".
    """
    raw_a = row.get("categories_a", row.get("categories", None))
    ordered = (
        parse_categories(raw_a)
        + parse_categories(row.get("categories_b", None))
        + parse_categories(row.get("categories_c", None))
    )

    first_spelling = {}
    for category in ordered:
        first_spelling.setdefault(category.lower(), category)

    return ", ".join(first_spelling.values())

def merge_categories(D: pd.DataFrame) -> pd.DataFrame:
    """
    Combine the sources' category columns into one ``categories`` column.

    ``categories`` is (re)computed per row with ``combine_categories``; every
    other column whose name contains "categor" (as found before the
    computation) is then dropped. ``D`` is modified in place and returned.
    """
    leftovers = [
        name for name in D.columns
        if "categor" in name.lower() and name != "categories"
    ]
    D["categories"] = D.apply(combine_categories, axis=1)

    for name in leftovers:
        D.drop(columns=[name], inplace=True, errors="ignore")

    return D


def merge_tags_column(D: pd.DataFrame, a: pd.DataFrame, b: pd.DataFrame, c: pd.DataFrame) -> pd.DataFrame:
    """
    Build a long-format tag table for the apps in ``D``.

    Tags are read from the ``tags`` column of each source, where present:
      - A: a comma-separated string; every tag gets weight 1.
      - B: a dict ``{tag: weight}``.
      - C: the string representation of such a dict.
    For one app the dicts are merged in the order A, B, C, so a later source
    overrides the weight of an earlier one.

    Returns:
        DataFrame with the columns ``appid``, ``tag_name``, ``weight``
        (one row per app/tag pair). The columns exist even when no tag was
        found.
    """
    tags_a_dict = {}
    if 'tags' in a.columns:
        for appid, tags_str in zip(a['appid'], a['tags']):
            if isinstance(tags_str, str):
                tags_list = [t.strip() for t in tags_str.split(",") if t.strip()]
                tags_a_dict[str(appid)] = {t: 1 for t in tags_list}

    tags_b_dict = {}
    if 'tags' in b.columns:
        for appid, tags_json in zip(b['appid'], b['tags']):
            if isinstance(tags_json, dict):
                tags_b_dict[str(appid)] = tags_json

    tags_c_dict = {}
    if 'tags' in c.columns:
        for appid, tags_str in zip(c['appid'], c['tags']):
            if isinstance(tags_str, str):
                try:
                    tags_dict = ast.literal_eval(tags_str)
                except Exception:
                    # Best effort, but no bare "except:", which would also
                    # swallow KeyboardInterrupt/SystemExit.
                    continue
                if isinstance(tags_dict, dict):
                    tags_c_dict[str(appid)] = tags_dict

    tag_rows = []
    for appid in D['appid']:
        key = str(appid)
        tag_dict = {}
        tag_dict.update(tags_a_dict.get(key, {}))
        tag_dict.update(tags_b_dict.get(key, {}))
        tag_dict.update(tags_c_dict.get(key, {}))

        for t, w in tag_dict.items():
            tag_rows.append({"appid": appid, "tag_name": t, "weight": w})

    # Explicit columns keep the schema stable for an empty result.
    tags_df = pd.DataFrame(tag_rows, columns=["appid", "tag_name", "weight"])
    return tags_df


def save_merged(D, path):
    """
    Write ``D`` to ``merged_master.csv`` inside ``path`` (UTF-8 with BOM,
    without the index) and return the full path of the written file.
    """
    target = os.path.join(path, "merged_master.csv")
    D.to_csv(target, encoding="utf-8-sig", index=False)
    return target


# ======== MAIN ========
def main():
    """
    Run the whole merge: load sources A, B and C, write the A and B
    intermediate CSVs, build the master table and save it.
    """
    warnings.filterwarnings("ignore", category=FutureWarning)
    logging.info("=== Starting merge process ===")

    # Source A, plus a snapshot of the merged A tables.
    a = load_source_a(A_PATH)
    a.to_csv(os.path.join(OUTPUT_PATH, "A_merged.csv"), index=False, encoding="utf-8")

    # Source B, plus a snapshot of the full frame.
    b = load_source_b(B_PATH)
    b_output_file = os.path.join(OUTPUT_PATH, "B_full.csv")
    b.to_csv(b_output_file, index=False, encoding="utf-8")
    logging.info("B full DataFrame written to CSV")

    c = load_source_c(C_PATH)

    # To merge only selected columns, pass e.g. columns_to_merge=["owners"].
    D = merge_and_finalize(a, b, c)

    saved_to = save_merged(D, OUTPUT_PATH)
    logging.info(f"Merged master table saved to: {saved_to}")

    logging.info("=== Merge process successfully completed ===")

if __name__ == "__main__":
    main()

[2025-11-02 11:37:29] INFO: === Starting merge process ===
[2025-11-02 11:37:29] INFO: Loaded: steam.csv (27075 rows)
[2025-11-02 11:37:31] INFO: Loaded: steam_description_data_cleaned.csv (27334 rows)
[2025-11-02 11:37:32] INFO: Loaded: steam_media_data.csv (27332 rows)
[2025-11-02 11:37:32] INFO: Loaded: steam_support_info.csv (27136 rows)
[2025-11-02 11:37:33] INFO: Loaded: steamspy_tag_data.csv (29022 rows)
[2025-11-02 11:37:34] INFO: Loaded: steam_requirements_data.csv (27319 rows)
[2025-11-02 11:37:34] INFO: A source merged: 27075 rows
[2025-11-02 11:37:53] INFO: B source loaded from JSON: 111452 rows
[2025-11-02 11:38:07] INFO: B full DataFrame written to CSV
[2025-11-02 11:38:18] INFO: Loaded: games_march2025_cleaned.csv (89618 rows)
[2025-11-02 11:38:28] INFO: Loaded: games_march2025_full.csv (94948 rows)
[2025-11-02 11:38:36] INFO: Loaded: games_may2024_cleaned.csv (83646 rows)
[2025-11-02 11:38:45] INFO: Loaded: games_may2024_full.csv (87806 rows)
[2025-11-02 11:38:45] INFO:

In [54]:
import os
import csv
import ast
import pandas as pd
import numpy as np
import logging
import json
import re
from bs4 import BeautifulSoup
from typing import Any
import matplotlib.pyplot as plt
import warnings
import sys

# ======== PATHS ========
# D_PATH holds the merged master table; the split tables go to "split".
BASE_PATH = r"C:\Users\zalma"
D_PATH = os.path.join(BASE_PATH, "merge")
OUTPUT_PATH = os.path.join(BASE_PATH, "split")

# ======== LOGGING CONFIGURATION ========
# The output folder must exist before the FileHandler opens the log file.
os.makedirs(OUTPUT_PATH, exist_ok=True)
LOG_FILE = os.path.join(OUTPUT_PATH, "merge_log.txt")

# force=True replaces the handlers installed by an earlier basicConfig()
# call in the same kernel. Without it this call is silently ignored and the
# messages of this cell keep going to the previously configured log file.
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(levelname)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.FileHandler(LOG_FILE, encoding="utf-8"), logging.StreamHandler()],
    force=True,
)

# ======== HELPER FUNCTIONS ========
def load_csv_safely(path: str, **kwargs: Any) -> pd.DataFrame:
    """
    Read a CSV file, falling back to an empty DataFrame on any error.

    Args:
        path: location of the CSV file.
        **kwargs: passed through unchanged to ``pd.read_csv``.

    Returns:
        The parsed frame, or an empty ``pd.DataFrame`` if loading failed
        (the failure is logged, not raised).
    """
    try:
        loaded = pd.read_csv(path, **kwargs)
        short_name = os.path.basename(path)
        logging.info(f"Loaded: {short_name} ({len(loaded)} rows)")
    except Exception as e:
        logging.error(f"Error loading {path}: {e}")
        loaded = pd.DataFrame()
    return loaded


# ======== Segédfüggvények a splittelt táblákhoz ========
def create_media_table(master_df: pd.DataFrame, output_dir: str = None) -> pd.DataFrame:
    """
    Build the media table from the merged master frame.

    The table holds ``appid`` and ``header_image`` (whichever of them exist
    in ``master_df``), keeps only rows that have a header image and gets a
    ``mediaid`` column numbered from 1.

    If ``master_df`` has no ``header_image`` column, the table is empty
    (previously this raised KeyError in ``dropna``).

    Args:
        master_df: the merged master frame.
        output_dir: if given, the table is also written to
            ``<output_dir>/media.csv``.

    Returns:
        The media frame.
    """
    media_cols = ["appid", "header_image"]
    media_df = master_df[[c for c in media_cols if c in master_df.columns]].copy()

    if "header_image" in media_df.columns:
        media_df = media_df.dropna(subset=["header_image"])
    else:
        # Without the column no row can carry a header image.
        media_df = media_df.iloc[0:0]
    media_df = media_df.reset_index(drop=True)

    media_df.insert(0, "mediaid", range(1, len(media_df)+1))

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        path = os.path.join(output_dir, "media.csv")
        media_df.to_csv(path, index=False)
        logging.info(f"Saved 'media.csv' ({len(media_df)} rows) to {output_dir}")

    return media_df

def join_urls(x) -> str:
    """
    Normalise a URL value to a single string.

    - A string is returned unchanged.
    - A list is joined with ', '.
    - Anything else gives an empty string.
    """
    if isinstance(x, str):
        return x
    if isinstance(x, list):
        return ", ".join(x)
    return ""


def create_screenshots_table(master_df: pd.DataFrame, output_dir: str = None) -> pd.DataFrame:
    """
    Build the screenshots table from the master frame.

    - Takes ``appid`` plus ``screenshots_full`` and ``screenshots_thumb``
      (whichever exist).
    - Converts the URL columns to strings with ``join_urls``.
    - Keeps only rows where at least one URL column is non-empty.
    - Adds a ``screenshotid`` column numbered from 1.
    - Optionally writes ``<output_dir>/screenshots.csv``.

    If neither URL column exists the result is an empty table (previously
    the filter collapsed to a scalar ``False`` and raised KeyError).

    Returns:
        The screenshots frame.
    """
    url_cols = [c for c in ["screenshots_full", "screenshots_thumb"] if c in master_df.columns]

    df = master_df[["appid"] + url_cols].copy()

    for c in url_cols:
        df[c] = df[c].apply(join_urls)

    # Always a boolean Series, even when url_cols is empty.
    has_any_url = pd.Series(False, index=df.index)
    for c in url_cols:
        has_any_url = has_any_url | (df[c] != "")
    df = df[has_any_url].reset_index(drop=True)

    df.insert(0, "screenshotid", range(1, len(df) + 1))

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        path = os.path.join(output_dir, "screenshots.csv")
        df.to_csv(path, index=False)
        logging.info(f"Saved 'screenshots.csv' ({len(df)} rows) to {output_dir}")

    return df


def create_movies_table(master_df: pd.DataFrame, output_dir: str = None) -> pd.DataFrame:
    """
    Build the movies table from the master frame.

    - Takes ``appid`` plus ``movies_max``, ``movies_thumbnail`` and
      ``movies_480`` (whichever exist).
    - Converts the URL columns to strings with ``join_urls``.
    - Keeps only rows where at least one URL column is non-empty.
    - Adds a ``movieid`` column numbered from 1.
    - Optionally writes ``<output_dir>/movies.csv``.

    If none of the URL columns exists the result is an empty table
    (previously the filter collapsed to a scalar ``False`` and raised
    KeyError).

    Returns:
        The movies frame.
    """
    url_cols = [c for c in ["movies_max", "movies_thumbnail", "movies_480"] if c in master_df.columns]

    df = master_df[["appid"] + url_cols].copy()

    for c in url_cols:
        df[c] = df[c].apply(join_urls)

    # Keep rows with at least one non-empty URL column; the mask is always a
    # boolean Series, even when url_cols is empty.
    has_any_url = pd.Series(False, index=df.index)
    for c in url_cols:
        has_any_url = has_any_url | (df[c] != "")
    df = df[has_any_url].reset_index(drop=True)

    df.insert(0, "movieid", range(1, len(df) + 1))

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        path = os.path.join(output_dir, "movies.csv")
        df.to_csv(path, index=False)
        logging.info(f"Saved 'movies.csv' ({len(df)} rows) to {output_dir}")

    return df

def create_support_table(master_df: pd.DataFrame, output_dir: str = None) -> pd.DataFrame:
    """
    Build the 'support' table from the master DataFrame.

    Output columns:
      - supportid (generated, starting at 1)
      - appid
      - support_url   (only if present in ``master_df``)
      - support_email (only if present in ``master_df``)

    Missing values are turned into empty strings, and only rows with a
    non-empty URL or e-mail are kept. When ``output_dir`` is given the table
    is also written to ``<output_dir>/support.csv``.

    Returns:
        The resulting DataFrame (empty, but with its columns, when neither
        support column is present in ``master_df``).
    """
    support_cols = [c for c in ("support_url", "support_email") if c in master_df.columns]

    df = master_df[["appid"] + support_cols].copy()

    for c in support_cols:
        df[c] = df[c].fillna("").astype(str)

    # Build the mask only from the columns that really exist. With neither
    # column present this yields an all-False Series instead of the scalar
    # ``False`` that made ``df[False]`` raise a KeyError.
    has_contact = (df[support_cols] != "").any(axis=1)
    df = df[has_contact].reset_index(drop=True)

    df.insert(0, "supportid", range(1, len(df) + 1))

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        path = os.path.join(output_dir, "support.csv")
        df.to_csv(path, index=False)
        logging.info(f"Saved 'support.csv' ({len(df)} rows) to {output_dir}")

    return df

def clean_requirements_text(text):
    """
    Reduce an HTML requirements snippet to a single line of plain text.

    Falsy or missing input yields "". Otherwise the value is parsed as HTML,
    every <br> is replaced by a space and every <li> by its text followed by
    ", ". The extracted text then goes through four regex clean-ups, in this
    order: collapse whitespace, drop a trailing comma, drop leading
    brackets/quotes/commas/whitespace, drop a leading "minimum"/"recommended"
    label (case-insensitive).
    """
    if not text or pd.isna(text):
        return ""

    soup = BeautifulSoup(str(text), "html.parser")

    for line_break in soup.find_all("br"):
        line_break.replace_with(" ")
    for item in soup.find_all("li"):
        item.replace_with(f"{item.get_text()}, ")

    result = soup.get_text(separator=" ").strip()

    # Order matters: each pattern assumes the previous ones already ran.
    cleanup_steps = (
        (r'\s+', ' '),
        (r',\s*$', ''),
        (r'^[\)\("\'\s,]+', ''),
        (r'(?i)^(minimum|recommended)[:\s-]*', ''),
    )
    for pattern, replacement in cleanup_steps:
        result = re.sub(pattern, replacement, result)

    return result.strip()


def split_min_rec(text):
    """
    Split a requirements string into its minimum and recommended parts.

    The text is cut at the first case-insensitive occurrence of
    "Recommended" (plus any colons/whitespace right after it). Returns a
    ``(minimum, recommended)`` tuple of stripped strings; the second element
    is "" when the marker does not occur, and both are "" for falsy or
    missing input.
    """
    if not text or pd.isna(text):
        return "", ""

    stripped = str(text).strip()
    marker = re.search(r"(?i)Recommended[:\s]*", stripped)
    if marker is None:
        return stripped, ""

    before = stripped[:marker.start()].strip()
    after = stripped[marker.end():].strip()
    return before, after


def _requirement_rows(appid, os_name: str, raw_value) -> list:
    """Convert one raw '<os>_requirements' cell into rows of the requirements table."""
    if pd.isna(raw_value):
        return []

    raw_text = str(raw_value).strip()
    # Stringified empty containers / missing markers carry no requirements.
    if not raw_text or raw_text in ("[]", "{}", "nan", "None"):
        return []

    try:
        parsed = ast.literal_eval(raw_value)
        min_raw = parsed.get('minimum', "")
        rec_raw = parsed.get('recommended', "")
    except Exception:
        # Best effort: anything that is not a dict literal is treated as one
        # free-text blob and split on its "Recommended" marker further below.
        min_raw, rec_raw = raw_value, ""

    min_text = clean_requirements_text(min_raw)
    rec_text = clean_requirements_text(rec_raw)

    rows = []
    if min_text:
        # The minimum text may still carry an embedded "Recommended" part.
        min_text, embedded_rec = split_min_rec(min_text)
        if min_text:
            rows.append({'appid': appid, 'os': os_name, 'type': 'minimum', 'requirements': min_text})
        if embedded_rec:
            rows.append({'appid': appid, 'os': os_name, 'type': 'recommended', 'requirements': embedded_rec})
    if rec_text:
        rows.append({'appid': appid, 'os': os_name, 'type': 'recommended', 'requirements': rec_text})
    return rows


def create_requirements_table(master_df: pd.DataFrame, output_dir: str = None) -> pd.DataFrame:
    """
    Build the 'requirements' table from the master DataFrame.

    For every game the 'pc_requirements', 'mac_requirements' and
    'linux_requirements' cells are parsed (in that order) into rows with the
    columns 'appid', 'os' ('windows' / 'mac' / 'linux'), 'type'
    ('minimum' / 'recommended') and 'requirements' (cleaned text).
    A 1-based 'reqid' column is prepended.

    All three operating systems go through the same parsing, so placeholder
    values such as "[]" or "{}" are skipped for Windows as well instead of
    being emitted as a requirements row.

    When ``output_dir`` is given the table is also written to
    ``<output_dir>/requirements.csv``.

    Returns:
        The requirements DataFrame (empty, but with its columns, when no
        requirement text is found).
    """
    os_columns = (
        ('windows', 'pc_requirements'),
        ('mac', 'mac_requirements'),
        ('linux', 'linux_requirements'),
    )

    rows = []
    for _, r in master_df.iterrows():
        appid = r['appid']
        for os_name, column in os_columns:
            rows.extend(_requirement_rows(appid, os_name, r.get(column, "")))

    # Explicit columns keep the schema (and the CSV header) for an empty result.
    df_req = pd.DataFrame(rows, columns=['appid', 'os', 'type', 'requirements'])
    df_req.insert(0, 'reqid', range(1, len(df_req) + 1))

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        path = os.path.join(output_dir, "requirements.csv")
        df_req.to_csv(path, index=False)
        logging.info(f"Saved 'requirements.csv' ({len(df_req)} rows) to {output_dir}")

    return df_req


def create_genres_flat(master_df: pd.DataFrame, output_dir: str = None):
    """
    Build the 'genres' and 'game_genre' tables.

    - game_genre: appid + genreid
    - genres: genreid + genre_name (the original, unsplit 'genres' value)

    Rows whose 'genres' value is missing, blank, "[]" or "['']" are skipped.
    When ``output_dir`` is given both tables are written there as
    'genres.csv' and 'game_genre.csv'.

    Returns:
        ``(genres_df, game_genre_df)``; both keep their columns even when no
        game has a usable 'genres' value.
    """
    rows = []

    for _, row in master_df.iterrows():
        appid = row["appid"]
        genres_raw = row.get("genres", "")

        # A missing value would otherwise be stringified to "nan" and stored
        # as if it were a genre name.
        if pd.api.types.is_scalar(genres_raw) and pd.isna(genres_raw):
            continue

        text = str(genres_raw).strip()
        if text in ["", "[]", "['']"]:
            continue

        rows.append({"appid": appid, "genre_name": text})

    # Explicit columns so that an empty result can still be sliced below.
    df_flat = (
        pd.DataFrame(rows, columns=["appid", "genre_name"])
        .drop_duplicates()
        .reset_index(drop=True)
    )

    df_flat.insert(1, "genreid", range(1, len(df_flat) + 1))

    game_genre_df = df_flat[['appid', 'genreid']].copy()
    genres_df = df_flat[['genreid', 'genre_name']].copy()

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        genres_path = os.path.join(output_dir, "genres.csv")
        game_genre_path = os.path.join(output_dir, "game_genre.csv")
        genres_df.to_csv(genres_path, index=False)
        game_genre_df.to_csv(game_genre_path, index=False)
        logging.info(f"Saved 'genres.csv' ({len(genres_df)} rows) to {output_dir}")
        logging.info(f"Saved 'game_genre.csv' ({len(game_genre_df)} rows) to {output_dir}")

    return genres_df, game_genre_df

def _platform_flag(value) -> bool:
    """Interpret a raw platform cell as a boolean; missing values count as False."""
    if value is None:
        return False
    # bool(NaN) is True, so missing cells must be caught before the cast.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return False
    if isinstance(value, str):
        # bool("False") is True as well; recognise the usual textual negatives.
        return value.strip().lower() not in ("", "0", "0.0", "false", "f", "no", "n", "nan", "none")
    return bool(value)


def create_platforms_flat(master_df: pd.DataFrame, output_dir: str = None):
    """
    Build the 'platforms' and 'game_platform' tables.

    - game_platform: appid + platid
    - platforms: platid + boolean 'windows' / 'linux' / 'mac' columns

    The flags are read from the 'windows', 'linux' and 'mac' columns of
    ``master_df``; an absent column or a missing value counts as False.
    When ``output_dir`` is given both tables are written there as
    'platforms.csv' and 'game_platform.csv'.

    Returns:
        ``(platforms_df, game_platform_df)``; both keep their columns even
        for an empty ``master_df``.
    """
    rows = []

    for _, row in master_df.iterrows():
        rows.append({
            "appid": row["appid"],
            "windows": _platform_flag(row.get("windows", False)),
            "linux": _platform_flag(row.get("linux", False)),
            "mac": _platform_flag(row.get("mac", False)),
        })

    # Explicit columns so that an empty result can still be sliced below.
    df_flat = (
        pd.DataFrame(rows, columns=["appid", "windows", "linux", "mac"])
        .drop_duplicates()
        .reset_index(drop=True)
    )

    df_flat.insert(1, "platid", range(1, len(df_flat) + 1))

    game_platform_df = df_flat[['appid', 'platid']].copy()
    platforms_df = df_flat[['platid', 'windows', 'linux', 'mac']].copy()

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        platforms_path = os.path.join(output_dir, "platforms.csv")
        game_platform_path = os.path.join(output_dir, "game_platform.csv")
        platforms_df.to_csv(platforms_path, index=False)
        game_platform_df.to_csv(game_platform_path, index=False)
        logging.info(f"Saved 'platforms.csv' ({len(platforms_df)} rows) to {output_dir}")
        logging.info(f"Saved 'game_platform.csv' ({len(game_platform_df)} rows) to {output_dir}")

    return platforms_df, game_platform_df

def clean_packages(master_df: pd.DataFrame, output_dir: str = None):
    """
    Build the 'game_package', 'packages' and 'sub_package' tables.

    The 'packages' cell of every game is parsed with ``ast.literal_eval`` and
    must evaluate to a list of dicts; cells that are missing, blank,
    unparsable or not a list are skipped. Each package with a non-empty
    title gets a running 'packid' (starting at 1) and contributes:

    - game_package: appid + packid
    - packages: packid + title + description
    - sub_package: packid + sub_text + price (one row per entry in 'subs')

    Malformed entries (non-dict items, ``None`` for title / description /
    text / subs) are tolerated instead of aborting the whole run.

    When ``output_dir`` is given the three tables are written there as CSV
    files; without it nothing is written.

    Returns:
        ``(df_game_package, df_packages, df_sub_package)``; each keeps its
        columns even when empty.
    """
    rows_game_package = []
    rows_packages = []
    rows_sub_package = []

    packid_counter = 1

    for _, row in master_df.iterrows():
        appid = row["appid"]
        packages_raw = row.get("packages", "")

        if pd.isna(packages_raw) or not str(packages_raw).strip():
            continue

        try:
            packages_list = ast.literal_eval(packages_raw)
        except Exception:
            # Best effort: an unparsable cell is ignored.
            continue

        if not isinstance(packages_list, list):
            continue

        for pkg in packages_list:
            if not isinstance(pkg, dict):
                continue

            # ``or ""`` covers keys that are present but hold None.
            title = str(pkg.get("title") or "").strip()
            description = str(pkg.get("description") or "").strip()

            if not title:
                continue

            rows_game_package.append({"appid": appid, "packid": packid_counter})
            rows_packages.append({"packid": packid_counter, "title": title, "description": description})

            for sub in pkg.get("subs") or []:
                if not isinstance(sub, dict):
                    continue
                sub_text = str(sub.get("text") or "").strip()
                price = sub.get("price", None)
                rows_sub_package.append({"packid": packid_counter, "sub_text": sub_text, "price": price})

            packid_counter += 1

    # Explicit columns keep the schema (and the CSV headers) for empty results.
    df_game_package = pd.DataFrame(rows_game_package, columns=["appid", "packid"])
    df_packages = pd.DataFrame(rows_packages, columns=["packid", "title", "description"])
    df_sub_package = pd.DataFrame(rows_sub_package, columns=["packid", "sub_text", "price"])

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        df_game_package.to_csv(os.path.join(output_dir, "game_package.csv"), index=False)
        df_packages.to_csv(os.path.join(output_dir, "packages.csv"), index=False)
        df_sub_package.to_csv(os.path.join(output_dir, "sub_package.csv"), index=False)

        logging.info(f"Saved game_package.csv ({len(df_game_package)} rows)")
        logging.info(f"Saved packages.csv ({len(df_packages)} rows)")
        logging.info(f"Saved sub_package.csv ({len(df_sub_package)} rows)")

    return df_game_package, df_packages, df_sub_package

def create_developer_tables(master_df: pd.DataFrame, output_dir: str = None):
    """
    Build the 'developers' and 'game_developer' tables with exactly one row
    per game, even when the game has several developers.

    - game_developer: appid + devid (generated, starting at 1)
    - developers: devid + developer_name (all developers joined with ", ")

    The 'developers' cell may be a list/tuple of names or a comma-separated
    string; missing or empty cells are skipped. When ``output_dir`` is given
    both tables are written there as 'developers.csv' and
    'game_developer.csv'.

    Returns:
        ``(developers_df, game_developer_df)``; both keep their columns even
        when no game has a developer.
    """
    rows = []

    for _, row in master_df.iterrows():
        appid = row["appid"]
        devs_raw = row.get("developers", "")

        if isinstance(devs_raw, (list, tuple)):
            # Handled before pd.isna(): on a sequence pd.isna() returns an
            # array whose truth value is ambiguous and raises ValueError.
            dev_list = [str(d).strip() for d in devs_raw if str(d).strip()]
        elif not devs_raw or pd.isna(devs_raw):
            continue
        else:
            dev_list = [d.strip() for d in str(devs_raw).split(",") if d.strip()]

        if not dev_list:
            continue

        rows.append({"appid": appid, "developer_name": ", ".join(dev_list)})

    # Explicit columns so that an empty result can still be extended and sliced.
    df_flat = pd.DataFrame(rows, columns=["appid", "developer_name"]).reset_index(drop=True)

    df_flat.insert(1, "devid", range(1, len(df_flat) + 1))

    game_developer_df = df_flat[['appid', 'devid']].copy()
    developers_df = df_flat[['devid', 'developer_name']].copy()

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        developers_path = os.path.join(output_dir, "developers.csv")
        game_developer_path = os.path.join(output_dir, "game_developer.csv")
        developers_df.to_csv(developers_path, index=False)
        game_developer_df.to_csv(game_developer_path, index=False)
        logging.info(f"Saved 'developers.csv' ({len(developers_df)} rows) to {output_dir}")
        logging.info(f"Saved 'game_developer.csv' ({len(game_developer_df)} rows) to {output_dir}")

    return developers_df, game_developer_df

def create_publisher_tables(master_df: pd.DataFrame, output_dir: str = None):
    """
    Build the 'publishers' and 'game_publisher' tables with exactly one row
    per game, even when the game has several publishers.

    - game_publisher: appid + pubid (generated, starting at 1)
    - publishers: pubid + publisher_name (all publishers joined with ", ")

    The 'publishers' cell may be a list/tuple of names or a comma-separated
    string; missing or empty cells are skipped. When ``output_dir`` is given
    both tables are written there as 'publishers.csv' and
    'game_publisher.csv'.

    Returns:
        ``(publishers_df, game_publisher_df)``; both keep their columns even
        when no game has a publisher.
    """
    rows = []

    for _, row in master_df.iterrows():
        appid = row["appid"]
        pubs_raw = row.get("publishers", "")

        if isinstance(pubs_raw, (list, tuple)):
            # Handled before pd.isna(): on a sequence pd.isna() returns an
            # array whose truth value is ambiguous and raises ValueError.
            pub_list = [str(p).strip() for p in pubs_raw if str(p).strip()]
        elif not pubs_raw or pd.isna(pubs_raw):
            continue
        else:
            pub_list = [p.strip() for p in str(pubs_raw).split(",") if p.strip()]

        if not pub_list:
            continue

        rows.append({"appid": appid, "publisher_name": ", ".join(pub_list)})

    # Explicit columns so that an empty result can still be extended and sliced.
    df_flat = pd.DataFrame(rows, columns=["appid", "publisher_name"]).reset_index(drop=True)

    df_flat.insert(1, "pubid", range(1, len(df_flat) + 1))

    game_publisher_df = df_flat[['appid', 'pubid']].copy()
    publishers_df = df_flat[['pubid', 'publisher_name']].copy()

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        publishers_path = os.path.join(output_dir, "publishers.csv")
        game_publisher_path = os.path.join(output_dir, "game_publisher.csv")
        publishers_df.to_csv(publishers_path, index=False)
        game_publisher_df.to_csv(game_publisher_path, index=False)
        logging.info(f"Saved 'publishers.csv' ({len(publishers_df)} rows) to {output_dir}")
        logging.info(f"Saved 'game_publisher.csv' ({len(game_publisher_df)} rows) to {output_dir}")

    return publishers_df, game_publisher_df

def create_categories_flat(master_df: pd.DataFrame, output_dir: str = None):
    """
    Build the 'categories' and 'game_category' tables.

    - game_category: appid + catid
    - categories: catid + name (the original, unsplit 'categories' value)

    Rows whose 'categories' value is missing, blank, "[]" or "['']" are
    skipped. When ``output_dir`` is given both tables are written there as
    'categories.csv' and 'game_category.csv'.

    Returns:
        ``(categories_df, game_category_df)``; both keep their columns even
        when no game has a usable 'categories' value.
    """
    rows = []

    for _, row in master_df.iterrows():
        appid = row["appid"]
        categories_raw = row.get("categories", "")

        # A missing value would otherwise be stringified to "nan" and stored
        # as if it were a category name.
        if pd.api.types.is_scalar(categories_raw) and pd.isna(categories_raw):
            continue

        text = str(categories_raw).strip()
        if text in ["", "[]", "['']"]:
            continue

        rows.append({"appid": appid, "name": text})

    # Explicit columns so that an empty result can still be sliced below.
    df_flat = (
        pd.DataFrame(rows, columns=["appid", "name"])
        .drop_duplicates()
        .reset_index(drop=True)
    )

    df_flat.insert(1, "catid", range(1, len(df_flat) + 1))

    game_category_df = df_flat[['appid', 'catid']].copy()
    categories_df = df_flat[['catid', 'name']].copy()

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        categories_path = os.path.join(output_dir, "categories.csv")
        game_category_path = os.path.join(output_dir, "game_category.csv")
        categories_df.to_csv(categories_path, index=False)
        game_category_df.to_csv(game_category_path, index=False)
        logging.info(f"Saved 'categories.csv' ({len(categories_df)} rows) to {output_dir}")
        logging.info(f"Saved 'game_category.csv' ({len(game_category_df)} rows) to {output_dir}")

    return categories_df, game_category_df


def create_tags_flat(master_df: pd.DataFrame, output_dir: str = None):
    """
    Build the 'game_tag' and 'tags' tables.

    - game_tag: appid + tagid
    - tags: tagid + tag_name + weight

    For every game the tag payload is taken from 'tags_x' or, when that is
    missing or an empty placeholder, from 'tags_y'. The payload may be a
    dict ``{tag_name: weight}`` or a list of dicts with 'tag_name' and
    'weight' keys, either as Python objects or as their string literal.
    Every accepted tag gets a running 'tagid' (starting at 1). Games whose
    payload cannot be parsed are skipped with a warning.

    When ``output_dir`` is given both tables are written there as
    'game_tag.csv' and 'tags.csv'.

    Returns:
        ``(game_tag_df, tags_df)``; both keep their columns even when empty.
    """
    def _has_payload(value) -> bool:
        # Containers are checked before pd.isna(), which is ambiguous on lists.
        if isinstance(value, (dict, list)):
            return bool(value)
        if isinstance(value, str):
            return value.strip() not in ("", "[]", "{}")
        return False

    rows_game_tag = []
    rows_tags = []
    tagid_counter = 1

    for _, row in master_df.iterrows():
        appid = row["appid"]

        # ``a or b`` does not work here: NaN is truthy, so a missing 'tags_x'
        # would hide a perfectly good 'tags_y'.
        tags_json = None
        for column in ("tags_x", "tags_y"):
            candidate = row.get(column)
            if _has_payload(candidate):
                tags_json = candidate
                break
        if tags_json is None:
            continue

        try:
            if isinstance(tags_json, str):
                tags_eval = ast.literal_eval(tags_json)
            else:
                tags_eval = tags_json

            if isinstance(tags_eval, dict):
                tags_list = [{"tag_name": k, "weight": v} for k, v in tags_eval.items()]
            elif isinstance(tags_eval, list):
                tags_list = tags_eval
            else:
                continue

            for t in tags_list:
                if isinstance(t, dict) and "tag_name" in t and "weight" in t:
                    rows_game_tag.append({"appid": appid, "tagid": tagid_counter})
                    rows_tags.append({"tagid": tagid_counter, "tag_name": t["tag_name"], "weight": t["weight"]})
                    tagid_counter += 1

        except Exception as e:
            logging.warning(f"Skipping tags for appid {appid}: {e}")
            continue

    # Explicit columns keep the schema (and the CSV headers) for empty results.
    game_tag_df = pd.DataFrame(rows_game_tag, columns=["appid", "tagid"])
    tags_df = pd.DataFrame(rows_tags, columns=["tagid", "tag_name", "weight"])

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        game_tag_df.to_csv(os.path.join(output_dir, "game_tag.csv"), index=False)
        tags_df.to_csv(os.path.join(output_dir, "tags.csv"), index=False)
        logging.info(f"Saved 'game_tag.csv' ({len(game_tag_df)} rows) to {output_dir}")
        logging.info(f"Saved 'tags.csv' ({len(tags_df)} rows) to {output_dir}")

    return game_tag_df, tags_df

def create_languages_flat(master_df: pd.DataFrame, output_dir: str = None):
    """
    Build the 'game_language' and 'languages' tables.

    - game_language: appid + langid
    - languages: langid + lang_name + audio (True when the language also
      appears in the game's 'full_audio_languages')

    Every (game, supported language) pair gets its own running 'langid',
    starting at 1. When ``output_dir`` is given both tables are written
    there as 'game_language.csv' and 'languages.csv'.

    Returns:
        ``(game_lang_df, languages_df)``.
    """
    def _language_names(cell):
        # Missing or blank cells carry no languages.
        if not (pd.notna(cell) and str(cell).strip()):
            return []
        try:
            parsed = ast.literal_eval(str(cell))
        except Exception:
            # Not a Python literal: treat the cell as a comma-separated string.
            return [v.strip() for v in str(cell).split(",") if v.strip()]
        if isinstance(parsed, list):
            return [v.strip() for v in parsed if isinstance(v, str) and v.strip()]
        if isinstance(parsed, str):
            return [v.strip() for v in parsed.split(",") if v.strip()]
        return []

    link_rows = []
    language_rows = []
    next_langid = 1

    for _, record in master_df.iterrows():
        appid = record["appid"]
        supported = _language_names(record.get("supported_languages", ""))
        full_audio = _language_names(record.get("full_audio_languages", ""))

        for lang in supported:
            link_rows.append({"appid": appid, "langid": next_langid})
            language_rows.append({
                "langid": next_langid,
                "lang_name": lang,
                "audio": lang in full_audio
            })
            next_langid += 1

    game_lang_df = pd.DataFrame(link_rows)
    languages_df = pd.DataFrame(language_rows)

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        game_lang_df.to_csv(os.path.join(output_dir, "game_language.csv"), index=False)
        languages_df.to_csv(os.path.join(output_dir, "languages.csv"), index=False)
        logging.info(f"Saved 'game_language.csv' ({len(game_lang_df)} rows) to {output_dir}")
        logging.info(f"Saved 'languages.csv' ({len(languages_df)} rows) to {output_dir}")

    return game_lang_df, languages_df

def create_description_table(master_df: pd.DataFrame, output_dir: str = None) -> pd.DataFrame:
    """
    Build the 'description' table from the master DataFrame.

    Output columns:
      - descriptionid (generated, starting at 1)
      - appid
      - detailed_description
      - about_the_game
      - short_description

    Only the description columns that exist in ``master_df`` are used, and
    only rows where at least one of them holds a non-missing, non-empty
    value are kept. When ``output_dir`` is given the table is also written
    to ``<output_dir>/description.csv``.

    Returns:
        The description DataFrame, or an empty DataFrame (after a warning)
        when ``master_df`` has none of the description columns.
    """
    desc_cols = [
        c for c in ("detailed_description", "about_the_game", "short_description")
        if c in master_df.columns
    ]

    # 'appid' alone must not count as "description columns found".
    if not desc_cols:
        logging.warning("No description columns found in master dataframe.")
        return pd.DataFrame()

    key_cols = ["appid"] if "appid" in master_df.columns else []
    df = master_df[key_cols + desc_cols].copy()

    # NaN != "" is True, so missing values have to be excluded explicitly;
    # otherwise rows without any description text would be kept.
    texts = df[desc_cols]
    has_text = (texts.notna() & (texts != "")).any(axis=1)
    df = df[has_text].reset_index(drop=True)

    df.insert(0, "descriptionid", range(1, len(df) + 1))

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        path = os.path.join(output_dir, "description.csv")
        df.to_csv(path, index=False)
        logging.info(f"Saved 'description.csv' ({len(df)} rows) to {output_dir}")

    return df

def create_game_table(master_df: pd.DataFrame, output_dir: str = None) -> pd.DataFrame:
    """
    Build the 'game' table from the master DataFrame.

    Keeps whichever of these columns exist in ``master_df``: appid, name,
    release_date, estimated_owners, required_age, price, dlc_count,
    recommendations, notes, website, metacritic_score, metacritic_url.
    'recommendations' is renamed to 'num_recommendations'. No rows are
    filtered out (games with an empty name stay in), and 'release_date' is
    passed through unchanged.

    When ``output_dir`` is given the table is also written to
    ``<output_dir>/game.csv`` (UTF-8 with BOM).

    Returns:
        The game DataFrame, or an empty DataFrame (after a warning) when
        ``master_df`` has no 'appid' column.
    """
    wanted = (
        "appid", "name", "release_date", "estimated_owners", "required_age",
        "price", "dlc_count", "recommendations", "notes",
        "website", "metacritic_score", "metacritic_url",
    )
    present = [column for column in wanted if column in master_df.columns]

    if "appid" not in present:
        logging.warning("Missing 'appid' column in master dataframe.")
        return pd.DataFrame()

    game_df = master_df[present].copy().reset_index(drop=True)

    if "recommendations" in game_df.columns:
        game_df = game_df.rename(columns={"recommendations": "num_recommendations"})

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        target = os.path.join(output_dir, "game.csv")
        game_df.to_csv(target, index=False, encoding="utf-8-sig")
        logging.info(f"Saved 'game.csv' ({len(game_df)} rows) to {output_dir}")

    return game_df


def main():
    """
    Entry point of the splitting step.

    Silences FutureWarning, loads 'merged_master.csv' from D_PATH via
    ``load_csv_safely`` and writes the 'game' table to OUTPUT_PATH.
    All other table builders are currently disabled (see below).
    """
    warnings.filterwarnings("ignore", category=FutureWarning)
    logging.info("=== Starting splitting process ===")
    D = load_csv_safely(os.path.join(D_PATH, "merged_master.csv"))

    # NOTE(review): the block below is a bare string literal, not code, so
    # none of these builders run. Presumably disabled to regenerate only
    # game.csv - confirm before relying on the other CSV files being fresh.
    '''
    media_df = create_media_table(D, output_dir=OUTPUT_PATH)
    screenshot_df = create_screenshots_table(D, output_dir=OUTPUT_PATH)
    movies_df = create_movies_table(D, output_dir=OUTPUT_PATH)
    support_df = create_support_table(D, output_dir=OUTPUT_PATH)
    requirements_df = create_requirements_table(D, output_dir=OUTPUT_PATH)
    platforms_df = create_platforms_flat(D, output_dir=OUTPUT_PATH)
    packages_df = clean_packages(D, output_dir=OUTPUT_PATH)
    developer_df = create_developer_tables(D, output_dir=OUTPUT_PATH)
    publisher_df = create_publisher_tables(D, output_dir=OUTPUT_PATH)
    genres_df = create_genres_flat(D, output_dir=OUTPUT_PATH)
    categories_df = create_categories_flat(D, output_dir=OUTPUT_PATH)
    tags_df = create_tags_flat(D, output_dir=OUTPUT_PATH)
    languages_df = create_languages_flat(D, output_dir=OUTPUT_PATH)
    '''
    # The description table is disabled as well.
    #description_df = create_description_table(D, output_dir=OUTPUT_PATH)
    game_df = create_game_table(D, output_dir=OUTPUT_PATH)


    
# Run the splitting step when this cell/script is executed directly.
if __name__ == "__main__":
    main()

[2025-11-02 13:12:56] INFO: === Starting splitting process ===
  df = pd.read_csv(path, **kwargs)
[2025-11-02 13:13:14] INFO: Loaded: merged_master.csv (112855 rows)
[2025-11-02 13:13:15] INFO: Saved 'game.csv' (112855 rows) to C:\Users\zalma\split


In [40]:
import pandas as pd

# Sanity check: list the rows of merged_master.csv whose appid does not
# occur in game.csv. Both files are read with dtype=str so appids are
# compared as text.
merged = pd.read_csv(r"C:\Users\zalma\merge\merged_master.csv", dtype=str)
game = pd.read_csv(r"C:\Users\zalma\split\game.csv", dtype=str)

missing = merged[~merged["appid"].isin(game["appid"])]
print(f"Hiányzó sorok száma: {len(missing)}")
display(missing[["appid", "name", "release_date"]])


Hiányzó sorok száma: 6


Unnamed: 0,appid,name,release_date
42051,396420,,2016-11-01
74096,1116910,,2019-09-25
79127,1365520,,2020-08-30
79245,1080790,,2019-07-12
79286,1256960,,2020-03-11
79368,1172120,,2020-01-23


In [49]:
import pandas as pd
import random

csv_path = r"C:\Users\zalma\merge\merged_master.csv"

# Load the full merged table
df = pd.read_csv(csv_path, encoding="utf-8")

# Keep only the rows whose 'sources' value is exactly 'B'
only_b = df[df["sources"].astype(str).str.strip() == "B"]

print(f"Találatok száma: {len(only_b)}")

if only_b.empty:
    print("⚠️ Nincs olyan játék, ami kizárólag a 'B' forrásból származik.")
else:
    # Pick one of them at random (unseeded, so the pick differs between runs)
    # and print every column of that row
    random_game = only_b.sample(1).iloc[0]
    print("\n=== Egy játék, ami csak 'B' forrásból van ===\n")
    for col, val in random_game.items():
        print(f"{col}:\n{val}\n{'-'*80}\n")


  df = pd.read_csv(csv_path, encoding="utf-8")


Találatok száma: 5731

=== Egy játék, ami csak 'B' forrásból van ===

appid:
1036970
--------------------------------------------------------------------------------

name:
HENTAI GIRL PUZZLE
--------------------------------------------------------------------------------

release_date:
2019-04-05
--------------------------------------------------------------------------------

english:
nan
--------------------------------------------------------------------------------

platforms:
nan
--------------------------------------------------------------------------------

required_age:
18.0
--------------------------------------------------------------------------------

categories:
Single-player, Steam Achievements
--------------------------------------------------------------------------------

genres:
['Casual']
--------------------------------------------------------------------------------

steamspy_tags:
nan
------------------------------------------------------------------------------

In [50]:
import pandas as pd

csv_path = 'C:/Users/zalma/merge/merged_master.csv'
# Single source of truth for the record to inspect, so the lookup and the
# "not found" message can no longer disagree about the appid.
target_appid = "10"

try:
    df = pd.read_csv(csv_path)

    # Compare appids as trimmed strings and keep only the requested record
    df["appid"] = df["appid"].astype(str).str.strip()
    record = df[df["appid"] == target_appid]

    if record.empty:
        print(f"⚠️ Nincs ilyen appid ({target_appid}) a fájlban.")
    else:
        print("✅ CSV loaded successfully!\n")
        # Print every column of the matching row, one block per column
        for col, val in record.iloc[0].items():
            print(f"{col}:\n{val}\n{'-'*80}\n")

except FileNotFoundError:
    print(f"❌ Error: The file was not found at {csv_path}")
except Exception as e:
    print(f"❌ An error occurred: {e}")


  df = pd.read_csv(csv_path)


✅ CSV loaded successfully!

appid:
10
--------------------------------------------------------------------------------

name:
Counter-Strike
--------------------------------------------------------------------------------

release_date:
2000-11-01
--------------------------------------------------------------------------------

english:
1.0
--------------------------------------------------------------------------------

platforms:
windows;mac;linux
--------------------------------------------------------------------------------

required_age:
0.0
--------------------------------------------------------------------------------

categories:
Multi-player, Online Multi-Player, Local Multi-Player, Valve Anti-Cheat enabled, PvP, Online PvP, Shared/Split Screen PvP, Family Sharing
--------------------------------------------------------------------------------

genres:
['Action']
--------------------------------------------------------------------------------

steamspy_tags:
Action;FPS;Multi

In [3]:
import pandas as pd

# Preview the first 10 rows of media.csv from the split folder; load errors
# are printed instead of raised.
csv_path = 'C:/Users/zalma/split/media.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,mediaid,appid,header_image
0,1,10,https://cdn.akamai.steamstatic.com/steam/apps/...
1,2,20,https://shared.akamai.steamstatic.com/store_it...
2,3,30,https://cdn.akamai.steamstatic.com/steam/apps/...
3,4,40,https://shared.akamai.steamstatic.com/store_it...
4,5,50,https://cdn.akamai.steamstatic.com/steam/apps/...
5,6,60,https://shared.akamai.steamstatic.com/store_it...
6,7,70,https://cdn.akamai.steamstatic.com/steam/apps/...
7,8,80,https://cdn.akamai.steamstatic.com/steam/apps/...
8,9,130,https://shared.akamai.steamstatic.com/store_it...
9,10,220,https://cdn.akamai.steamstatic.com/steam/apps/...


In [4]:
import pandas as pd

# Preview the first 10 rows of screenshots.csv from the split folder; load
# errors are printed instead of raised.
csv_path = 'C:/Users/zalma/split/screenshots.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,screenshotid,appid,screenshots_full,screenshots_thumb
0,1,10,['https://cdn.akamai.steamstatic.com/steam/app...,https://steamcdn-a.akamaihd.net/steam/apps/10/...
1,2,20,['https://shared.akamai.steamstatic.com/store_...,https://steamcdn-a.akamaihd.net/steam/apps/20/...
2,3,30,['https://cdn.akamai.steamstatic.com/steam/app...,https://steamcdn-a.akamaihd.net/steam/apps/30/...
3,4,40,['https://shared.akamai.steamstatic.com/store_...,https://steamcdn-a.akamaihd.net/steam/apps/40/...
4,5,50,['https://cdn.akamai.steamstatic.com/steam/app...,https://steamcdn-a.akamaihd.net/steam/apps/50/...
5,6,60,['https://shared.akamai.steamstatic.com/store_...,https://steamcdn-a.akamaihd.net/steam/apps/60/...
6,7,70,['https://cdn.akamai.steamstatic.com/steam/app...,https://steamcdn-a.akamaihd.net/steam/apps/70/...
7,8,80,['https://cdn.akamai.steamstatic.com/steam/app...,https://steamcdn-a.akamaihd.net/steam/apps/80/...
8,9,130,['https://shared.akamai.steamstatic.com/store_...,https://steamcdn-a.akamaihd.net/steam/apps/130...
9,10,220,['https://cdn.akamai.steamstatic.com/steam/app...,https://steamcdn-a.akamaihd.net/steam/apps/220...


In [5]:
import pandas as pd

# Preview the first 10 rows of movies.csv from the split folder; load errors
# are printed instead of raised.
csv_path = 'C:/Users/zalma/split/movies.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,movieid,appid,movies_max,movies_thumbnail,movies_480
0,1,10,[],,
1,2,20,[],,
2,3,30,[],,
3,4,40,[],,
4,5,50,[],,
5,6,60,[],,
6,7,70,[],,
7,8,80,[],,
8,9,130,[],,
9,10,220,['http://cdn.akamai.steamstatic.com/steam/apps...,https://steamcdn-a.akamaihd.net/steam/apps/904...,http://steamcdn-a.akamaihd.net/steam/apps/904/...


In [6]:
import pandas as pd

# Preview the first 10 rows of support.csv from the split folder; load
# errors are printed instead of raised.
csv_path = 'C:/Users/zalma/split/support.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,supportid,appid,support_url,support_email
0,1,10,http://steamcommunity.com/app/10,
1,2,50,https://help.steampowered.com,
2,3,70,http://steamcommunity.com/app/70,
3,4,80,http://steamcommunity.com/app/80,
4,5,130,https://help.steampowered.com,
5,6,220,http://steamcommunity.com/app/220,
6,7,240,http://steamcommunity.com/app/240,
7,8,400,http://steamcommunity.com/app/400,
8,9,440,http://steamcommunity.com/app/440,
9,10,500,http://steamcommunity.com/app/500,


In [7]:
import pandas as pd

# Preview cell: path of the requirements table to inspect.
csv_path = 'C:/Users/zalma/split/requirements.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    # Show only the first 10 rows as a quick sanity check.
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    # Any other failure in the try body is printed instead of being re-raised.
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,reqid,appid,os,type,requirements
0,1,10,windows,minimum,"500 mhz processor, 96mb ram, 16mb video card, ..."
1,2,10,windows,recommended,"800 mhz processor, 128mb ram, 32mb+ video card..."
2,3,10,mac,minimum,"OS X Snow Leopard 10.6.3, 1GB RAM, 4GB Hard Dr..."
3,4,10,linux,minimum,"Linux Ubuntu 12.04, Dual-core from Intel or AM..."
4,5,20,windows,minimum,"500 mhz processor, 96mb ram, 16mb video card, ..."
5,6,20,windows,recommended,"800 mhz processor, 128mb ram, 32mb+ video card..."
6,7,20,mac,minimum,"OS X Snow Leopard 10.6.3, 1GB RAM, 4GB Hard Dr..."
7,8,20,linux,minimum,"Linux Ubuntu 12.04, Dual-core from Intel or AM..."
8,9,30,windows,minimum,"500 mhz processor, 96mb ram, 16mb video card, ..."
9,10,30,windows,recommended,"800 mhz processor, 128mb ram, 32mb+ video card..."


In [8]:
import pandas as pd

# Preview cell: path of the game_platform link table to inspect.
csv_path = 'C:/Users/zalma/split/game_platform.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    # Show only the first 10 rows as a quick sanity check.
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    # Any other failure in the try body is printed instead of being re-raised.
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,appid,platid
0,10,1
1,20,2
2,30,3
3,40,4
4,50,5
5,60,6
6,70,7
7,80,8
8,130,9
9,220,10


In [9]:
import pandas as pd

# Preview cell: path of the platforms table to inspect.
csv_path = 'C:/Users/zalma/split/platforms.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    # Show only the first 10 rows as a quick sanity check.
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    # Any other failure in the try body is printed instead of being re-raised.
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,platid,windows,linux,mac
0,1,True,True,True
1,2,True,True,True
2,3,True,True,True
3,4,True,True,True
4,5,True,True,True
5,6,True,True,True
6,7,True,True,True
7,8,True,True,True
8,9,True,True,True
9,10,True,True,True


In [10]:
import pandas as pd

# Preview cell: path of the game_package link table to inspect.
csv_path = 'C:/Users/zalma/split/game_package.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    # Show only the first 10 rows as a quick sanity check.
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    # Any other failure in the try body is printed instead of being re-raised.
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,appid,packid
0,10,1
1,20,2
2,30,3
3,40,4
4,50,5
5,60,6
6,70,7
7,80,8
8,130,9
9,220,10


In [11]:
import pandas as pd

# Preview cell: path of the packages table to inspect.
csv_path = 'C:/Users/zalma/split/packages.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    # Show only the first 10 rows as a quick sanity check.
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    # Any other failure in the try body is printed instead of being re-raised.
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,packid,title,description
0,1,Buy Counter-Strike,
1,2,Buy Team Fortress Classic,
2,3,Buy Day of Defeat,
3,4,Buy Deathmatch Classic,
4,5,Buy Half-Life: Opposing Force,
5,6,Buy Ricochet,
6,7,Buy Half-Life,
7,8,Buy Counter-Strike: Condition Zero,
8,9,Buy Half-Life: Blue Shift,
9,10,Buy Half-Life 2,


In [12]:
import pandas as pd

# Preview cell: path of the sub_package table to inspect.
csv_path = 'C:/Users/zalma/split/sub_package.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    # Show only the first 10 rows as a quick sanity check.
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    # Any other failure in the try body is printed instead of being re-raised.
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,packid,sub_text,price
0,1,Counter-Strike: Condition Zero - $9.99,9.99
1,1,Counter-Strike - Commercial License - $9.99,9.99
2,2,Team Fortress Classic - $4.99,4.99
3,3,Day of Defeat - $4.99,4.99
4,3,Day of Defeat - Commercial License - $4.99,4.99
5,4,Deathmatch Classic - $4.99,4.99
6,5,Half-Life: Opposing Force - $4.99,4.99
7,6,Ricochet - $4.99,4.99
8,7,Half-Life - $9.99,9.99
9,7,Half-Life - Commercial License - $9.99,9.99


In [13]:
import pandas as pd

# Preview cell: path of the game_developer link table to inspect.
csv_path = 'C:/Users/zalma/split/game_developer.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    # Show only the first 10 rows as a quick sanity check.
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    # Any other failure in the try body is printed instead of being re-raised.
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,appid,devid
0,10,1
1,20,2
2,30,3
3,40,4
4,50,5
5,60,6
6,70,7
7,80,8
8,130,9
9,220,10


In [14]:
import pandas as pd

# Preview cell: path of the developers table to inspect.
csv_path = 'C:/Users/zalma/split/developers.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    # Show only the first 10 rows as a quick sanity check.
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    # Any other failure in the try body is printed instead of being re-raised.
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,devid,developer_name
0,1,Valve
1,2,Valve
2,3,Valve
3,4,Valve
4,5,Gearbox Software
5,6,Valve
6,7,Valve
7,8,Valve
8,9,Gearbox Software
9,10,Valve


In [15]:
import pandas as pd

# Preview cell: path of the game_publisher link table to inspect.
csv_path = 'C:/Users/zalma/split/game_publisher.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    # Show only the first 10 rows as a quick sanity check.
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    # Any other failure in the try body is printed instead of being re-raised.
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,appid,pubid
0,10,1
1,20,2
2,30,3
3,40,4
4,50,5
5,60,6
6,70,7
7,80,8
8,130,9
9,220,10


In [16]:
import pandas as pd

# Preview cell: path of the publishers table to inspect.
csv_path = 'C:/Users/zalma/split/publishers.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    # Show only the first 10 rows as a quick sanity check.
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    # Any other failure in the try body is printed instead of being re-raised.
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,pubid,publisher_name
0,1,Valve
1,2,Valve
2,3,Valve
3,4,Valve
4,5,Valve
5,6,Valve
6,7,Valve
7,8,Valve
8,9,Valve
9,10,Valve


In [17]:
import pandas as pd

# Preview cell: path of the game_genre link table to inspect.
csv_path = 'C:/Users/zalma/split/game_genre.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    # Show only the first 10 rows as a quick sanity check.
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    # Any other failure in the try body is printed instead of being re-raised.
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,appid,genreid
0,10,1
1,20,2
2,30,3
3,40,4
4,50,5
5,60,6
6,70,7
7,80,8
8,130,9
9,220,10


In [18]:
import pandas as pd

# Preview cell: path of the genres table to inspect.
csv_path = 'C:/Users/zalma/split/genres.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    # Show only the first 10 rows as a quick sanity check.
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    # Any other failure in the try body is printed instead of being re-raised.
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,genreid,genre_name
0,1,['Action']
1,2,['Action']
2,3,['Action']
3,4,['Action']
4,5,['Action']
5,6,['Action']
6,7,['Action']
7,8,['Action']
8,9,['Action']
9,10,['Action']


In [19]:
import pandas as pd

# Preview cell: path of the game_category link table to inspect.
csv_path = 'C:/Users/zalma/split/game_category.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    # Show only the first 10 rows as a quick sanity check.
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    # Any other failure in the try body is printed instead of being re-raised.
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,appid,catid
0,10,1
1,20,2
2,30,3
3,40,4
4,50,5
5,60,6
6,70,7
7,80,8
8,130,9
9,220,10


In [20]:
import pandas as pd

# Preview cell: path of the categories table to inspect.
csv_path = 'C:/Users/zalma/split/categories.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    # Show only the first 10 rows as a quick sanity check.
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    # Any other failure in the try body is printed instead of being re-raised.
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,catid,name
0,1,"Multi-player, Online Multi-Player, Local Multi..."
1,2,"Multi-player, Online Multi-Player, Local Multi..."
2,3,"Multi-player, Valve Anti-Cheat enabled, Family..."
3,4,"Multi-player, Online Multi-Player, Local Multi..."
4,5,"Single-player, Multi-player, Valve Anti-Cheat ..."
5,6,"Multi-player, Online Multi-Player, Valve Anti-..."
6,7,"Single-player, Multi-player, Online Multi-Play..."
7,8,"Single-player, Multi-player, Valve Anti-Cheat ..."
8,9,"Single-player, Remote Play Together, Family Sh..."
9,10,"Single-player, Steam Achievements, Steam Tradi..."


In [21]:
import pandas as pd

# Preview cell: path of the game_tag link table to inspect.
csv_path = 'C:/Users/zalma/split/game_tag.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    # Show only the first 10 rows as a quick sanity check.
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    # Any other failure in the try body is printed instead of being re-raised.
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,appid,tagid
0,10,1
1,10,2
2,10,3
3,10,4
4,10,5
5,10,6
6,10,7
7,10,8
8,10,9
9,10,10


In [22]:
import pandas as pd

# Preview cell: path of the tags table to inspect.
csv_path = 'C:/Users/zalma/split/tags.csv'

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    # Show only the first 10 rows as a quick sanity check.
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    # Any other failure in the try body is printed instead of being re-raised.
    print(f"An error occurred: {e}")

CSV loaded successfully!


Unnamed: 0,tagid,tag_name,weight
0,1,Action,5472
1,2,FPS,4897
2,3,Multiplayer,3444
3,4,Shooter,3394
4,5,Classic,2822
5,6,Team-Based,1896
6,7,First-Person,1736
7,8,Competitive,1631
8,9,Tactical,1370
9,10,1990's,1231


In [9]:
"""
Nem használt de fontos függvények, majd visszarakom őket ha a merge már rendben lesz teljesen.
"""
# ======== NAME MATCHING FUNCTION ========
def analyze_name_matches(df: pd.DataFrame):
    """
    Compare the game names coming from the A, B and C sources.

    For each of ``name_a``, ``name_b`` and ``name_c`` that exists in ``df`` a
    ``<column>_norm`` column is added (lower-cased, alphanumeric characters
    only). Two counts are then logged at INFO level: rows whose names are
    exactly equal in all three sources, and rows whose normalized names are
    equal in all three sources.

    Args:
        df: Frame holding the name columns. It is modified in place (the
            ``*_norm`` columns are added).

    Returns:
        The same ``df`` object that was passed in.
    """
    name_cols = ["name_a", "name_b", "name_c"]
    norm_cols = [f"{col}_norm" for col in name_cols]

    def normalize_name(name):
        if pd.isna(name):
            return ""
        # str() keeps a non-string name (e.g. a title parsed as a number) from raising.
        return "".join(c.lower() for c in str(name) if c.isalnum())

    for col in name_cols:
        if col in df.columns:
            df[f"{col}_norm"] = df[col].apply(normalize_name)

    def rows_equal(cols):
        # Row-wise "first == second == third" over the three given columns.
        first, second, third = (df[col] for col in cols)
        return (first == second) & (second == third)

    if all(col in df.columns for col in name_cols):
        # A row only counts when every source actually has a name: missing
        # names normalize to "" and would otherwise look like a formal match.
        all_present = df[name_cols].notna().all(axis=1)
        exact_count = int((rows_equal(name_cols) & all_present).sum())
        formal_count = int((rows_equal(norm_cols) & all_present).sum())
    else:
        # Without all three name columns no row can match in every source.
        # (df.get() on a missing column returns None, which previously made
        # the comparison a plain bool and crashed on .sum().)
        exact_count = 0
        formal_count = 0

    logging.info(f"Pontos névegyezések száma minden forrásban: {exact_count}")
    logging.info(
        f"Formaileg egyező nevek száma minden forrásban: {formal_count}"
    )

    return df


# ======== MULTI-SOURCE ATTRIBUTE ANALYSIS ========
def analyze_multi_source_attribute(df: pd.DataFrame, attr: str):
    """
    Log how one attribute (e.g. 'genre', 'category', 'language') is populated
    in the A, B and C sources.

    The columns ``<attr>_a``, ``<attr>_b`` and ``<attr>_c`` are inspected; any
    of them that is absent is first added to ``df`` filled with ``pd.NA``.
    Logged at INFO level: non-null count per source, number of distinct
    non-null values per column, and the number of rows where all three
    columns compare equal.

    Args:
        df: Frame to inspect; missing source columns are added to it in place.
        attr: Attribute name without the source suffix.

    Returns:
        The same ``df`` object that was passed in.
    """
    source_cols = [f"{attr}_{suffix}" for suffix in ("a", "b", "c")]

    # Make sure all three source columns exist before anything is measured.
    for name in source_cols:
        if name not in df.columns:
            df[name] = pd.NA

    valid_per_source = df[source_cols].notna().sum()
    logging.info(f"{attr} - érvényes értékek száma forrásonként:\n{valid_per_source}")

    # Collect the distinct values of every column first, then report them.
    distinct_by_col = [(name, df[name].dropna().unique()) for name in source_cols]
    for name, distinct in distinct_by_col:
        logging.info(f"{name} - egyedi értékek száma: {len(distinct)}")

    first, second, third = (df[name] for name in source_cols)
    identical_rows = (first == second) & (second == third)
    logging.info(f"{attr} - pontos egyezések minden forrásban: {identical_rows.sum()}")

    return df


def summarize_dataset(df: pd.DataFrame, name: str, block_size: int = 5):
    """
    Log a short overview of a DataFrame at INFO level.

    Reported: row count, column count, deep memory usage in MB, and every
    column with its dtype, ``block_size`` columns per log line.

    Args:
        df: Frame to describe. If ``df.empty`` is true only a notice is logged.
        name: Label used for the dataset in the log lines.
        block_size: Number of "column: dtype" entries joined into one log line.

    Returns:
        None.
    """
    if df.empty:
        logging.info(f"{name} dataset is empty!")
        return

    n_rows, n_cols = len(df), len(df.columns)
    logging.info(f"=== Summary of {name} dataset ===")
    logging.info(f"Rows: {n_rows}, Columns: {n_cols}")

    size_mb = df.memory_usage(deep=True).sum() / 1024**2
    logging.info(f"Memory usage: {size_mb:.2f} MB")

    descriptions = [f"{col}: {dtype}" for col, dtype in zip(df.columns, df.dtypes)]

    # Cut the descriptions into consecutive groups of block_size entries.
    groups = [
        descriptions[start : start + block_size]
        for start in range(0, len(descriptions), block_size)
    ]
    for group in groups:
        logging.info(" | ".join(group))

    logging.info("==============================")


def inspect_game_in_all_sources(merged_df: pd.DataFrame, appid: str):
    """
    Print and return the merged record(s) of one game.

    Args:
        merged_df: Frame with an ``appid`` column (described in the original
            docstring as the result of ``merge_sources``).
        appid: AppID to look up; compared as text, so ``10`` and ``"10"`` are
            equivalent.

    Returns:
        The matching rows as a DataFrame, or ``None`` when no row matches.
    """
    # Compare both sides as strings: an integer-typed appid column would
    # otherwise never equal str(appid) and every lookup would come back empty.
    wanted = str(appid)
    game_row = merged_df[merged_df["appid"].astype(str) == wanted]

    if game_row.empty:
        return None

    # Widen the display only for this printout instead of changing the global
    # pandas options for the rest of the session.
    with pd.option_context("display.max_columns", None, "display.width", 200):
        # Transposed so that every column of the record is shown on its own line.
        print(game_row.T)

    return game_row


# ======== VISUALIZATION FUNCTIONS ========
def plot_release_year_histograms(df_a, df_b, df_c, output_path):
    """
    Draw one bar chart per source (A, B, C) showing the number of games per
    release year and save the three stacked panels as
    ``release_year_histograms.png`` inside ``output_path``.

    Args:
        df_a, df_b, df_c: Source frames. The year is read from their
            ``release_date`` column; a frame without that column produces an
            empty panel. The frames are not modified.
        output_path: Directory the PNG is written to.

    Returns:
        None. The path of the written file is logged at INFO level.
    """

    import re
    import matplotlib.pyplot as plt
    import pandas as pd

    def extract_year(date_str):
        """Return the year found in ``date_str`` as an int, or None."""
        if not isinstance(date_str, str) or not date_str.strip():
            return None

        date_str = date_str.strip()

        # Try the known date formats first.
        for fmt in ("%b %d, %Y", "%Y-%m-%d", "%d %b %Y", "%Y"):
            try:
                return pd.to_datetime(date_str, format=fmt, errors="raise").year
            except Exception:
                continue

        # Fallback: any four-digit number starting with 19 or 20.
        match = re.search(r"(19|20)\d{2}", date_str)
        if match:
            return int(match.group(0))

        return None

    def games_per_year(df):
        """Count rows per release year, index sorted ascending."""
        if "release_date" not in df.columns:
            return pd.Series(dtype="int64")
        # Work on a derived Series so the caller's frame gets no 'year' column;
        # casting to int keeps tick labels as '2004' rather than '2004.0'.
        years = df["release_date"].apply(extract_year).dropna().astype(int)
        return years.value_counts().sort_index()

    panels = [
        (df_a, "A forrás – játékok száma évenként", "skyblue"),
        (df_b, "B forrás – játékok száma évenként", "lightgreen"),
        (df_c, "C forrás – játékok száma évenként", "salmon"),
    ]

    fig, axes = plt.subplots(3, 1, figsize=(15, 10))
    for ax, (source_df, title, color) in zip(axes, panels):
        counts = games_per_year(source_df)
        if counts.empty:
            # A bar plot of an empty Series raises; show a placeholder instead.
            ax.text(
                0.5,
                0.5,
                "Nincs érvényes megjelenési év",
                ha="center",
                va="center",
                transform=ax.transAxes,
            )
        else:
            counts.plot(kind="bar", color=color, ax=ax)
        ax.set_title(title)
        ax.set_xlabel("Év")
        ax.set_ylabel("Darabszám")

    fig.tight_layout()

    # Save the figure and log where it went.
    output_file = os.path.join(output_path, "release_year_histograms.png")
    fig.savefig(output_file, dpi=300)
    plt.close(fig)
    logging.info(f"Hisztogram-összesítés mentve: {output_file}")


# ======== HELPER FUNCTIONS ========


def venn_table(a: pd.DataFrame, b: pd.DataFrame, c: pd.DataFrame, columns: list):
    """
    Build a table of Venn-region sizes for the A/B/C sources.

    For every requested column the distinct values of each source are
    collected, then the seven regions (A only, B only, C only, A&B, A&C, B&C,
    A&B&C) are counted. Multi-valued cells are expanded: a list-like string
    such as "['Action', 'Free to Play']" contributes each element, and any
    other string is split on ';' and ','.

    Args:
        a, b, c: Source frames. A frame without the column contributes an
            empty set.
        columns: Column names to compare.

    Returns:
        DataFrame indexed by column name with one column per Venn region.
    """
    import ast

    def value_set(df, col):
        if col not in df.columns:
            return set()
        values = set()
        for val in df[col].dropna():
            if not isinstance(val, str):
                values.add(str(val))
                continue

            text = val.strip()
            if text.startswith("[") and text.endswith("]"):
                # literal_eval only parses Python literals, so a cell value can
                # never execute code (the previous eval() could).
                try:
                    parsed = ast.literal_eval(text)
                except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                    # Not a valid literal: keep the raw cell as a single value.
                    values.add(val)
                else:
                    values.update(str(item) for item in parsed)
                continue

            # ';' or ',' separated values; drop the empty tokens produced by
            # trailing or doubled delimiters so "" is not counted as a value.
            tokens = (token.strip() for token in re.split(r"[;,]", val))
            values.update(token for token in tokens if token)
        return values

    venn_data = {}
    for col in columns:
        set_a = value_set(a, col)
        set_b = value_set(b, col)
        set_c = value_set(c, col)

        venn_data[col] = {
            "A_only": len(set_a - set_b - set_c),
            "B_only": len(set_b - set_a - set_c),
            "C_only": len(set_c - set_a - set_b),
            "A&B": len((set_a & set_b) - set_c),
            "A&C": len((set_a & set_c) - set_b),
            "B&C": len((set_b & set_c) - set_a),
            "A&B&C": len(set_a & set_b & set_c),
        }

    return pd.DataFrame.from_dict(venn_data, orient="index")


def plot_venn_table(
    venn_df: pd.DataFrame, output_path: str, filename: str = "venn_table.png"
):
    """
    Render the Venn table as a colored heatmap and save it as an image.

    Args:
        venn_df: Table to draw; its columns label the x axis and its index
            labels the y axis.
        output_path: Directory the image is written to.
        filename: Name of the image file inside ``output_path``.

    Returns:
        None.
    """
    n_rows = len(venn_df)
    n_cols = len(venn_df.columns)

    # Figure height grows with the number of rows, but never below 4 inches.
    fig, ax = plt.subplots(figsize=(12, max(4, n_rows * 0.5)))
    heatmap = ax.imshow(venn_df, cmap="YlGnBu", aspect="auto")
    fig.colorbar(heatmap, ax=ax, label="Darabszám")

    ax.set_xticks(range(n_cols))
    ax.set_xticklabels(venn_df.columns, rotation=45)
    ax.set_yticks(range(n_rows))
    ax.set_yticklabels(venn_df.index)
    ax.set_title("Elemszámos Venn diagram táblázatként")
    fig.tight_layout()

    target_file = os.path.join(output_path, filename)
    fig.savefig(target_file, dpi=300)
    plt.close(fig)

        '''
    # ======== RAW SOURCES EXPORT ========
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    a.to_csv(os.path.join(OUTPUT_PATH, "source_A_raw.csv"), index=False, encoding="utf-8")
    b.to_csv(os.path.join(OUTPUT_PATH, "source_B_raw.csv"), index=False, encoding="utf-8")
    c.to_csv(os.path.join(OUTPUT_PATH, "source_C_raw.csv"), index=False, encoding="utf-8")
    logging.info("Raw source CSVs saved: source_A_raw.csv, source_B_raw.csv, source_C_raw.csv")
    '''
    
    

    
    '''
    summarize_dataset(a, "A")
    summarize_dataset(b, "B")
    summarize_dataset(c, "C")
    '''

    '''
    logging.info(f"Rows per source before merge: A={len(a)}, B={len(b)}, C={len(c)}")
    '''

    
    '''
    set_a = set(a["appid"])
    set_b = set(b["appid"])
    set_c = set(c["appid"])

    only_a = len(set_a - set_b - set_c)
    only_b = len(set_b - set_a - set_c)
    only_c = len(set_c - set_a - set_b)
    a_b = len(set_a & set_b - set_c)
    a_c = len(set_a & set_c - set_b)
    b_c = len(set_b & set_c - set_a)
    a_b_c = len(set_a & set_b & set_c)

    logging.info(f"Unique by source: A={only_a}, B={only_b}, C={only_c}")
    logging.info(f"Overlaps: A&B={a_b}, A&C={a_c}, B&C={b_c}, A&B&C={a_b_c}")
    '''

    

    '''
    # ======== VENN-DIAGRAM ========
    set_a = set(a["appid"].astype(str))
    set_b = set(b["appid"].astype(str))
    set_c = set(c["appid"].astype(str))

    plt.figure(figsize=(8, 6))
    venn3([set_a, set_b, set_c], set_labels=("A", "B", "C"))
    plt.title("AppID átfedések három forrás között")
    plt.savefig(os.path.join(OUTPUT_PATH, "venn_appid.png"), dpi=300)
    plt.close()

    # ======== OVERLAPS BY YEAR (GANTT-LIKE) ========
    plt.figure(figsize=(10, 4))

    datasets = [
        (a, "A", "cornflowerblue"),
        (b, "B", "mediumseagreen"),
        (c, "C", "orchid"),
    ]

    for i, (df, label, color) in enumerate(datasets):
        release_col = next(
            (col for col in df.columns if "release" in col and "date" in col), None
        )

        if release_col:
            if label == "B":
                df["release_year"] = pd.to_datetime(df[release_col], errors="coerce")
                if df["release_year"].isna().mean() > 0.5:
                    df["release_year"] = pd.to_datetime(
                        df[release_col].str.strip(), format="%b %d, %Y", errors="coerce"
                    )
            else:
                df["release_year"] = pd.to_datetime(df[release_col], errors="coerce")

            df["release_year"] = df["release_year"].dt.year
            df = df.dropna(subset=["release_year"])

            if not df.empty:
                min_year = int(df["release_year"].min())
                max_year = int(df["release_year"].max())
                plt.barh(
                    y=i,
                    width=max_year - min_year,
                    left=min_year,
                    height=0.4,
                    color=color,
                    alpha=0.7,
                    label=label,
                )
            else:
                print(f" {label} forrásban nincs érvényes dátum!")
        else:
            print(f" {label} forrásban nem található dátummező!")

    plt.yticks(range(len(datasets)), [label for _, label, _ in datasets])
    plt.xlabel("Kiadási év")
    plt.title("Játékok időbeli lefedettsége az A, B, C forrásokban (Gantt-szerű ábra)")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_PATH, "release_years_gantt.png"), dpi=300)
    plt.close()
    '''

    '''
    # ======== CHECKING FOR DUPLICATE APPIDS ========
    dupes = merged[merged.duplicated("appid", keep=False)].sort_values("appid")
    if not dupes.empty:
        dupes_file = os.path.join(OUTPUT_PATH, "duplicate_appid_records.csv")
        dupes.to_csv(dupes_file, index=False, encoding="utf-8")
        logging.info(
            f"{len(dupes)} duplikált AppID rekord található, mentve: {dupes_file}"
        )
    else:
        logging.info("Nincs duplikált AppID a merge után.")
    '''
    
    

IndentationError: unexpected indent (3013528029.py, line 250)