In [None]:
import os
import csv
import pandas as pd
import sys
from datetime import datetime

# ======== ÚTVONALAK ========
A_PATH = r"C:\Users\zalma\A"
B_PATH = r"C:\Users\zalma\B"
C_PATH = r"C:\Users\zalma\C"
OUTPUT_PATH = r"C:\Users\zalma\merge"
LOG_FILE = os.path.join(OUTPUT_PATH, "merge_log.txt")

# ======== SEGÉDFÜGGVÉNYEK ========
os.makedirs(OUTPUT_PATH, exist_ok=True)
csv.field_size_limit(2**30)

def log(msg):
    """Egyszerű naplózás időbélyeggel."""
    timestamp = datetime.now().strftime("[%Y-%m-%d %H:%M:%S]")
    print(f"{timestamp} {msg}")
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(f"{timestamp} {msg}\n")

def clean_columns(df):
    """Egységes oszlopnév formázás."""
    df.columns = (
        df.columns.str.strip()
        .str.lower()
        .str.replace(" ", "_")
        .str.replace("-", "_")
    )
    return df

def load_csv_safely(path, **kwargs):
    """CSV beolvasása biztonságosan."""
    try:
        df = pd.read_csv(path, **kwargs)
        log(f"Beolvasva: {os.path.basename(path)} ({len(df)} sor)")
        return df
    except Exception as e:
        log(f"Hiba beolvasás közben: {path} → {e}")
        return pd.DataFrame()

def combine_columns(df, col):
    """Elsőbbségi oszlopok kombinálása (C > B > A)."""
    cols = [f"{col}_a", f"{col}_b", f"{col}_c"]
    combined = pd.Series([pd.NA] * len(df))
    for c in cols:
        if c in df.columns:
            combined = combined.combine_first(df[c])
    df[col] = combined
    return df


# ======== ADATBEOLVASÁS ========
log("=== Adatok beolvasása kezdődik ===")

# --- A adatforrás ---
try:
    steam = load_csv_safely(os.path.join(A_PATH, "steam.csv"))
    description = load_csv_safely(os.path.join(A_PATH, "steam_description_data.csv"))
    media = load_csv_safely(os.path.join(A_PATH, "steam_media_data.csv"))
    support = load_csv_safely(os.path.join(A_PATH, "steam_support_info.csv"))
    tags = load_csv_safely(os.path.join(A_PATH, "steamspy_tag_data.csv"))
    reqs = load_csv_safely(os.path.join(A_PATH, "steam_requirements_data.csv"))

    for df in [steam, description, media, support, tags, reqs]:
        if not df.empty:
            df = clean_columns(df)
            possible_ids = [c for c in df.columns if "appid" in c.lower()]
            if possible_ids:
                df.rename(columns={possible_ids[0]: "appid"}, inplace=True)

    a_merged = (
        steam.merge(description, on="appid", how="left")
             .merge(media, on="appid", how="left")
             .merge(support, on="appid", how="left")
             .merge(tags, on="appid", how="left")
             .merge(reqs, on="appid", how="left")
    )
    log(f"A forrás összevonva: {len(a_merged)} sor")
except Exception as e:
    log(f"Hiba az A adathalmaz betöltésekor: {e}")
    a_merged = pd.DataFrame()

# --- B adatforrás ---
try:
    file_path = os.path.join(B_PATH, "games.csv")

    try:
        df_b = pd.read_csv(file_path, encoding='utf-8', on_bad_lines='skip', low_memory=False)
    except Exception:
        data = []
        with open(file_path, "r", encoding="utf-8") as f:
            reader = csv.reader(f)
            for row in reader:
                data.append(row)
        header = data[0]
        body = data[1:]
        df_b = pd.DataFrame(body, columns=header)

    df_b = clean_columns(df_b)

    if "appid" not in df_b.columns:
        df_b.reset_index(inplace=True)
        df_b.rename(columns={"index": "appid"}, inplace=True)

    df_b = df_b.loc[:, ~df_b.columns.duplicated()]

    if "discountdlc_count" in df_b.columns:
        log("'discountdlc_count' mező szétszedése 'discount' és 'dlc_count'-ra...")
        new_discount = []
        new_dlc_count = []
        for val in df_b["discountdlc_count"]:
            if pd.isna(val):
                new_discount.append(None)
                new_dlc_count.append(None)
            else:
                val = str(val)
                parts = val.replace("%", "% ").split()
                if len(parts) == 2:
                    new_discount.append(parts[0])
                    new_dlc_count.append(parts[1])
                elif "%" in val:
                    new_discount.append(val.split("%")[0] + "%")
                    new_dlc_count.append(None)
                else:
                    new_discount.append(None)
                    new_dlc_count.append(val)
        df_b["discount"] = new_discount
        df_b["dlc_count"] = new_dlc_count
        df_b.drop(columns=["discountdlc_count"], inplace=True)

    df_b["appid"] = df_b["appid"].astype(str)

    log(f"B forrás betöltve: {len(df_b)} sor (games.csv)")

except Exception as e:
    log(f"Hiba a B adathalmaznál: {e}")
    df_b = pd.DataFrame()


# --- C adatforrás ---
try:
    c_files = [
        "games_march2025_cleaned.csv",
        "games_march2025_full.csv",
        "games_may2024_cleaned.csv",
        "games_may2024_full.csv",
    ]
    c_dfs = [load_csv_safely(os.path.join(C_PATH, f)) for f in c_files]
    c_dfs = [clean_columns(df) for df in c_dfs if not df.empty]
    df_c = pd.concat(c_dfs, ignore_index=True)
    df_c["appid"] = df_c["appid"].astype(str)
    log(f"C forrás betöltve, összesítve: {len(df_c)} sor")
except Exception as e:
    log(f"Hiba a C adathalmaznál: {e}")
    df_c = pd.DataFrame()

for df_name, df in [("A", a_merged), ("B", df_b), ("C", df_c)]:
    if not df.empty:
        df["appid"] = df["appid"].astype(str).str.strip()
        df.drop_duplicates(subset="appid", inplace=True)
        log(f"{df_name} forrás appid-ek normalizálva és duplikátumok törölve")


# ======== APPID EGYESÍTÉS ========
try:
    all_appids = pd.concat([
        a_merged[["appid"]],
        df_b[["appid"]],
        df_c[["appid"]],
    ], ignore_index=True).drop_duplicates()

    log(f"Összes egyedi appid: {len(all_appids)}")
except Exception as e:
    log(f"Hiba az APPID egyesítésnél: {e}")
    all_appids = pd.DataFrame(columns=["appid"])

# ======== MERGE (A + B + C) ========
d = all_appids \
    .merge(a_merged.add_suffix("_a"), left_on="appid", right_on="appid_a", how="left") \
    .merge(df_b.add_suffix("_b"), left_on="appid", right_on="appid_b", how="left") \
    .merge(df_c.add_suffix("_c"), left_on="appid", right_on="appid_c", how="left")

log("Merge megtörtént (A + B + C)")

# ======== DIAGNOSZTIKA A MERGE UTÁN ========
dupes = d["appid"].duplicated().sum()
if dupes > 0:
    log(f"Duplikált appid-k száma a merge után: {dupes}")
    d.drop_duplicates(subset="appid", keep="first", inplace=True)
    log("Duplikált appid-k eltávolítva a merge után")

for field in ["appid", "name", "release_date", "price", "estimated_owners"]:
    if field in d.columns:
        missing = d[field].isna().sum()
        log(f"Hiányzó értékek [{field}]: {missing}")


# ======== FORRÁS MEZŐ HOZZÁADÁSA ========
def detect_source(row):
    if pd.notna(row.get("appid_c")): return "C"
    if pd.notna(row.get("appid_b")): return "B"
    if pd.notna(row.get("appid_a")): return "A"
    return "unknown"

d["source"] = d.apply(detect_source, axis=1)
log("Forrásoszlop ('source') hozzáadva")

# ======== AUTOMATIKUS OSZLOP-ÖSSZEVONÁS (C > B > A) ========
log("Oszlopok automatikus egyesítése (C > B > A) kezdődik...")

base_names = set()
for col in d.columns:
    if col.endswith(("_a", "_b", "_c")):
        base_names.add(col[:-2]) 

for base in sorted(base_names):
    d = combine_columns(d, base)

log(f"Automatikusan összevont oszlopok száma: {len(base_names)}")

cols_to_drop = [c for c in d.columns if c.endswith(("_a", "_b", "_c"))]
d.drop(columns=cols_to_drop, inplace=True, errors="ignore")

log(f"Régi forrásoszlopok eltávolítva ({len(cols_to_drop)} db)")

# ======== SÉMA SZERINTI OSZLOPOK HOZZÁADÁSA ÉS RENDEZÉSE ========
log("Séma szerinti oszlopok rendezése és hiányzók pótlása...")

schema_columns = [
    "appid", "name", "release_date", "estimated_owners", "price", "required_age",
    "dlc_count", "recommendations", "notes", "website", "metacritic_score",
    "metacritic_url", "achievements", "user_score", "score_rank", "positive",
    "negative", "average_playtime_forever", "average_playtime_two_weeks",
    "median_playtime_forever", "median_playtime_two_weeks", "peak_ccu",
    "discount", "reviews",
    "langid", "lang_name", "audio",
    "tagid", "tag_name", "weight",
    "genreid", "genre_name",
    "catid", "category_name",
    "devid", "developer_name",
    "pubid", "publisher_name",
    "platid", "platform_name",
    "packid", "package_name"
]

log("Séma szerinti oszlopok rendezése és hiányzók pótlása...")

missing_cols = [col for col in schema_columns if col not in d.columns]
if missing_cols:
    d = pd.concat(
        [d, pd.DataFrame({col: pd.NA for col in missing_cols}, index=d.index)],
        axis=1
    )


ordered = [col for col in schema_columns if col in d.columns]
others = [col for col in d.columns if col not in ordered]
d = d[ordered + others]


log(f"Séma szerinti oszlopok száma: {len(d.columns)}")


# ======== OUTPUT MENTÉSE ========
output_file = os.path.join(OUTPUT_PATH, "merged_dataset_D.csv")
d.to_csv(output_file, index=False, encoding="utf-8")

log(f"Merged és tisztított adathalmaz mentve: {output_file}")
log("=== Merge folyamat sikeresen befejeződött ===")

[2025-10-09 17:01:47] === Adatok beolvasása kezdődik ===
[2025-10-09 17:01:47] Beolvasva: steam.csv (27075 sor)
[2025-10-09 17:01:48] Beolvasva: steam_description_data.csv (27334 sor)
[2025-10-09 17:01:49] Beolvasva: steam_media_data.csv (27332 sor)
[2025-10-09 17:01:49] Beolvasva: steam_support_info.csv (27136 sor)
[2025-10-09 17:01:49] Beolvasva: steamspy_tag_data.csv (29022 sor)
[2025-10-09 17:01:50] Beolvasva: steam_requirements_data.csv (27319 sor)
[2025-10-09 17:01:50] A forrás összevonva: 27075 sor
[2025-10-09 17:01:54] 'discountdlc_count' mező szétszedése 'discount' és 'dlc_count'-ra...
[2025-10-09 17:01:54] B forrás betöltve: 111452 sor (games.csv)
[2025-10-09 17:01:59] Beolvasva: games_march2025_cleaned.csv (89618 sor)
[2025-10-09 17:02:05] Beolvasva: games_march2025_full.csv (94948 sor)
[2025-10-09 17:02:10] Beolvasva: games_may2024_cleaned.csv (83646 sor)
[2025-10-09 17:02:15] Beolvasva: games_may2024_full.csv (87806 sor)
[2025-10-09 17:02:15] C forrás betöltve, összesítve:

  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined = combined.combine_first(df[c])
  combined 

[2025-10-09 17:02:53] Automatikusan összevont oszlopok száma: 436
[2025-10-09 17:02:54] Régi forrásoszlopok eltávolítva (491 db)
[2025-10-09 17:02:54] Séma szerinti oszlopok rendezése és hiányzók pótlása...
[2025-10-09 17:02:54] Séma szerinti oszlopok rendezése és hiányzók pótlása...
[2025-10-09 17:02:55] Séma szerinti oszlopok száma: 455
[2025-10-09 17:03:47] Merged és tisztított adathalmaz mentve: C:\Users\zalma\merge\merged_dataset_D.csv
[2025-10-09 17:03:47] === Merge folyamat sikeresen befejeződött ===


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns

csv_path = r"C:\Users\zalma\merge\merged_dataset_D.csv"

try:
    df = pd.read_csv(csv_path)
    print("CSV loaded successfully!")
    display(df.head(10))
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
except Exception as e:
    print(f"An error occurred: {e}")

  df = pd.read_csv(csv_path)


CSV loaded successfully!


Unnamed: 0,appid,name,release_date,estimated_owners,price,required_age,dlc_count,recommendations,notes,website,...,warhammer_40k,web_publishing,werewolves,western,windows,word_game,world_war_i,world_war_ii,wrestling,zombies
0,10,Counter-Strike,2000-11-01,10000000 - 20000000,7.19,0.0,0.0,149445.0,Includes intense violence and blood.,,...,0.0,0.0,0.0,0.0,True,0.0,0.0,0.0,0.0,0.0
1,20,Team Fortress Classic,1999-04-01,5000000 - 10000000,3.99,0.0,0.0,6454.0,Includes intense violence and blood.,,...,0.0,0.0,0.0,0.0,True,0.0,0.0,0.0,0.0,0.0
2,30,Day of Defeat,2003-05-01,5000000 - 10000000,3.99,0.0,0.0,4032.0,,http://www.dayofdefeat.com/,...,0.0,0.0,0.0,0.0,True,0.0,5.0,122.0,0.0,0.0
3,40,Deathmatch Classic,2001-06-01,5000000 - 10000000,3.99,0.0,0.0,2113.0,,,...,0.0,0.0,0.0,0.0,True,0.0,0.0,0.0,0.0,0.0
4,50,Half-Life: Opposing Force,1999-11-01,0.0,3.99,0.0,0.0,0.0,,,...,0.0,0.0,0.0,0.0,True,0.0,0.0,0.0,0.0,0.0
5,60,Ricochet,2000-11-01,5000000 - 10000000,3.99,0.0,0.0,3997.0,,,...,0.0,0.0,0.0,0.0,True,0.0,0.0,0.0,0.0,0.0
6,70,Half-Life,1998-11-08,2000000 - 5000000,7.19,0.0,1.0,91746.0,,http://www.half-life.com/,...,0.0,0.0,0.0,0.0,True,0.0,0.0,0.0,0.0,0.0
7,80,Counter-Strike: Condition Zero,2004-03-01,5000000 - 10000000,7.19,0.0,0.0,18327.0,,,...,0.0,0.0,0.0,0.0,True,0.0,0.0,0.0,0.0,0.0
8,130,Half-Life: Blue Shift,2001-06-01,2000000 - 5000000,3.99,0.0,0.0,13949.0,,,...,0.0,0.0,0.0,0.0,True,0.0,0.0,0.0,0.0,63.0
9,220,Half-Life 2,2004-11-16,5000000 - 10000000,7.19,0.0,1.0,151770.0,,http://www.half-life2.com,...,0.0,0.0,0.0,0.0,True,0.0,0.0,0.0,0.0,607.0
