In [7]:
import pandas as pd
import numpy as np
import os
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from fuzzywuzzy import fuzz

# ======== SEGÉDFÜGGVÉNYEK ========
def log(msg):
    print(msg)

def load_csv_safely(path):
    try:
        return pd.read_csv(path, encoding='utf-8', on_bad_lines='skip', low_memory=False)
    except Exception:
        return pd.DataFrame()

def clean_columns(df):
    df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]
    return df

def fuzzy_match(name, choices, threshold=90):
    for choice in choices:
        if fuzz.ratio(str(name), str(choice)) >= threshold:
            return choice
    return None


In [8]:
# ======== ÚTVONALAK ========
A_PATH = r"C:\Users\zalma\A"
B_PATH = r"C:\Users\zalma\B"
C_PATH = r"C:\Users\zalma\C"

log("=== Adatok beolvasása kezdődik ===")


=== Adatok beolvasása kezdődik ===


In [9]:
try:
    steam = load_csv_safely(os.path.join(A_PATH, "steam.csv"))
    description = load_csv_safely(os.path.join(A_PATH, "steam_description_data.csv"))
    media = load_csv_safely(os.path.join(A_PATH, "steam_media_data.csv"))
    support = load_csv_safely(os.path.join(A_PATH, "steam_support_info.csv"))
    tags = load_csv_safely(os.path.join(A_PATH, "steamspy_tag_data.csv"))
    reqs = load_csv_safely(os.path.join(A_PATH, "steam_requirements_data.csv"))

    for df in [steam, description, media, support, tags, reqs]:
        if not df.empty:
            df = clean_columns(df)
            possible_ids = [c for c in df.columns if "appid" in c.lower()]
            if possible_ids:
                df.rename(columns={possible_ids[0]: "appid"}, inplace=True)

    a_merged = (
        steam.merge(description, on="appid", how="left")
             .merge(media, on="appid", how="left")
             .merge(support, on="appid", how="left")
             .merge(tags, on="appid", how="left")
             .merge(reqs, on="appid", how="left")
    )
    log(f"A forrás összevonva: {len(a_merged)} sor")
except Exception as e:
    log(f"Hiba az A adathalmaz betöltésekor: {e}")
    a_merged = pd.DataFrame()


A forrás összevonva: 27075 sor


In [10]:
try:
    file_path = os.path.join(B_PATH, "games.csv")
    try:
        df_b = pd.read_csv(file_path, encoding='utf-8', on_bad_lines='skip', low_memory=False)
    except Exception:
        data = []
        with open(file_path, "r", encoding="utf-8") as f:
            reader = csv.reader(f)
            for row in reader:
                data.append(row)
        header = data[0]
        body = data[1:]
        df_b = pd.DataFrame(body, columns=header)

    df_b = clean_columns(df_b)
    if "appid" not in df_b.columns:
        df_b.reset_index(inplace=True)
        df_b.rename(columns={"index": "appid"}, inplace=True)
    df_b = df_b.loc[:, ~df_b.columns.duplicated()]

    if "discountdlc_count" in df_b.columns:
        new_discount = []
        new_dlc_count = []
        for val in df_b["discountdlc_count"]:
            if pd.isna(val):
                new_discount.append(None)
                new_dlc_count.append(None)
            else:
                val = str(val)
                parts = val.replace("%", "% ").split()
                if len(parts) == 2:
                    new_discount.append(parts[0])
                    new_dlc_count.append(parts[1])
                elif "%" in val:
                    new_discount.append(val.split("%")[0] + "%")
                    new_dlc_count.append(None)
                else:
                    new_discount.append(None)
                    new_dlc_count.append(val)
        df_b["discount"] = new_discount
        df_b["dlc_count"] = new_dlc_count
        df_b.drop(columns=["discountdlc_count"], inplace=True)

    df_b["appid"] = df_b["appid"].astype(str)
    log(f"B forrás betöltve: {len(df_b)} sor (games.csv)")
except Exception as e:
    log(f"Hiba a B adathalmaznál: {e}")
    df_b = pd.DataFrame()


B forrás betöltve: 111452 sor (games.csv)


In [11]:
try:
    c_files = [
        "games_march2025_cleaned.csv",
        "games_march2025_full.csv",
        "games_may2024_cleaned.csv",
        "games_may2024_full.csv",
    ]
    c_dfs = [load_csv_safely(os.path.join(C_PATH, f)) for f in c_files]
    c_dfs = [clean_columns(df) for df in c_dfs if not df.empty]
    df_c = pd.concat(c_dfs, ignore_index=True)
    df_c["appid"] = df_c["appid"].astype(str)
    log(f"C forrás betöltve, összesítve: {len(df_c)} sor")
except Exception as e:
    log(f"Hiba a C adathalmaznál: {e}")
    df_c = pd.DataFrame()


C forrás betöltve, összesítve: 356018 sor


In [12]:
for df_name, df in [("A", a_merged), ("B", df_b), ("C", df_c)]:
    if not df.empty:
        df["appid"] = df["appid"].astype(str).str.strip()
        df.drop_duplicates(subset="appid", inplace=True)
        log(f"{df_name} forrás appid-ek normalizálva és duplikátumok törölve")

datasets = {"A": a_merged, "B": df_b, "C": df_c}


A forrás appid-ek normalizálva és duplikátumok törölve
B forrás appid-ek normalizálva és duplikátumok törölve
C forrás appid-ek normalizálva és duplikátumok törölve


In [13]:
for name, df in datasets.items():
    print(f"\n--- {name} ---")
    print(f"Sorok: {len(df)}, Oszlopok: {df.shape[1]}, Memória: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    print(f"Hiányzó értékek:\n{df.isna().sum()}\n")


--- A ---
Sorok: 27075, Oszlopok: 404, Memória: 370.42 MB
Hiányzó értékek:
appid                     0
name                      0
release_date              0
english                   0
developer                 1
                      ...  
pc_requirements          13
mac_requirements         13
linux_requirements       13
minimum                  18
recommended           13057
Length: 404, dtype: int64


--- B ---
Sorok: 110326, Oszlopok: 40, Memória: 484.80 MB
Hiányzó értékek:
appid                              0
name                               0
release_date                       0
estimated_owners                   0
peak_ccu                           0
required_age                       0
price                              0
about_the_game                  6460
supported_languages                0
full_audio_languages               0
reviews                        99780
header_image                       0
website                        64264
support_url                    6

In [14]:
app_sets = {name: set(df['appid']) for name, df in datasets.items() if not df.empty}
print("AppID átfedések:")
for (name1, set1), (name2, set2) in combinations(app_sets.items(), 2):
    overlap = set1 & set2
    print(f"{name1} ∩ {name2}: {len(overlap)}")

print("\nAppID átfedések százalékban:")
for (name1, set1), (name2, set2) in combinations(app_sets.items(), 2):
    overlap = set1 & set2
    pct1 = len(overlap) / len(set1) * 100 if len(set1) > 0 else 0
    pct2 = len(overlap) / len(set2) * 100 if len(set2) > 0 else 0
    print(f"{name1} ∩ {name2}: {len(overlap)} | {pct1:.1f}% of {name1}, {pct2:.1f}% of {name2}")

if len(app_sets) == 3:
    all_overlap = set.intersection(*app_sets.values())
    pct_a = len(all_overlap) / len(app_sets['A']) * 100
    pct_b = len(all_overlap) / len(app_sets['B']) * 100
    pct_c = len(all_overlap) / len(app_sets['C']) * 100
    print(f"A ∩ B ∩ C: {len(all_overlap)} | {pct_a:.1f}% of A, {pct_b:.1f}% of B, {pct_c:.1f}% of C")


AppID átfedések:
A ∩ B: 3
A ∩ C: 24441
B ∩ C: 3

AppID átfedések százalékban:
A ∩ B: 3 | 0.0% of A, 0.0% of B
A ∩ C: 24441 | 90.3% of A, 23.4% of C
B ∩ C: 3 | 0.0% of B, 0.0% of C
A ∩ B ∩ C: 3 | 0.0% of A, 0.0% of B, 0.0% of C


In [15]:
for df in datasets.values():
    if 'genres' in df.columns:
        df['num_genres'] = df['genres'].apply(lambda x: len(str(x).split(',')) if pd.notna(x) else 0)

for df in datasets.values():
    if 'owners' in df.columns and 'estimated_owners' in df.columns:
        df['total_owners'] = df[['owners','estimated_owners']].sum(axis=1, skipna=True)


In [16]:
for name, df in datasets.items():
    df['source'] = name
merged = pd.concat(datasets.values(), ignore_index=True)
source_counts = merged['source'].value_counts()
print("\nForrás szerinti rekordszám:\n", source_counts)


  merged = pd.concat(datasets.values(), ignore_index=True)



Forrás szerinti rekordszám:
 source
B    110326
C    104490
A     27075
Name: count, dtype: int64


In [19]:
for col in ['genre', 'category', 'language', 'developer', 'publisher']:
    if col in merged.columns:
        print(f"\n--- {col} összesítés ---")
        print(merged[col].value_counts(dropna=False).head(10))



--- developer összesítés ---
developer
NaN                           214817
Choice of Games                   94
KOEI TECMO GAMES CO., LTD.        72
Ripknot Systems                   62
Laush Dmitriy Sergeevich          51
Nikita "Ghost_RUS"                50
Dexion Games                      45
RewindApp                         43
Hosted Games                      42
Blender Games                     40
Name: count, dtype: int64

--- publisher összesítés ---
publisher
NaN                    214830
Big Fish Games            212
Strategy First            136
Ubisoft                   111
THQ Nordic                 98
Square Enix                97
Sekai Project              96
Choice of Games            94
1C Entertainment           88
Dagestan Technology        88
Name: count, dtype: int64


In [20]:
nan_summary = merged.isna().sum()
print("\nHiányzó értékek összesítve:\n", nan_summary)



Hiányzó értékek összesítve:
 appid                          0
name                           3
release_date                   0
english                   214816
developer                 214817
                           ...  
median_playtime_2weeks    137401
pct_pos_total             137401
num_reviews_total         137401
pct_pos_recent            137401
num_reviews_recent        137401
Length: 438, dtype: int64


In [21]:
for col in ['genre', 'category', 'language', 'developer', 'publisher']:
    if col in merged.columns:
        print(f"\n--- {col} összesítés ---")
        print(merged[col].value_counts(dropna=False).head(10))



--- developer összesítés ---
developer
NaN                           214817
Choice of Games                   94
KOEI TECMO GAMES CO., LTD.        72
Ripknot Systems                   62
Laush Dmitriy Sergeevich          51
Nikita "Ghost_RUS"                50
Dexion Games                      45
RewindApp                         43
Hosted Games                      42
Blender Games                     40
Name: count, dtype: int64

--- publisher összesítés ---
publisher
NaN                    214830
Big Fish Games            212
Strategy First            136
Ubisoft                   111
THQ Nordic                 98
Square Enix                97
Sekai Project              96
Choice of Games            94
1C Entertainment           88
Dagestan Technology        88
Name: count, dtype: int64
