In [59]:
import pandas as pd
import ast
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(sparse_output=True)
pd.options.display.max_rows = 4000

df = pd.read_csv("../data/game_data.csv")
df = df.drop(
    columns=[
        "scrape_id",
        "name",
        "aliases",
        "alias",
        "beat_count",
        "developers",
        "publishers",
        #
        "main_story_polled",
        "main_+_extras_polled",
        "completionist_polled",
        "all_playstyles_polled",
        #
        "main_story_average",
        "main_+_extras_average",
        "completionist_average",
        #
        "main_story_median",
        "main_+_extras_median",
        "completionist_median",
        #
        "main_story_rushed",
        "main_+_extras_rushed",
        "completionist_rushed",
        #
        "main_story_leisure",
        "main_+_extras_leisure",
        "completionist_leisure",
    ]
)

# Rating
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")

# Datum vydání
df = df.dropna(subset=["na", "eu", "jp"], how="all")
df["na"] = pd.to_datetime(df["na"], errors="coerce")
df["eu"] = pd.to_datetime(df["eu"], errors="coerce")
df["jp"] = pd.to_datetime(df["jp"], errors="coerce")
df["release_date"] = df[["na", "eu", "jp"]].min(axis=1)
df = df.drop(columns=["na", "eu", "jp"])


# Trvání hry
for col in df.columns:
    if "all_playstyles" in col:
        df[col] = pd.to_timedelta(df[col], errors="coerce")
        df[col] = round(df[col].dt.total_seconds() / 3600, 2)

# Vývojáři a vydavatelé
df = df.dropna(subset=["developer", "publisher"], how="all")
df["developer"].fillna(df["publisher"], inplace=True)
df["publisher"].fillna(df["developer"], inplace=True)

game_counts_developer = df["developer"].value_counts()
game_counts_publisher = df["publisher"].value_counts()

thresholds = {"small": 5, "medium": 15}


def categorize_company(size):
    if size < thresholds["small"]:
        return 1
    elif size <= thresholds["medium"]:
        return 2
    else:
        return 3


# Vytvoření nového sloupce 'developer_size' s kategoriemi
df["developer_size"] = (
    df["developer"].map(game_counts_developer).map(categorize_company)
)
df["publisher_size"] = (
    df["publisher"].map(game_counts_developer).map(categorize_company)
)

df = df.drop(columns=["developer", "publisher"])

# Platformy
df = df.dropna(subset=["platforms"])
df["platforms"] = df["platforms"].apply(ast.literal_eval)

## Tohle je tady protože existuje i žánr "Arcade"
df["platforms"] = df["platforms"].apply(
    lambda x: ["Arcade machine" if platform == "Arcade" else platform for platform in x]
)

## Nahrazení méně častých platforem za "Other"
exploded_platforms = df["platforms"].explode()
platform_counts = exploded_platforms.value_counts()
rare_platforms = platform_counts[platform_counts < 50].index
df["platforms"] = df["platforms"].apply(
    lambda x: ["Other_platform"]
    if any(platform in rare_platforms for platform in x)
    else x
)

## One hot encoding
df = df.join(
    pd.DataFrame.sparse.from_spmatrix(
        mlb.fit_transform(df.pop("platforms")), index=df.index, columns=mlb.classes_
    )
)


# Žánry
df = df.dropna(subset=["genres"])
df["genres"] = df["genres"].apply(ast.literal_eval)

## Nahrazení méně častých žánrů za "Other"
exploded_genres = df["genres"].explode()
genres_counts = exploded_genres.value_counts()
rare_genres = genres_counts[genres_counts < 50].index
df["genres"] = df["genres"].apply(
    lambda x: ["Other_genre"] if any(genre in rare_genres for genre in x) else x
)


## One hot encoding
df = df.join(
    pd.DataFrame.sparse.from_spmatrix(
        mlb.fit_transform(df.pop("genres")), index=df.index, columns=mlb.classes_
    )
)

df

Unnamed: 0,rating,all_playstyles_average,all_playstyles_median,all_playstyles_rushed,all_playstyles_leisure,release_date,developer_size,publisher_size,Amiga,Arcade machine,...,Survival,Survival Horror,Tactical,Text,Third-Person,Top-Down,Tower Defense,Turn-Based,Virtual Reality,Visual Novel
0,0.75,9.67,8.28,5.98,18.48,2009-07-22,2,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.76,23.98,22.95,15.23,41.18,2006-05-18,3,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.79,29.30,24.28,17.95,51.57,2006-09-28,3,2,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.78,28.28,26.00,18.12,43.57,2007-01-18,3,2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.73,20.52,19.68,14.02,30.68,2003-02-11,1,3,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,0.66,8.08,7.80,5.97,14.47,2022-11-18,2,1,0,0,...,0,0,0,0,1,0,0,0,0,0
10777,0.76,42.15,39.13,31.82,69.73,2022-10-27,2,3,0,0,...,0,0,0,0,1,0,0,0,0,0
10778,0.72,1.42,1.48,1.10,1.82,2021-09-24,3,3,0,0,...,0,0,0,0,0,0,0,0,0,0
10779,0.76,2.70,2.32,2.00,4.08,2021-11-01,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
