In [111]:
import pandas as pd
import ast
from sklearn.preprocessing import MultiLabelBinarizer

# Allow up to 4000 rows to be shown before pandas truncates displayed output.
pd.set_option("display.max_rows", 4000)
# Shared encoder for the list-valued columns; it returns a sparse indicator matrix.
mlb = MultiLabelBinarizer(sparse_output=True)

# Columns removed right after loading, grouped by kind.
unused_metadata_columns = [
    "scrape_id",
    "name",
    "aliases",
    "alias",
    "beat_count",
    "developers",
    "publishers",
]
poll_count_columns = [
    "main_story_polled",
    "main_+_extras_polled",
    "completionist_polled",
    "all_playstyles_polled",
]
# Per-playstyle timing statistics; no "all_playstyles" timing column is listed here.
playstyle_timing_columns = [
    "main_story_average",
    "main_+_extras_average",
    "completionist_average",
    "main_story_median",
    "main_+_extras_median",
    "completionist_median",
    "main_story_rushed",
    "main_+_extras_rushed",
    "completionist_rushed",
    "main_story_leisure",
    "main_+_extras_leisure",
    "completionist_leisure",
]

# Load the raw table and discard the unused columns in one step.
df = pd.read_csv("../data/game_data.csv").drop(
    columns=unused_metadata_columns + poll_count_columns + playstyle_timing_columns
)

# Rating
# Convert before filtering: errors="coerce" turns unparsable ratings into NaN,
# so the NaN filter has to run afterwards to remove those rows as well.
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")
df = df.dropna(subset=["rating"])

# Release date
# Parse the regional dates first; errors="coerce" turns unparsable values into NaT.
release_columns = ["na", "eu", "jp"]
for region in release_columns:
    df[region] = pd.to_datetime(df[region], errors="coerce")

# Keep only rows with at least one valid regional date. Filtering after parsing
# also removes rows whose dates were all unparsable, which would otherwise end
# up with NaN year/month/day values.
df = df.dropna(subset=release_columns, how="all")

# The earliest of the regional dates is used as the release date.
df["release_date"] = df[release_columns].min(axis=1)

df["year"] = df["release_date"].dt.year
df["month"] = df["release_date"].dt.month
df["day"] = df["release_date"].dt.day
df = df.drop(columns=release_columns + ["release_date"])


# Game duration
# Every column whose name contains "all_playstyles" is parsed as a timedelta
# (unparsable values become NaT) and stored as hours rounded to two decimals.
duration_columns = [name for name in df.columns if "all_playstyles" in name]
for name in duration_columns:
    elapsed = pd.to_timedelta(df[name], errors="coerce")
    df[name] = (elapsed.dt.total_seconds() / 3600).round(2)

# Developers and publishers
# Rows with neither a developer nor a publisher are dropped; when only one of
# the two is present it stands in for the other.
df = df.dropna(subset=["developer", "publisher"], how="all")
# Assign the filled columns back instead of calling fillna(inplace=True) on
# df[...]: that form is chained assignment, which pandas deprecates and which
# only updates a temporary object under Copy-on-Write, leaving df unchanged.
df["developer"] = df["developer"].fillna(df["publisher"])
df["publisher"] = df["publisher"].fillna(df["developer"])

# Number of rows (games) per developer and per publisher.
game_counts_developer = df["developer"].value_counts()
game_counts_publisher = df["publisher"].value_counts()

# Game-count boundaries between the company size categories.
thresholds = {"small": 5, "medium": 15}


def categorize_company(size):
    """Map a company's game count to a size category.

    Returns 1 when ``size`` is below ``thresholds["small"]``, 2 when it is at
    most ``thresholds["medium"]``, and 3 otherwise. A NaN ``size`` fails both
    comparisons and therefore also yields 3.
    """
    return (
        1
        if size < thresholds["small"]
        else 2
        if size <= thresholds["medium"]
        else 3
    )


# Create the 'developer_size' and 'publisher_size' columns holding the size
# category of each game's developer and publisher.
df["developer_size"] = (
    df["developer"].map(game_counts_developer).map(categorize_company)
)
# Publisher sizes must come from the publisher counts. Looking publishers up in
# the developer counts yields NaN for every publisher that never appears as a
# developer, and NaN fails both comparisons in categorize_company, so such
# publishers would all be labelled 3.
df["publisher_size"] = (
    df["publisher"].map(game_counts_publisher).map(categorize_company)
)

# The raw company names are no longer needed once the sizes are derived.
df = df.drop(columns=["developer", "publisher"])

# Platforms
df = df.dropna(subset=["platforms"])
# Each cell is parsed with ast.literal_eval (presumably the string form of a
# list of platform names -- confirm against the CSV).
df["platforms"] = df["platforms"].apply(ast.literal_eval)

## Needed because a genre named "Arcade" exists as well
df["platforms"] = df["platforms"].apply(
    lambda x: ["Arcade machine" if platform == "Arcade" else platform for platform in x]
)

## Replace rare platforms (fewer than 50 occurrences) with "Other_platform"
exploded_platforms = df["platforms"].explode()
platform_counts = exploded_platforms.value_counts()
rare_platforms = platform_counts[platform_counts < 50].index
# Only the rare entries are replaced; a game's common platforms are kept.
# Replacing the whole list whenever any entry is rare would discard them.
# dict.fromkeys removes the duplicate "Other_platform" entries that appear when
# a game has several rare platforms, while preserving order.
df["platforms"] = df["platforms"].apply(
    lambda x: list(
        dict.fromkeys(
            "Other_platform" if platform in rare_platforms else platform
            for platform in x
        )
    )
)

## One hot encoding
# The list column is popped from df and replaced by one sparse indicator
# column per platform label.
df = df.join(
    pd.DataFrame.sparse.from_spmatrix(
        mlb.fit_transform(df.pop("platforms")), index=df.index, columns=mlb.classes_
    )
)


# Genres
df = df.dropna(subset=["genres"])
# Each cell is parsed with ast.literal_eval (presumably the string form of a
# list of genre names -- confirm against the CSV).
df["genres"] = df["genres"].apply(ast.literal_eval)

## Replace rare genres (fewer than 50 occurrences) with "Other_genre"
exploded_genres = df["genres"].explode()
genres_counts = exploded_genres.value_counts()
rare_genres = genres_counts[genres_counts < 50].index
# Only the rare entries are replaced; a game's common genres are kept.
# Replacing the whole list whenever any entry is rare would discard them.
# dict.fromkeys removes the duplicate "Other_genre" entries that appear when a
# game has several rare genres, while preserving order.
df["genres"] = df["genres"].apply(
    lambda x: list(
        dict.fromkeys("Other_genre" if genre in rare_genres else genre for genre in x)
    )
)


## One hot encoding
# The list column is popped from df and replaced by one sparse indicator
# column per genre label.
df = df.join(
    pd.DataFrame.sparse.from_spmatrix(
        mlb.fit_transform(df.pop("genres")), index=df.index, columns=mlb.classes_
    )
)


# Missing-value count per column after preprocessing.
df.isna().sum()

AttributeError: type object 'DataFrame' has no attribute 'dense'

In [110]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Target is the "rating" column; every other column is used as a feature.
y = df["rating"]
X = df.drop(columns="rating")

# Hold out 20 % of the rows for testing, with a fixed split seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Data normalization: the scaler is fitted on the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the neural network: two hidden ReLU layers and a single output unit.
n_features = X_train.shape[1]
model = Sequential(
    [
        Dense(64, activation="relu", input_shape=(n_features,)),
        Dense(32, activation="relu"),
        Dense(1),  # Output layer
    ]
)

# Compile the model
model.compile(loss="mean_squared_error", optimizer="adam")

# Train the model, holding back 10 % of the training data for validation.
model.fit(X_train, y_train, validation_split=0.1, epochs=10)

# Evaluate the model on the held-out test split.
loss = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")

import plotly.graph_objects as go
import numpy as np

# Compute the model predictions for the test split as a flat 1-D array.
predictions = model.predict(X_test).flatten()

# Actual values: blue markers with a dark outline.
actual_trace = go.Scatter(
    x=np.arange(len(y_test)),
    y=y_test,
    mode="markers",
    name="Skutečné hodnoty",
    marker={
        "color": "blue",
        "size": 10,
        "line": {"color": "DarkSlateGrey", "width": 2},
    },
)

# Predictions: default marker styling.
predicted_trace = go.Scatter(
    x=np.arange(len(predictions)),
    y=predictions,
    mode="markers",
    name="Predikce",
)

# Create the figure with both traces, actual values first.
fig = go.Figure(data=[actual_trace, predicted_trace])

# Update the figure layout
layout_options = {
    "title": "Porovnání skutečných hodnot a predikcí",
    "xaxis_title": "Index",
    "yaxis_title": "Hodnota",
    "legend_title": "Legenda",
}
fig.update_layout(**layout_options)

# Show the figure
fig.show()

Epoch 1/10



pandas.DataFrame with sparse columns found.It will be converted to a dense numpy array.


pandas.DataFrame with sparse columns found.It will be converted to a dense numpy array.


pandas.DataFrame with sparse columns found.It will be converted to a dense numpy array.



Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.014032538048923016
