In [37]:
import pandas as pd
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.cluster import KMeans
import plotly.graph_objs as go

mlb = MultiLabelBinarizer(sparse_output=True)
pd.options.display.max_rows = 4000

df = pd.read_csv("../data/game_data.csv")
df = df.drop(
    columns=[
        "scrape_id",
        "name",
        "aliases",
        "alias",
        "beat_count",
        "developers",
        "publishers",
        #
        "main_story_polled",
        "main_+_extras_polled",
        "completionist_polled",
        "all_playstyles_polled",
        #
        "main_story_average",
        "main_+_extras_average",
        "completionist_average",
        #
        "main_story_median",
        "main_+_extras_median",
        "completionist_median",
        #
        "main_story_rushed",
        "main_+_extras_rushed",
        "completionist_rushed",
        #
        "main_story_leisure",
        "main_+_extras_leisure",
        "completionist_leisure",
    ]
)

# Rating
df = df.dropna(subset=["rating"])
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")

# Datum vydání
df = df.dropna(subset=["na", "eu", "jp"], how="all")
df["na"] = pd.to_datetime(df["na"], errors="coerce")
df["eu"] = pd.to_datetime(df["eu"], errors="coerce")
df["jp"] = pd.to_datetime(df["jp"], errors="coerce")
df["release_date"] = df[["na", "eu", "jp"]].min(axis=1)

# df["year"] = df["release_date"].dt.year
# df["month"] = df["release_date"].dt.month
# df["day"] = df["release_date"].dt.day
df = df.drop(columns=["na", "eu", "jp", "release_date"])


# Trvání hry
for col in df.columns:
    if "all_playstyles" in col:
        df[col] = pd.to_timedelta(df[col], errors="coerce")
        df[col] = round(df[col].dt.total_seconds() / 3600, 3)

# Platformy
df = df.dropna(subset=["platforms"])
df["platforms"] = df["platforms"].apply(ast.literal_eval)

## Tohle je tady protože existuje i žánr "Arcade"
df["platforms"] = df["platforms"].apply(
    lambda x: ["Arcade machine" if platform == "Arcade" else platform for platform in x]
)

## Nahrazení méně častých platforem za "Other"
exploded_platforms = df["platforms"].explode()
platform_counts = exploded_platforms.value_counts()
rare_platforms = platform_counts[platform_counts < 50].index
df["platforms"] = df["platforms"].apply(
    lambda x: ["Other_platform"]
    if any(platform in rare_platforms for platform in x)
    else x
)

## One hot encoding
df = df.join(
    pd.DataFrame.sparse.from_spmatrix(
        mlb.fit_transform(df.pop("platforms")), index=df.index, columns=mlb.classes_
    )
)


# Žánry
df = df.dropna(subset=["genres"])
df["genres"] = df["genres"].apply(ast.literal_eval)

## Nahrazení méně častých žánrů za "Other"
exploded_genres = df["genres"].explode()
genres_counts = exploded_genres.value_counts()
rare_genres = genres_counts[genres_counts < 100].index
df["genres"] = df["genres"].apply(
    lambda x: ["Other_genre"] if any(genre in rare_genres for genre in x) else x
)


## One hot encoding
df = df.join(
    pd.DataFrame.sparse.from_spmatrix(
        mlb.fit_transform(df.pop("genres")), index=df.index, columns=mlb.classes_
    )
)

# df = df.drop(columns=["platforms", "genres"])

In [38]:
# Vývojáři a vydavatelé
df = df.dropna(subset=["developer", "publisher"], how="all")
df["developer"].fillna(df["publisher"], inplace=True)
df["publisher"].fillna(df["developer"], inplace=True)

from sklearn.cluster import KMeans
import plotly.graph_objs as go
import pandas as pd

# Spočítání frekvencí pro vývojáře a vydavatele
game_counts_developer = df["developer"].value_counts()
game_counts_publisher = df["publisher"].value_counts()


# Funkce pro vytvoření Elbow grafu
def elbow_plot(data, title):
    wcss = []
    for i in range(1, 11):
        kmeans = KMeans(n_clusters=i, max_iter=300, n_init=10, random_state=0)
        kmeans.fit(data.values.reshape(-1, 1))
        wcss.append(kmeans.inertia_)

    fig = go.Figure(data=go.Scatter(x=list(range(1, 11)), y=wcss, mode="lines+markers"))
    fig.update_layout(title=title, xaxis_title="Počet klastrů", yaxis_title="WCSS")
    fig.show()


# Vytvoření Elbow grafu pro vývojáře
elbow_plot(game_counts_developer, "Elbow Metoda pro Vývojáře")

# Vytvoření Elbow grafu pro vydavatele
elbow_plot(game_counts_publisher, "Elbow Metoda pro Vydavatele")

# Po určení optimálního počtu klastrů pomocí grafů
# Můžete použít KMeans pro klastrování a přidání výsledků do DataFrame
optimal_clusters_developer = 3  # Změňte podle vašeho grafu
optimal_clusters_publisher = 2  # Změňte podle vašeho grafu

kmeans_developer = KMeans(n_clusters=optimal_clusters_developer, random_state=0)
developer_clusters = kmeans_developer.fit_predict(
    game_counts_developer.values.reshape(-1, 1)
)
developer_cluster_map = dict(zip(game_counts_developer.index, developer_clusters))

kmeans_publisher = KMeans(n_clusters=optimal_clusters_publisher, random_state=0)
publisher_clusters = kmeans_publisher.fit_predict(
    game_counts_publisher.values.reshape(-1, 1)
)
publisher_cluster_map = dict(zip(game_counts_publisher.index, publisher_clusters))

# Mapování klastrů zpět do původního DataFrame
df["developer_size"] = df["developer"].map(developer_cluster_map)
df["publisher_size"] = df["publisher"].map(publisher_cluster_map)

df = df.drop(columns=["developer", "publisher"])
len(df.columns)







80

In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
import kerastuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler

X = df.drop("rating", axis=1)
y = (df["rating"] > 0.7).astype(int)  # 1 pro hodnocení > 0.7, jinak 0

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalizace dat
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# def build_model(hp):
#     model = Sequential()
#     model.add(
#         Dense(
#             hp.Int("units", min_value=32, max_value=512, step=32),
#             activation="relu",
#             input_shape=(X_train.shape[1],),
#         )
#     )
#     model.add(
#         Dense(
#             hp.Int("units", min_value=32, max_value=512, step=32),
#             activation="relu",
#         )
#     )
#     model.add(
#         Dense(
#             hp.Int("units", min_value=32, max_value=512, step=32),
#             activation="relu",
#         )
#     )
#     model.add(
#         Dense(
#             hp.Int("units", min_value=32, max_value=512, step=32),
#             activation="relu",
#         )
#     )

#     model.add(Dense(1, activation="sigmoid"))

#     # Nastavení learning rate
#     lr = hp.Float("learning_rate", min_value=1e-4, max_value=1e-2, sampling="log")

#     model.compile(
#         optimizer=Adam(learning_rate=lr),
#         loss="binary_crossentropy",
#         metrics=["accuracy"],
#     )
#     return model


# # Nastavení tuneru
# tuner = kt.Hyperband(
#     build_model,
#     objective="val_accuracy",
#     max_epochs=10,
#     hyperband_iterations=2,
#     directory="my_dir",
#     project_name="keras_tuning",
# )

# # Spuštění ladění hyperparametrů
# tuner.search(X_train, y_train, epochs=10, validation_split=0.1)

# # Získání nejlepšího modelu a jeho hyperparametrů
# best_model = tuner.get_best_models(num_models=1)[0]
# best_hyperparameters = tuner.get_best_hyperparameters()

# print("Nejlepší hyperparametry:")
# print(best_hyperparameters[0].values)
# print("Nejlepší model:")
# print(best_model.summary())
# X_train.shape[1]


pandas.DataFrame with sparse columns found.It will be converted to a dense numpy array.


pandas.DataFrame with sparse columns found.It will be converted to a dense numpy array.


pandas.DataFrame with sparse columns found.It will be converted to a dense numpy array.



Reloading Tuner from my_dir/keras_tuning/tuner0.json
Nejlepší hyperparametry:
{'units': 352, 'learning_rate': 0.0011238400290001951, 'tuner/epochs': 10, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}
Nejlepší model:
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 352)               28160     
                                                                 
 dense_1 (Dense)             (None, 352)               124256    
                                                                 
 dense_2 (Dense)             (None, 352)               124256    
                                                                 
 dense_3 (Dense)             (None, 352)               124256    
                                                                 
 dense_4 (Dense)             (None, 1)                 353       
                     

79

In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import keras.optimizers

# Sestavení neuronové sítě pro binární klasifikaci
model = Sequential()
model.add(Dense(352, activation="relu", input_shape=(X_train.shape[1],)))
model.add(Dense(256, activation="relu"))
model.add(Dense(128, activation="relu"))
model.add(Dense(64, activation="relu"))


model.add(Dense(1, activation="sigmoid"))  # Binární výstup

optimizer = keras.optimizers.Adam(learning_rate=0.0011238400290001951)
# Kompilace modelu
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])


# Trénování modelu s EarlyStopping
model.fit(X_train, y_train, validation_split=0.1, epochs=10, batch_size=32)
# Evaluace modelu
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.7198458313941956, Test Accuracy: 0.6668346524238586


In [41]:
from keras_visualizer import visualizer

# create your model here
# model = ...

visualizer(model, file_format="png", view=True)

In [42]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, log_loss
import numpy as np

# Predikce na testovací sadě
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype("int32")

# Výpočet matice záměn
conf_matrix = confusion_matrix(y_test, y_pred_classes)

# Výpočet dalších klasifikačních metrik
classification_report = classification_report(y_test, y_pred_classes)
roc_auc = roc_auc_score(y_test, y_pred)

log_loss_val = log_loss(y_test, y_pred)

# Výpis výsledků
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_report)
print("\nROC AUC Score:")
print(roc_auc)
print("\nLog Loss:")
print(log_loss_val)

# Výpočet log loss


# Výpočet sensitivity a specificity z matice záměn
TN, FP, FN, TP = conf_matrix.ravel()
sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)

print("\nSensitivity:", sensitivity)
print("Specificity:", specificity)

Confusion Matrix:
[[735 274]
 [387 588]]

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.73      0.69      1009
           1       0.68      0.60      0.64       975

    accuracy                           0.67      1984
   macro avg       0.67      0.67      0.66      1984
weighted avg       0.67      0.67      0.67      1984


ROC AUC Score:
0.7216030088180733

Log Loss:
0.7119542113183052

Sensitivity: 0.6030769230769231
Specificity: 0.7284440039643211


No applications found for mimetype: image/png
.

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

X = df.drop("rating", axis=1)
y = df["rating"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalizace dat
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Sestavení neuronové sítě
model = Sequential()
model.add(Dense(64, activation="relu", input_shape=(X_train.shape[1],)))
model.add(Dense(32, activation="relu"))
model.add(Dense(1))

# Kompilace modelu
model.compile(optimizer="adam", loss="mean_squared_error")

# Trénování modelu
model.fit(X_train, y_train, epochs=10, validation_split=0.1)

# Evaluace modelu
loss = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")

import plotly.graph_objects as go
import numpy as np

# Výpočet predikcí modelu
predictions = model.predict(X_test).flatten()

# Vytvoření grafu
fig = go.Figure()

# Přidání skutečných hodnot
fig.add_trace(
    go.Scatter(
        x=np.arange(len(y_test)),
        y=y_test,
        mode="markers",
        name="Skutečné hodnoty",
        marker=dict(color="blue", size=10, line=dict(color="DarkSlateGrey", width=2)),
    )
)

# Přidání predikcí
fig.add_trace(
    go.Scatter(
        x=np.arange(len(predictions)),
        y=predictions,
        mode="markers",
        name="Predikce",
    )
)

# Aktualizace rozvržení grafu
fig.update_layout(
    title="Porovnání skutečných hodnot a predikcí",
    xaxis_title="Index",
    yaxis_title="Hodnota",
    legend_title="Legenda",
)

# Zobrazení grafu
fig.show()

/usr/bin/xdg-open: 882: x-www-browser: not found
/usr/bin/xdg-open: 882: firefox: not found
/usr/bin/xdg-open: 882: iceweasel: not found
/usr/bin/xdg-open: 882: seamonkey: not found
/usr/bin/xdg-open: 882: mozilla: not found
/usr/bin/xdg-open: 882: epiphany: not found
/usr/bin/xdg-open: 882: konqueror: not found
/usr/bin/xdg-open: 882: chromium: not found
/usr/bin/xdg-open: 882: chromium-browser: not found
/usr/bin/xdg-open: 882: google-chrome: not found
/usr/bin/xdg-open: 882: www-browser: not found
/usr/bin/xdg-open: 882: links2: not found
/usr/bin/xdg-open: 882: elinks: not found
/usr/bin/xdg-open: 882: links: not found
/usr/bin/xdg-open: 882: lynx: not found
/usr/bin/xdg-open: 882: w3m: not found
xdg-open: no method available for opening 'graph.png'
