## 4.1 Bestimmen Sie, welche Felder Ihrer Daten für Ihr Modell besonders aussagekräftig sind.

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, confusion_matrix, precision_score, recall_score
import joblib

# Cell 4.1: train a gradient-boosting regressor on the Metacritic data and
# rank the input columns by the model's feature importances.

# Seed NumPy's global RNG; the split and the model get their own random_state below.
np.random.seed(42)

# Load the semicolon-separated export and convert the release date to Unix
# seconds so it can be used as a numeric feature.
df = pd.read_csv("daten metacritic.csv", sep=';')
df['r-date'] = pd.to_datetime(df['r-date'], format='%d.%m.%Y')
# Subtracting the epoch and dividing by one second does not depend on the
# storage resolution of the datetime column (unlike astype('int64') // 10**9,
# which is only correct for nanosecond storage).
df['r-date'] = (df['r-date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)

features = ['score', 'r-date', 'critics', 'users', '1 Player', '1-2 Player', 'No Online Multiplayer', 'MP up to 32', 'MP up to 30', 'MP up to 16', 'No info', '1-4 Player']
X = df[features]
y = df['user score']

# 80/20 train/test split with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

# Persist the fitted model to disk.
joblib.dump(gb_model, 'metacritic_model.joblib')

# Draw a random subset of the test rows. The size is capped so that sampling
# without replacement cannot fail when the test split has fewer than 200 rows.
sample_size = min(200, len(X_test))
# seed(None) re-seeds the global RNG, so the subset differs between runs;
# the drawn positions are written to sample_indices.npy.
np.random.seed(None)
random_indices = np.random.choice(len(X_test), sample_size, replace=False)
np.save('sample_indices.npy', random_indices)

# random_indices are positions within the test split -> .iloc
sample_X = X_test.iloc[random_indices]
sample_y = y_test.iloc[random_indices]
# sample_X.index holds row *labels* of df, so they must be looked up with .loc
# (.iloc would only be correct while df keeps its default 0..n-1 index).
sample_names = df.loc[sample_X.index, 'name']
sample_platforms = df.loc[sample_X.index, 'platform']
sample_scores = df.loc[sample_X.index, 'score']

predictions = gb_model.predict(sample_X)

# Per-game comparison of actual and predicted user score for the sampled rows.
comparison = pd.DataFrame({
    'Game': sample_names,
    'Platform': sample_platforms,
    'Critic Score': sample_scores,
    'Actual User Score': sample_y,
    'Predicted User Score': predictions,
    'Error': np.abs(sample_y - predictions)
})


# Predictions and mean absolute error on the full test split.
y_test_pred = gb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_test_pred)
importances = gb_model.feature_importances_

# DataFrame pairing each feature name with its importance
feat_df = pd.DataFrame({
    'Feature': features,
    'Importance': importances
})

# Sort in descending order of importance
feat_df = feat_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

# Output
print("\nFeature Importances:")
print(feat_df.round(4))



Feature Importances:
                  Feature  Importance
0                   score      0.6640
1                  r-date      0.2144
2                   users      0.0705
3   No Online Multiplayer      0.0295
4                 critics      0.0117
5              1-2 Player      0.0053
6                1 Player      0.0020
7                 No info      0.0012
8              1-4 Player      0.0008
9             MP up to 16      0.0006
10            MP up to 32      0.0000
11            MP up to 30      0.0000


## 4.2 Wählen Sie eine geeignete Messmetrik für Ihr Modell und berechnen Sie sie.

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import joblib

# Cell 4.2: train the gradient-boosting regressor, show predictions for a
# random subset of the test split, and report the mean absolute error (MAE)
# on the whole test split.

# Seed NumPy's global RNG; the split and the model get their own random_state below.
np.random.seed(42)

# Load the semicolon-separated export and convert the release date to Unix
# seconds so it can be used as a numeric feature.
df = pd.read_csv("daten metacritic.csv", sep=';')
df['r-date'] = pd.to_datetime(df['r-date'], format='%d.%m.%Y')
# Subtracting the epoch and dividing by one second does not depend on the
# storage resolution of the datetime column (unlike astype('int64') // 10**9,
# which is only correct for nanosecond storage).
df['r-date'] = (df['r-date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)

features = ['score', 'r-date', 'critics', 'users', '1 Player', '1-2 Player', 'No Online Multiplayer', 'MP up to 32', 'MP up to 30', 'MP up to 16', 'No info', '1-4 Player']
X = df[features]
y = df['user score']

# 80/20 train/test split with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)


# Persist the fitted model to disk.
joblib.dump(gb_model, 'metacritic_model.joblib')

# Draw a random subset of the test rows. The size is capped so that sampling
# without replacement cannot fail when the test split has fewer than 200 rows.
sample_size = min(200, len(X_test))
# seed(None) re-seeds the global RNG, so the printed sample differs between
# runs; the drawn positions are written to sample_indices.npy.
np.random.seed(None)
random_indices = np.random.choice(len(X_test), sample_size, replace=False)
np.save('sample_indices.npy', random_indices)

# random_indices are positions within the test split -> .iloc
sample_X = X_test.iloc[random_indices]
sample_y = y_test.iloc[random_indices]
# sample_X.index holds row *labels* of df, so they must be looked up with .loc
# (.iloc would only be correct while df keeps its default 0..n-1 index).
sample_names = df.loc[sample_X.index, 'name']
sample_platforms = df.loc[sample_X.index, 'platform']
sample_scores = df.loc[sample_X.index, 'score']

predictions = gb_model.predict(sample_X)


# Per-game comparison of actual and predicted user score for the sampled rows.
comparison = pd.DataFrame({
    'Game': sample_names,
    'Platform': sample_platforms,
    'Critic Score': sample_scores,
    'Actual User Score': sample_y,
    'Predicted User Score': predictions,
    'Error': np.abs(sample_y - predictions)
})

print("\nSample Predictions vs Actual Values:")
print(comparison.round(2))

# The metric itself is computed on the full test split, not just the sample.
y_test_pred = gb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_test_pred)
print(f"\nMean Absolute Error (MAE) auf Testset: {mae:.3f}")



Sample Predictions vs Actual Values:
                                                Game      Platform  \
485                                    Soldier Elite            PC   
14720            Phantasy Star Online Episode I & II      GameCube   
6062                                    Akiba's Beat  PlayStation4   
7360   LEGO Pirates of the Caribbean: The Video Game           3DS   
3245                                       Sparkle 2  PlayStation4   
...                                              ...           ...   
168                                          Randall  PlayStation4   
14268                 Trauma Center: Under the Knife            DS   
14547                       Worms Forts: Under Siege          Xbox   
9489                    DeathSpank: Thongs of Virtue            PC   
12393                                   Planet Alpha            PC   

       Critic Score  Actual User Score  Predicted User Score  Error  
485              40                3.6             

## 4.3 Wählen Sie geeignete Bedingungen und erstellen Sie eine Wahrheitsmatrix für Ihr Modell. Berechnen Sie darüber hinaus Sensitivität und Spezifizität.

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, confusion_matrix, precision_score, recall_score
import joblib

# Cell 4.3: train the gradient-boosting regressor, binarise actual and
# predicted user scores at a threshold, and report the confusion matrix
# together with precision, recall (= sensitivity) and specificity.

# Seed NumPy's global RNG; the split and the model get their own random_state below.
np.random.seed(42)

# Load the semicolon-separated export and convert the release date to Unix
# seconds so it can be used as a numeric feature.
df = pd.read_csv("daten metacritic.csv", sep=';')
df['r-date'] = pd.to_datetime(df['r-date'], format='%d.%m.%Y')
# Subtracting the epoch and dividing by one second does not depend on the
# storage resolution of the datetime column (unlike astype('int64') // 10**9,
# which is only correct for nanosecond storage).
df['r-date'] = (df['r-date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)

features = ['score', 'r-date', 'critics', 'users', '1 Player', '1-2 Player', 'No Online Multiplayer', 'MP up to 32', 'MP up to 30', 'MP up to 16', 'No info', '1-4 Player']
X = df[features]
y = df['user score']

# 80/20 train/test split with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

# Persist the fitted model to disk.
joblib.dump(gb_model, 'metacritic_model.joblib')

# Draw a random subset of the test rows. The size is capped so that sampling
# without replacement cannot fail when the test split has fewer than 200 rows.
sample_size = min(200, len(X_test))
# seed(None) re-seeds the global RNG, so the subset differs between runs;
# the drawn positions are written to sample_indices.npy.
np.random.seed(None)
random_indices = np.random.choice(len(X_test), sample_size, replace=False)
np.save('sample_indices.npy', random_indices)

# random_indices are positions within the test split -> .iloc
sample_X = X_test.iloc[random_indices]
sample_y = y_test.iloc[random_indices]
# sample_X.index holds row *labels* of df, so they must be looked up with .loc
# (.iloc would only be correct while df keeps its default 0..n-1 index).
sample_names = df.loc[sample_X.index, 'name']
sample_platforms = df.loc[sample_X.index, 'platform']
sample_scores = df.loc[sample_X.index, 'score']

predictions = gb_model.predict(sample_X)

# Per-game comparison of actual and predicted user score for the sampled rows.
comparison = pd.DataFrame({
    'Game': sample_names,
    'Platform': sample_platforms,
    'Critic Score': sample_scores,
    'Actual User Score': sample_y,
    'Predicted User Score': predictions,
    'Error': np.abs(sample_y - predictions)
})


# Task 4.3
# Predict on the full test split inside this cell so it does not depend on a
# y_test_pred variable left behind by another cell.
y_test_pred = gb_model.predict(X_test)

# A score at or above the threshold counts as the positive class.
threshold = 7.0
y_true_bin = (y_test >= threshold).astype(int)
y_pred_bin = (y_test_pred >= threshold).astype(int)

# labels=[0, 1] guarantees a 2x2 matrix even if one class is absent, so the
# four-way unpacking below is always valid.
cm = confusion_matrix(y_true_bin, y_pred_bin, labels=[0, 1])
print("\nConfusion Matrix:")
print(cm)

prec = precision_score(y_true_bin, y_pred_bin)
# Recall is the same quantity as sensitivity: TP / (TP + FN).
rec  = recall_score(y_true_bin, y_pred_bin)
print(f"Precision: {prec:.3f}")
print(f"Recall:    {rec:.3f}")

# Specificity is the true-negative rate: TN / (TN + FP). It is not the same
# as precision, so derive it from the confusion matrix.
tn, fp, fn, tp = cm.ravel()
spec = tn / (tn + fp) if (tn + fp) else float('nan')
print(f"Specificity: {spec:.3f}")



Confusion Matrix:
[[1027  297]
 [ 580 1428]]
Precision: 0.828
Recall:    0.711


## 4.4 Fassen Sie in 50 bis 100 Wörtern zusammen, wie gut Ihr Modell funktioniert, und stellen Sie Hypothesen auf, warum.

Das Modell liefert solide Vorhersagen für die Nutzerbewertungen: Sagt es ein gut bewertetes Spiel (User Score ≥ 7) voraus, trifft das in rund 83 % der Fälle zu (Precision 0.828); allerdings übersieht es etwa 29 % der tatsächlich gut bewerteten Titel (Recall 0.711). Kritikerbewertung und Veröffentlichungsdatum haben mit Abstand den grössten Einfluss (Importance 0.66 bzw. 0.21). Ich vermute, dass professionelle Rezensionen nahe an den Erwartungen der Spieler liegen und dass aktuelle Titel stärker im Gespräch sind und daher besser repräsentiert werden.