In [72]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler

In [69]:
# Load the BoardGameGeek CSV and build the model inputs:
#   feature_matrix = standardized numeric columns + one-hot 'Mechanics'
#   domain_matrix  = one-hot 'Domains' (the multilabel target)

# Resolve the CSV relative to the current user's home directory rather than a
# machine-specific absolute path. This is the same `~/.cache/kagglehub/...`
# location the original hard-coded Windows path pointed at.
DATA_PATH = (
    Path.home() / ".cache" / "kagglehub" / "datasets" / "melissamonfared"
    / "board-games" / "versions" / "1" / "BGG_Data_Set.csv"
)

# Single source of truth for the numeric feature columns (previously typed twice).
NUMERIC_COLUMNS = [
    'Min Players', 'Max Players', 'Play Time', 'Min Age',
    'Rating Average', 'BGG Rank', 'Complexity Average',
]

data = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')

# Replace NaN values with empty strings so the filter and the string split
# below can treat every cell as text.
data['Mechanics'] = data['Mechanics'].fillna('')
data['Domains'] = data['Domains'].fillna('')

# Remove rows where 'Domains' is still empty, then renumber the rows 0..n-1
# so every frame derived from `data` shares the same index.
data = data[data['Domains'] != ""].reset_index(drop=True)

# Split the comma-separated strings into lists, stripping whitespace and
# dropping empty entries (an empty string becomes an empty list).
for label_column in ('Mechanics', 'Domains'):
    data[label_column] = data[label_column].str.split(', ').apply(
        lambda labels: [label.strip() for label in labels if label.strip()]
    )

# NOTE(review): the scaler is fit on every row, including rows that later end
# up in the test split, so test-set statistics leak into the features. For a
# strictly leak-free evaluation, fit the scaler on the training rows only.
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(data[NUMERIC_COLUMNS])
scaled_numeric_df = pd.DataFrame(scaled_numeric, columns=NUMERIC_COLUMNS, index=data.index)

# Binarize the cleaned lists. Each label set gets its own binarizer so the
# mechanics vocabulary is not overwritten when the domains are fitted.
# `mlb` stays bound to the domain-fitted binarizer, as it was before.
mechanics_mlb = MultiLabelBinarizer()
attribute_matrix = pd.DataFrame(
    mechanics_mlb.fit_transform(data['Mechanics']),
    columns=mechanics_mlb.classes_,
    index=data.index,
)

mlb = MultiLabelBinarizer()
domain_matrix = pd.DataFrame(
    mlb.fit_transform(data['Domains']),
    columns=mlb.classes_,
    index=data.index,
)

# Build feature matrix from the same filtered `data`; all pieces carry
# `data.index`, so the column-wise concat lines rows up explicitly.
feature_matrix = pd.concat([scaled_numeric_df, attribute_matrix], axis=1)
assert feature_matrix.index.equals(domain_matrix.index), "features and targets are misaligned"


In [129]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    feature_matrix,
    domain_matrix,
    test_size=0.2,
    random_state=42,
)


In [71]:
# Tune a KNN classifier wrapped in MultiOutputClassifier for the multilabel
# domain targets.
knn = MultiOutputClassifier(KNeighborsClassifier())

# The `estimator__` prefix routes each hyper-parameter through the
# MultiOutputClassifier wrapper to the underlying KNeighborsClassifier.
param_grid_knn = dict(
    estimator__n_neighbors=[10, 11, 12],
    estimator__weights=['uniform', 'distance'],
    estimator__metric=['euclidean', 'manhattan'],
)

# Exhaustive search over the grid with 5-fold cross-validation on the
# training split only.
knn_grid = GridSearchCV(estimator=knn, param_grid=param_grid_knn, cv=5)
knn_grid.fit(x_train, y_train)

print("Best KNN parameters:", knn_grid.best_params_)

Best KNN parameters: {'estimator__metric': 'manhattan', 'estimator__n_neighbors': 11, 'estimator__weights': 'distance'}


In [130]:

# Score the best estimator from the grid search on the held-out split.
best_knn = knn_grid.best_estimator_
y_pred = best_knn.predict(x_test)

# Compute all metrics first, then report them together.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
hamming = hamming_loss(y_true=y_test, y_pred=y_pred)
f1_micro = f1_score(y_true=y_test, y_pred=y_pred, average='micro')
f1_macro = f1_score(y_true=y_test, y_pred=y_pred, average='macro')

print(f"KNN Accuracy: {accuracy}")
print("Hamming Loss:", hamming)
print("F1 Score (micro):", f1_micro)
print("F1 Score (macro):", f1_macro)

KNN Accuracy: 0.6200294550810015
Hamming Loss: 0.0703240058910162
F1 Score (micro): 0.729589428975932
F1 Score (macro): 0.6247459019197275
