In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import joblib

In [38]:
df = pd.read_csv('../data/cleaned_data_set_v3.csv', sep=',')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20343 entries, 0 to 20342
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Min Players         20343 non-null  int64  
 1   Max Players         20343 non-null  int64  
 2   Play Time           20343 non-null  float64
 3   Min Age             20343 non-null  int64  
 4   Users Rated         20343 non-null  int64  
 5   Rating Average      20343 non-null  float64
 6   BGG Rank            20343 non-null  int64  
 7   Complexity Average  20343 non-null  float64
 8   Owned Users         20320 non-null  float64
 9   Mechanics           20343 non-null  object 
 10  Domains             9663 non-null   object 
dtypes: float64(4), int64(5), object(2)
memory usage: 1.7+ MB


In [39]:
df_non_null_domains = df[df['Domains'].notna()]

df_non_null_domains['Domains_Split'] = df_non_null_domains['Domains'].apply(lambda x: [d.strip() for d in x.split(',')])

all_domains = list(set([domain for sublist in df_non_null_domains['Domains_Split'] for domain in sublist if domain]))

for domain in all_domains:
    df_non_null_domains[domain] = df_non_null_domains['Domains_Split'].apply(lambda x: 1 if domain in x else 0)

domain_counts = df_non_null_domains[all_domains].sum()

print(domain_counts.sort_values(ascending=False))

df = df.merge(df_non_null_domains[all_domains], left_index=True, right_index=True, how='left')

df

Wargames              3148
Strategy Games        2089
Family Games          2066
Thematic Games        1123
Abstract Games        1012
Children's Games       801
Party Games            571
Customizable Games     284
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_non_null_domains['Domains_Split'] = df_non_null_domains['Domains'].apply(lambda x: [d.strip() for d in x.split(',')])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_non_null_domains[domain] = df_non_null_domains['Domains_Split'].apply(lambda x: 1 if domain in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

Unnamed: 0,Min Players,Max Players,Play Time,Min Age,Users Rated,Rating Average,BGG Rank,Complexity Average,Owned Users,Mechanics,Domains,Thematic Games,Abstract Games,Party Games,Family Games,Strategy Games,Customizable Games,Children's Games,Wargames
0,1,4,120.000000,14,42055,8.792440,1,3.8604,68323.0,"Action Queue, Action Retrieval, Campaign / Bat...","Strategy Games, Thematic Games",1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2,4,60.000000,13,41643,8.612780,2,2.8405,65294.0,"Action Points, Cooperative Game, Hand Manageme...","Strategy Games, Thematic Games",1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,2,4,120.000000,14,19217,8.663370,3,3.9129,28785.0,"Hand Management, Income, Loans, Market, Networ...",Strategy Games,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1,5,120.000000,12,64864,8.432540,4,3.2406,87099.0,"Card Drafting, Drafting, End Game Bonuses, Han...",Strategy Games,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,3,6,431.858995,14,13468,8.453179,5,4.2219,16831.0,"Action Drafting, Area Majority / Influence, Ar...","Strategy Games, Thematic Games",1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20338,2,2,30.000000,4,1340,2.278560,20340,1.0000,427.0,Unknown,Children's Games,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
20339,2,99,60.000000,5,2154,2.853310,20341,1.0455,1533.0,"Betting and Bluffing, Bingo, Pattern Recognition",Party Games,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
20340,2,4,30.000000,3,4006,3.177920,20342,1.0779,5788.0,Roll / Spin and Move,Children's Games,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
20341,2,6,30.000000,3,3783,2.855670,20343,1.0201,4400.0,"Dice Rolling, Grid Movement, Race, Roll / Spin...",Children's Games,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [40]:
df_non_null_domains = df[df['Domains'].notna()]

features = [
    "Min Players", "Max Players", "Play Time", "Min Age", "Users Rated", 
    "Rating Average", "BGG Rank", "Complexity Average", "Owned Users"
]
X = df_non_null_domains[features]

domain_columns = [
    'Strategy Games', 'Thematic Games', 'Wargames', 'Family Games', 
    'Customizable Games', 'Abstract Games', 'Party Games', 'Children\'s Games'
]
y = df_non_null_domains[domain_columns]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', MultiOutputClassifier(RandomForestClassifier(
        class_weight='balanced', random_state=42, n_jobs=-1)))
])

In [5]:
param_grid = {
    'classifier__estimator__n_estimators': [100, 200, 300],
    'classifier__estimator__max_depth': [5, 10, 20],
    'classifier__estimator__min_samples_split': [2, 5, 10],
    'classifier__estimator__min_samples_leaf': [1, 2, 4],
    'classifier__estimator__max_features': ['sqrt', 'log2', None],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3, 
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
joblib.dump(best_model, '../models/domains_best_model.pkl')
print("Best Parameters:", grid_search.best_params_)

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best Parameters: {'classifier__estimator__max_depth': 20, 'classifier__estimator__max_features': 'sqrt', 'classifier__estimator__min_samples_leaf': 2, 'classifier__estimator__min_samples_split': 2, 'classifier__estimator__n_estimators': 300}


In [43]:
y_pred = best_model.predict(X_test)
classification_report_output = classification_report(y_test, y_pred, target_names=domain_columns)
print("Classification Report:")
print(classification_report_output)

Classification Report:
                    precision    recall  f1-score   support

    Strategy Games       0.81      0.96      0.88       403
    Thematic Games       0.80      0.86      0.83       225
          Wargames       0.95      0.96      0.95       617
      Family Games       0.81      0.95      0.87       417
Customizable Games       0.96      0.75      0.84        63
    Abstract Games       0.84      0.90      0.87       205
       Party Games       0.84      0.92      0.88       122
  Children's Games       0.86      0.96      0.91       153

         micro avg       0.86      0.93      0.89      2205
         macro avg       0.86      0.91      0.88      2205
      weighted avg       0.86      0.93      0.89      2205
       samples avg       0.88      0.94      0.90      2205



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [44]:
df_missing_domains = df[df['Domains'].isna()]
X_missing = df_missing_domains[features]
missing_predictions = best_model.predict(X_missing)

assert df_missing_domains.shape[0] == missing_predictions.shape[0], \
    "Mismatch between rows and predictions"

for i, (index, row) in enumerate(df_missing_domains.iterrows()):
    predicted_domains = missing_predictions[i]
    predicted_domains_list = [domain_columns[j] for j, val in enumerate(predicted_domains) if val == 1]
    predicted_domains_str = ", ".join(predicted_domains_list)

    df.loc[index, 'Domains'] = predicted_domains_str if pd.isna(df.loc[index, 'Domains']) else \
        f"{df.loc[index, 'Domains']}, {predicted_domains_str}"
df

Unnamed: 0,Min Players,Max Players,Play Time,Min Age,Users Rated,Rating Average,BGG Rank,Complexity Average,Owned Users,Mechanics,Domains,Thematic Games,Abstract Games,Party Games,Family Games,Strategy Games,Customizable Games,Children's Games,Wargames
0,1,4,120.000000,14,42055,8.792440,1,3.8604,68323.0,"Action Queue, Action Retrieval, Campaign / Bat...","Strategy Games, Thematic Games",1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2,4,60.000000,13,41643,8.612780,2,2.8405,65294.0,"Action Points, Cooperative Game, Hand Manageme...","Strategy Games, Thematic Games",1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,2,4,120.000000,14,19217,8.663370,3,3.9129,28785.0,"Hand Management, Income, Loans, Market, Networ...",Strategy Games,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1,5,120.000000,12,64864,8.432540,4,3.2406,87099.0,"Card Drafting, Drafting, End Game Bonuses, Han...",Strategy Games,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,3,6,431.858995,14,13468,8.453179,5,4.2219,16831.0,"Action Drafting, Area Majority / Influence, Ar...","Strategy Games, Thematic Games",1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20338,2,2,30.000000,4,1340,2.278560,20340,1.0000,427.0,Unknown,Children's Games,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
20339,2,99,60.000000,5,2154,2.853310,20341,1.0455,1533.0,"Betting and Bluffing, Bingo, Pattern Recognition",Party Games,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
20340,2,4,30.000000,3,4006,3.177920,20342,1.0779,5788.0,Roll / Spin and Move,Children's Games,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
20341,2,6,30.000000,3,3783,2.855670,20343,1.0201,4400.0,"Dice Rolling, Grid Movement, Race, Roll / Spin...",Children's Games,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [45]:
df.drop(domain_columns, axis=1, inplace=True)

In [46]:
df.to_csv('../data/cleaned_data_set_v4.csv', sep=',', index=False)

In [47]:
df = pd.read_csv('../data/cleaned_data_set_v4.csv', sep=',')
df['Domains'] = df['Domains'].fillna("Unknown")
df.to_csv('../data/cleaned_data_set_v4.csv', sep=',', index=False)