In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from xgboost.sklearn import XGBRegressor
from numpy import sqrt

In [20]:
# Read data
df = pd.read_csv("data/games.csv")

In [21]:
# Drop rows containing Playtest in the name as not needed
df['Name'] = df['Name'].astype(str)
df = df[df["Name"].str.contains("playtest|demo")==False]

# Drop columns containing too much empty data
df = df.drop(['Reviews','Website','Support url','Support email','Metacritic url',
    'Metacritic score','User score','Score rank','Notes', 'About the game', 'Header image', 'AppID', 'Name', 'Screenshots', 'Movies', 'Release date'
    , 'Supported languages', 'Full audio languages', 'Developers', 'Publishers'], axis=1)

In [22]:
# Fill NA values in string columns
df['Tags'] = df['Tags'].fillna("None")
#df['Developers'].fillna("Unknown", inplace=True)
#df['Publishers'].fillna("Unknown", inplace=True)
df = df[df['Categories'].notna()]

# Subset dataframe based on if column contains NaN values or if the game is a playtest
df = df[df['Genres'].notna()]
df = df[df['Categories'].notna()]

In [23]:
# Explode the columns containing genres, tags and categories. Also languages
df = df.join(df.Categories.str.get_dummies(',').add_prefix("category_"))
df = df.join(df.Genres.str.get_dummies(',').add_prefix("genre_"))
df = df.join(df.Tags.str.get_dummies(',').add_prefix("tag_"))

In [24]:
# Create a new column with the midpoint of the range
df['Estimated Owners Midpoint'] = df['Estimated owners'].apply(lambda x: (int(x.split(' - ')[0]) + int(x.split(' - ')[1]))/2)
df.drop(['Estimated owners'], axis=1, inplace=True)

# Encode TRUE and FALSE columns
df = df.replace({True: 1, False: 0})

# Drop exploded columns
df = df.drop(['Categories', 'Genres', 'Tags'], axis=1)

In [25]:
# Optional to export preprocessed df to a new csv
df.to_csv("data/preprocessed_full.csv", index=False)

In [26]:
# Assign dependent and independent variables 
X = df.drop('Positive', axis=1)
y = df['Positive']

# Create the pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.85)),
    ('regressor', XGBRegressor())
])

In [27]:
# Measure the performance of the model
r2 = cross_val_score(pipe, X, y, cv=5, scoring='r2')
mse = cross_val_score(pipe, X, y, cv=5, scoring='neg_mean_squared_error')
mae = cross_val_score(pipe, X, y, cv=5, scoring='neg_mean_absolute_error')

# Define the steps names for later use
scaler_name = type(pipe.named_steps['scaler']).__name__
model_name = type(pipe.named_steps['regressor']).__name__

print("Metrics for Model: {} using Scaler: {}".format(model_name, scaler_name))
print("Average CV Score (r2): {:.2f}".format(r2.mean()))
print("Average CV Score (MSE): {:.2f}".format(mse.mean()))
print("Average CV Score (MAE): {:.2f}".format(mae.mean()))

Metrics for Model: XGBRegressor using Scaler: StandardScaler
Average CV Score (r2): 0.74
Average CV Score (MSE): -344293035.29
Average CV Score (MAE): -1035.75
