## Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error



## Load Data 

In [2]:
# Read the pre-processed IGDB export into a DataFrame.  The file carries a
# saved index column that pandas loads as 'Unnamed: 0'.
IGDB = pd.read_csv(filepath_or_buffer='./data/IGDB_pre_processing.csv')

In [3]:
# Show the first five rows transposed, so each column of the wide frame
# reads as a row.
IGDB.head().transpose()

Unnamed: 0,0,1,2,3,4
Unnamed: 0,0,1,2,3,4
id,7995,4221,1341,4967,5549
age_ratings,[66005],[58455],"[4566, 34569, 94567, 94629, 123277, 131429]","[66179, 66256]",[2998]
category,0,0,0,0,0
collection,3138.0,687.0,361.0,,815.0
cover,199764.0,161022.0,123347.0,329337.0,5630.0
created_at,1414716913,1390750383,1348912750,1391651125,1394219557
external_games,"[52206, 132934, 1746061, 1921468]","[51645, 143610, 1709454]","[14669, 76711, 95250, 142618, 188993, 216380, ...","[35778, 128039, 247067]","[130300, 1191202]"
first_release_date,2007-03-11,2002-06-03,2003-11-11,2007-05-01,2009-06-12
follows,,,157.0,,2.0


In [4]:
# Remove the leftover saved-index column.  Rebinding (instead of inplace=True)
# together with errors='ignore' makes this cell safe to re-run: a second
# execution is a no-op rather than a KeyError.
IGDB = IGDB.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# List every column's dtype to see which are numeric and which are object-typed.
IGDB.dtypes

id                       int64
age_ratings             object
category                 int64
collection             float64
cover                  float64
created_at               int64
external_games          object
first_release_date      object
follows                float64
game_modes              object
genres                  object
involved_companies      object
keywords                object
name                    object
platforms               object
player_perspectives     object
release_dates           object
screenshots             object
similar_games           object
slug                    object
summary                 object
tags                    object
themes                  object
updated_at               int64
url                     object
websites                object
checksum                object
total_rating           float64
total_rating_count     float64
rating_count           float64
console                float64
operating_system       float64
year                     int64
dtype: object

In [6]:
# (rows, columns) of the full dataset before it is split.
IGDB.shape

(4265, 33)

In [7]:
# Row counts to expect from a 70/30 train/test partition of the dataset.
0.7 * len(IGDB), 0.3 * len(IGDB)

(2985.5, 1279.5)

In [8]:
# Hold out 30% of the rows for testing.  The target is `total_rating`; every
# other column stays on the feature side.  The fixed random_state keeps the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    IGDB.drop(columns=['total_rating']),
    IGDB['total_rating'],
    test_size=0.3,
    random_state=47,
)

In [9]:
# Confirm the feature frames received roughly 70% / 30% of the rows.
X_train.shape, X_test.shape

((2985, 32), (1280, 32))

In [10]:
# The target vectors should have the same row counts as their feature frames.
y_train.shape, y_test.shape

((2985,), (1280,))

In [11]:
# Pull the object-dtype (non-numeric) columns out of the full frame.  Nothing
# is dropped here; `X_train` and `X_test` are stripped of these columns in a
# later cell.
names_list = IGDB.select_dtypes(include='object')

In [12]:
# Keep only the column labels of the object-typed columns.
names_list_columns = names_list.columns

In [13]:
# Set the object-dtype columns aside (names_train / names_test) and keep only
# the remaining columns as model features.
#
# The frames are rebound rather than mutated with inplace=True, and the work
# is guarded by which object columns are still present.  That makes the cell
# safe to re-run: a second execution leaves everything untouched instead of
# raising KeyError on columns that were already removed.
object_cols = [col for col in names_list_columns if col in X_train.columns]
if object_cols:
    names_train = X_train[object_cols]
    names_test = X_test[object_cols]
    X_train = X_train.drop(columns=object_cols)
    X_test = X_test.drop(columns=object_cols)
X_train.shape, X_test.shape

((2985, 12), (1280, 12))

In [14]:
# Check that no object-typed columns remain in the training features.
X_train.dtypes

id                      int64
category                int64
collection            float64
cover                 float64
created_at              int64
follows               float64
updated_at              int64
total_rating_count    float64
rating_count          float64
console               float64
operating_system      float64
year                    int64
dtype: object

In [15]:
# Same check for the test features: no object-typed columns should remain.
X_test.dtypes

id                      int64
category                int64
collection            float64
cover                 float64
created_at              int64
follows               float64
updated_at              int64
total_rating_count    float64
rating_count          float64
console               float64
operating_system      float64
year                    int64
dtype: object

In [16]:
# Baseline model: always predicts the mean of the training target, regardless
# of the features.  Display the constant it learned.
dumb_reg = DummyRegressor(strategy='mean').fit(X_train, y_train)
dumb_reg.constant_

array([[69.06432161]])

In [17]:
# Mean of the training target, computed directly so it can be compared with
# the constant fitted by the DummyRegressor.
train_mean = y_train.mean()
train_mean

69.0643216080402

In [18]:
# Hand-built baseline predictions for the training set: one copy of the
# training mean per training row.  Display the first five.
y_tr_pred_ = np.full(len(y_train), train_mean)
y_tr_pred_[:5]

array([69.06432161, 69.06432161, 69.06432161, 69.06432161, 69.06432161])

In [19]:
# Training-set predictions from the fitted dummy regressor; display the first
# five for comparison with the hand-built array.
y_tr_pred = dumb_reg.predict(X_train)
y_tr_pred[:5]

array([69.06432161, 69.06432161, 69.06432161, 69.06432161, 69.06432161])

In [20]:
# Baseline predictions for the test set: the *training* mean repeated once per
# test row, so no information from the test target is used.
y_te_pred = np.full(len(y_test), train_mean)

In [21]:
# R^2 of the constant-mean baseline on the train and test sets.
r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)

(0.0, -0.00024439880428972494)

In [22]:
# Mean absolute error of the constant-mean baseline (train, test).
mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)

(10.976166595119652, 10.723575690954773)

In [23]:
# Mean squared error of the constant-mean baseline (train, test).
mean_squared_error(y_train, y_tr_pred), mean_squared_error(y_test, y_te_pred)

(205.6240034342567, 195.37713035971314)