#### import external libraries

In [48]:
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [49]:
# setup the directory
import sys
sys.path.append('../../')

#### IPython extensions

In [50]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### import internal libraries

In [51]:
from preprocess.utils import preprocess as ps
from preprocess.utils import visualizations as vs
from preprocess.utils import exploratory_analysis as EDA

## Constants

In [52]:
PATH_FILE = '../../data/datasets/'
FILE_NAME = 'Base_de_Datos.csv'

In [53]:
DROP_COLS = ['Year','Publisher']

## Data gathering

In [54]:
df = pd.read_csv(PATH_FILE + FILE_NAME)
df.head(n=2)

Unnamed: 0,Year,Publisher,Global_Sales,Action,Platform,Adventure,Puzzle,Shooter,Misc,Sports,Racing,Simulation,Fighting,Role-Playing,Strategy
0,1983,Activision,1.94,2,1,0,0,0,0,0,0,0,0,0,0
1,1983,Nintendo,10.96,0,4,0,0,0,1,1,0,0,0,0,0


### First filter before splitting and preprocessing

In [55]:
def filter_dataframe(df, col, val):
    '''
    Return the rows of ``df`` whose column ``col`` equals ``val``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    col : int or str
        Positional index into ``df.columns`` (original behaviour) or the
        column label itself.
    val : object
        Value to match. It is converted with ``str`` before comparing, so
        this helper is meant for string-valued columns such as a publisher
        name.

    Returns
    -------
    pandas.DataFrame
        The matching rows, keeping their original index labels.
    '''
    col_name = col if isinstance(col, str) else df.columns[col]
    # Boolean masking instead of ``df.query(f"{col}==@val")``: the query
    # string is parsed as an expression, so labels that are not valid
    # identifiers (e.g. 'Role-Playing') would be misread or rejected.
    mask = df[col_name] == str(val)
    return df[mask]

In [56]:
df_nintendo = filter_dataframe(df, col=1, val='Nintendo')
df_nintendo.head(n=8)

Unnamed: 0,Year,Publisher,Global_Sales,Action,Platform,Adventure,Puzzle,Shooter,Misc,Sports,Racing,Simulation,Fighting,Role-Playing,Strategy
1,1983,Nintendo,10.96,0,4,0,0,0,1,1,0,0,0,0,0
3,1984,Nintendo,45.56,1,0,0,1,2,1,2,2,0,0,0,0
5,1985,Nintendo,49.95,1,4,0,1,0,0,1,0,0,0,0,0
6,1986,Nintendo,16.18,3,2,0,0,0,0,0,1,0,0,0,0
8,1987,Nintendo,11.95,0,0,1,0,0,0,1,0,0,2,0,0
10,1988,Nintendo,36.44,0,3,0,1,0,0,1,1,0,0,0,0
12,1989,Nintendo,63.88,0,1,0,3,1,1,3,0,0,0,0,0
13,1990,Nintendo,35.49,0,1,0,2,0,0,1,2,1,0,0,0


In [57]:
df_nintendo.shape

(33, 15)

## Data split

In [58]:
# Hold out 15% of the Nintendo rows as a test set; random_state pins the shuffle so the split is repeatable
df_nintendo_train, df_nintendo_test = train_test_split(df_nintendo, random_state=42, test_size=0.15)
df_nintendo_train.shape , df_nintendo_test.shape

((28, 15), (5, 15))

## Preprocessing

In [59]:
# Run the project preprocessing step on the training split, dropping DROP_COLS.
# NOTE(review): the displayed result carries the sales target in a column named `y`
# (presumably renamed from Global_Sales inside ps.preprocess — confirm).
# The input name is overwritten, so re-running this cell feeds already-preprocessed data back in.
df_nintendo_train = ps.preprocess(df=df_nintendo_train,
                         drop_cols=DROP_COLS)
df_nintendo_train.head(n=2)

Unnamed: 0,Action,Platform,Adventure,Puzzle,Shooter,Misc,Sports,Racing,Simulation,Fighting,Role-Playing,Strategy,y
16,1,1,1,4,0,2,0,1,0,0,0,0,38.11
59,6,4,4,0,1,2,1,0,1,0,3,0,48.31


## Multiple linear regression

#### Training

In [60]:
# Dimensions of the df
df_nintendo_train.shape

(28, 13)

In [61]:
# Separate the predictor features (X) from the target (y); this frame contains both train and validation data
y = df_nintendo_train['y']
X = df_nintendo_train.drop(['y'], axis = 1)
X.head(n=2)

Unnamed: 0,Action,Platform,Adventure,Puzzle,Shooter,Misc,Sports,Racing,Simulation,Fighting,Role-Playing,Strategy
16,1,1,1,4,0,2,0,1,0,0,0,0
59,6,4,4,0,1,2,1,0,1,0,3,0


In [62]:
# Choose the number of histogram bins from the number of training rows
lenght_df = df_nintendo_train.shape[0]
nbins = EDA.number_of_bins(n=lenght_df)

# Histogram of the target `y` on the training split
vs.histogram_plot(df=df_nintendo_train,
                  rand_var='y',
                  nbins=nbins,
                  title='Histogram of the target feature')

In [63]:
# Boxplot of global sales
vs.box_plot(df=df_nintendo_train,
            rand_var='y',
            title='Boxplot of target variable')

In [64]:
# set up the model
model = LinearRegression()
model

LinearRegression()

In [65]:
# train the model
model.fit(X, y)

LinearRegression()

In [74]:
model.get_params()

{'copy_X': True,
 'fit_intercept': True,
 'n_jobs': None,
 'normalize': 'deprecated',
 'positive': False}

#### Testing

In [66]:
# Apply the same preprocessing call (same DROP_COLS) to the held-out test split.
# The input name is overwritten, so this cell should be run once per split.
df_nintendo_test = ps.preprocess(df=df_nintendo_test,
                         drop_cols=DROP_COLS)
df_nintendo_test.head(n=2)

Unnamed: 0,Action,Platform,Adventure,Puzzle,Shooter,Misc,Sports,Racing,Simulation,Fighting,Role-Playing,Strategy,y
119,4,3,0,1,0,2,2,1,1,2,4,0,48.65
39,1,3,1,1,0,1,3,2,2,0,1,2,48.41


In [67]:
# Separate the test target from the test predictors; only the target's index is reset to 0..n-1
y_nintendo_test = df_nintendo_test['y'].reset_index(drop=True)
x_nintendo_test = df_nintendo_test.drop(['y'], axis = 1)
x_nintendo_test.head(n=2)

Unnamed: 0,Action,Platform,Adventure,Puzzle,Shooter,Misc,Sports,Racing,Simulation,Fighting,Role-Playing,Strategy
119,4,3,0,1,0,2,2,1,1,2,4,0
39,1,3,1,1,0,1,3,2,2,0,1,2


In [68]:
# predict
y_nintendo_pred = model.predict(x_nintendo_test)
y_nintendo_pred

array([74.30717325, 81.60108513, 87.10936469, 71.02627985, 35.44758091])

In [69]:
# Check the Mean Absolute Error (MAE)
MAE = metrics.mean_absolute_error(y_true=y_nintendo_test, y_pred=y_nintendo_pred)
print(f"Mean Absolute Error of linear regression Nintendo: {MAE:.3f}")

Mean Absolute Error of linear regression Nintendo: 31.417


In [72]:
# coefficients of the model
W = model.coef_
b = model.intercept_
final_model = {'W' : W, 
               'b' : b}
final_model

{'W': array([ 1.4347142 , -0.13305766, -2.84890979,  3.69113347, -8.74696868,
         7.06827802, 16.77245874,  9.00126536,  8.56099942,  7.50349829,
        -4.00520049, -4.91404306]),
 'b': 1.0464230428550039}

## Multiple linear regression for all publishers

In [23]:
# Unique values
unique_publisher = df.Publisher.unique()

In [30]:
# Fit and score one linear regression per publisher, repeating the Nintendo
# workflow above (filter -> split -> preprocess -> fit -> predict -> MAE).
# Loop variables use a `pub_` prefix so this cell does not rebind the
# notebook-level `X`, `y`, `model` and `MAE` created in the Nintendo section;
# otherwise re-running an earlier cell would silently use the last publisher.
MAE_linear_reg = {}          # publisher -> MAE on that publisher's test split
predictions_linear_reg = {}  # publisher -> array of test-split predictions

for publisher in unique_publisher:
    # training section
    pub_rows = filter_dataframe(df, col=1, val=publisher)
    pub_train, pub_test = train_test_split(pub_rows, random_state=42, test_size=0.15)
    pub_train = ps.preprocess(df=pub_train,
                              drop_cols=DROP_COLS)
    # set up and train the model
    pub_model = LinearRegression()
    pub_model.fit(pub_train.drop(['y'], axis=1), pub_train['y'])
    # testing section
    pub_test = ps.preprocess(df=pub_test,
                             drop_cols=DROP_COLS)
    pub_pred = pub_model.predict(pub_test.drop(['y'], axis=1))
    pub_mae = metrics.mean_absolute_error(y_true=pub_test['y'], y_pred=pub_pred)
    predictions_linear_reg[publisher] = pub_pred
    MAE_linear_reg[publisher] = pub_mae
    print(f"Mean Absolute Error (MAE) of {publisher}: : {pub_mae:.4f}")

Mean Absolute Error (MAE) of Activision: : 18.9642
Mean Absolute Error (MAE) of Nintendo: : 31.4166
Mean Absolute Error (MAE) of Electronic Arts: : 21.6939
Mean Absolute Error (MAE) of Sony Computer Entertainment: : 16.5260
Mean Absolute Error (MAE) of Ubisoft: : 14.0747


In [41]:
# dictionary to dataframe: one row per publisher, one column per test prediction.
# Column names are generated from the column position, so every prediction
# column is labelled even when a publisher has more than five test rows
# (shorter prediction arrays are padded with NaN by from_dict).
df_predictions = (
    pd.DataFrame.from_dict(predictions_linear_reg, orient='index')
    .rename(columns=lambda position: f'prediction{position + 1}')
    .rename_axis('Publisher')
    .reset_index()
)
df_predictions

Unnamed: 0,Publisher,prediction1,prediction2,prediction3,prediction4,prediction5
0,Activision,-1.023599,32.571526,-1.710099,4.904506,5.453113
1,Nintendo,74.307173,81.601085,87.109365,71.02628,35.447581
2,Electronic Arts,108.528945,95.682671,14.199076,19.565238,
3,Sony Computer Entertainment,16.42889,28.407351,39.027552,49.244994,
4,Ubisoft,-2.535053,22.116794,7.17047,0.069163,


In [40]:
# dictionary to dataframe: one row per publisher with its MAE
df_MAE = (
    pd.DataFrame.from_dict(MAE_linear_reg, orient='index')
    .reset_index()
    .rename(columns={'index': 'Publisher', 0: 'MAE'})
)
df_MAE

Unnamed: 0,Publisher,MAE
0,Activision,18.964156
1,Nintendo,31.416551
2,Electronic Arts,21.693864
3,Sony Computer Entertainment,16.526021
4,Ubisoft,14.074656


In [42]:
# Join the per-publisher MAE with the per-publisher predictions on the shared 'Publisher' column
df_final_results = df_MAE.merge(df_predictions, on = 'Publisher')
df_final_results

Unnamed: 0,Publisher,MAE,prediction1,prediction2,prediction3,prediction4,prediction5
0,Activision,18.964156,-1.023599,32.571526,-1.710099,4.904506,5.453113
1,Nintendo,31.416551,74.307173,81.601085,87.109365,71.02628,35.447581
2,Electronic Arts,21.693864,108.528945,95.682671,14.199076,19.565238,
3,Sony Computer Entertainment,16.526021,16.42889,28.407351,39.027552,49.244994,
4,Ubisoft,14.074656,-2.535053,22.116794,7.17047,0.069163,


In [45]:
# Save the combined MAE/prediction table as CSV without the row index.
# NOTE(review): the relative target directory must already exist — confirm before a fresh run.
df_final_results.to_csv('../../analysis/predictions/df_predictions_linearreg_standard.csv', index=False)