# Prediction Model for Video Game Sales

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline

# Load the raw video-game sales table from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer="vgsales.csv")
df.head(n=5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


# Converting the categorical values in Platform, Genre, and Publisher into numerical values

In [2]:
from sklearn.preprocessing import LabelEncoder
# Replace each categorical column with integer codes. Values are cast to str
# first so that every entry is encoded as a string label. The encoder is
# refit for each column, so it ends up fitted on the last column processed.
number = LabelEncoder()
for col in ('Platform', 'Genre', 'Publisher'):
    df[col] = number.fit_transform(df[col].astype('str'))

In [3]:
# Preview the frame after the categorical columns have been encoded.
df.head(n=5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,26,2006.0,10,359,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,11,1985.0,4,359,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,26,2008.0,6,359,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,26,2009.0,10,359,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,5,1996.0,7,359,11.27,8.89,10.22,1.0,31.37


# Now that our dataset has been converted into numeric values, we can remove certain columns such as Rank, Name, and Year, as they are not good features

In [4]:
# Build a new frame without the Rank, Name and Year columns.
dff = df.drop(columns=['Rank', 'Name', 'Year'])

In [5]:
# Preview the reduced frame.
dff.head(n=5)

Unnamed: 0,Platform,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,26,10,359,41.49,29.02,3.77,8.46,82.74
1,11,4,359,29.08,3.58,6.81,0.77,40.24
2,26,6,359,15.85,12.88,3.79,3.31,35.82
3,26,10,359,15.75,11.01,3.28,2.96,33.0
4,5,7,359,11.27,8.89,10.22,1.0,31.37


# For experimental purposes, we only consider the first three features (Platform, Genre, Publisher)

In [9]:
# Drop the per-region sales columns, leaving the encoded features and the
# Global_Sales target, then preview the result.
regional_sales = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
df3 = dff.drop(columns=regional_sales)
df3.head(n=5)

Unnamed: 0,Platform,Genre,Publisher,Global_Sales
0,26,10,359,82.74
1,11,4,359,40.24
2,26,6,359,35.82
3,26,10,359,33.0
4,5,7,359,31.37


In [11]:
# Names of the columns used as model inputs.
columns = ["Platform", "Genre", "Publisher"]

# Pull the input matrix and the target vector out of the frame as NumPy arrays.
features = df3[columns].values
labels = df3["Global_Sales"].values

# Creating the model and passing the values

In [13]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

# Ordinary least-squares regression of Global_Sales on the encoded features.
regr = linear_model.LinearRegression()

X = features
y = labels

# Hold out 30% of the rows for testing. Fixing random_state makes the split,
# and therefore every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)
# Train the model using the training sets
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [14]:
# Report the fit quality on both splits.
# NOTE: for a regressor, .score() returns the coefficient of determination
# (R^2), not a classification accuracy; it is shown here multiplied by 100.
# print is called with one pre-formatted string so the cell runs unchanged on
# both Python 2 and Python 3 and emits the same text as the old print statement.
Accuracy = regr.score(X_train, y_train)
print("Accuracy in the training data:  {} %".format(Accuracy * 100))

accuracy = regr.score(X_test, y_test)
print("Accuracy in the test data {} %".format(accuracy * 100))

Accuracy in the training data:  0.157597920801 %
Accuracy in the test data 0.110121532305 %


# Very poor performance, so we try standardizing the features

In [16]:
from sklearn.preprocessing import StandardScaler

# Standardize the features (zero mean, unit variance per column).
# Alternative to try: sklearn.preprocessing.MinMaxScaler.
scaler = StandardScaler()

# Learn the scaling statistics from the training split only, then apply the
# same transformation to the test split so no test information leaks in.
# X_train / X_test are deliberately overwritten: later cells fit on the
# scaled arrays.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Refit the linear model on the standardized features.
regr.fit(X_train, y_train)

# .score() returns R^2 for a regressor; shown multiplied by 100.
# Single-argument print calls run on both Python 2 and Python 3.
Accuracy = regr.score(X_train, y_train)
print("Accuracy in the training data:  {} %".format(Accuracy * 100))

accuracy = regr.score(X_test, y_test)
print("Accuracy in the test data {} %".format(accuracy * 100))

Accuracy in the training data:  0.157597920801 %
Accuracy in the test data 0.110121532305 %


# Trying the Lasso model

In [17]:
# Lasso (L1-regularized linear regression) with regularization strength 0.1,
# fitted on the current X_train / y_train.
reg = linear_model.Lasso(alpha=0.1)

reg.fit(X_train, y_train)

# .score() returns R^2 for a regressor; shown multiplied by 100.
# NOTE(review): a training score of exactly 0 would suggest the penalty shrank
# every coefficient to zero (the model predicts the mean) - confirm via reg.coef_.
# Single-argument print calls run on both Python 2 and Python 3.
Accuracy = reg.score(X_train, y_train)
print("Accuracy in the training data:  {} %".format(Accuracy * 100))

accuracy = reg.score(X_test, y_test)
print("Accuracy in the test data {} %".format(accuracy * 100))

Accuracy in the training data:  0.0 %
Accuracy in the test data -0.0168082800485 %
