# VideoGames Sales Prediction

### Overview :- 
The gaming industry is certainly one of the thriving industries of the modern age and one of those that are most influenced by the advancement in technology. With the availability of technologies like AR/VR in consumer products like gaming consoles and even smartphones, the gaming sector shows great potential. In this main objective is to predict the sales of video games depending on given factors. 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [3]:
train.head()

Unnamed: 0,ID,CONSOLE,YEAR,CATEGORY,PUBLISHER,RATING,CRITICS_POINTS,USER_POINTS,SalesInMillions
0,2860,ds,2008,role-playing,Nintendo,E,2.833333,0.303704,1.779257
1,731,wii,2012,simulation,Konami Digital Entertainment,E10+,13.2,1.64,0.21505
2,495,pc,2019,shooter,Activision,M,4.5625,0.00641,0.534402
3,2641,ps2,2002,sports,Electronic Arts,E,4.181818,0.326923,1.383964
4,811,ps3,2013,action,Activision,M,2.259259,0.032579,0.082671


In [4]:
train.shape

(3506, 9)

In [5]:
test.head()

Unnamed: 0,ID,CONSOLE,YEAR,CATEGORY,PUBLISHER,RATING,CRITICS_POINTS,USER_POINTS
0,4310,ps2,2008,action,Tecmo Koei,T,3.928571,0.482353
1,4011,psp,2007,strategy,Atari,E10+,5.538462,0.071795
2,2185,ps2,2004,shooter,Electronic Arts,T,3.034483,0.062044
3,1644,x,2006,action,Electronic Arts,E,2.913043,0.88
4,188,3ds,2011,racing,Ubisoft,E10+,1.162162,0.183333


In [6]:
train.isnull().sum()

ID                 0
CONSOLE            0
YEAR               0
CATEGORY           0
PUBLISHER          0
RATING             0
CRITICS_POINTS     0
USER_POINTS        0
SalesInMillions    0
dtype: int64

In [7]:
train.dtypes

ID                   int64
CONSOLE             object
YEAR                 int64
CATEGORY            object
PUBLISHER           object
RATING              object
CRITICS_POINTS     float64
USER_POINTS        float64
SalesInMillions    float64
dtype: object

In [8]:
train['CONSOLE'].value_counts()

ps2     585
x360    448
ps3     438
pc      315
x       266
wii     254
ds      246
psp     180
gc      164
gba     129
ps4     121
xone     85
3ds      84
ps       83
psv      59
wiiu     45
dc        4
Name: CONSOLE, dtype: int64

In [9]:
list_console = list(train['CONSOLE'].value_counts().index)

In [10]:
#list_console

In [11]:
train['CATEGORY'].value_counts()

action          845
sports          464
shooter         441
role-playing    380
racing          296
misc            220
platform        218
fighting        187
simulation      150
strategy        132
adventure       114
puzzle           59
Name: CATEGORY, dtype: int64

In [12]:
list_category = list(train['CATEGORY'].value_counts().index)

In [13]:
#list_category

In [14]:
train['RATING'].value_counts()

T       1182
E       1061
M        749
E10+     511
RP         2
K-A        1
Name: RATING, dtype: int64

In [15]:
list_rating = list(train['RATING'].value_counts().index)

In [16]:
list_rating

['T', 'E', 'M', 'E10+', 'RP', 'K-A']

In [17]:
list_publisher = list(train['PUBLISHER'].value_counts().index)

In [18]:
len(list_publisher)

204

In [19]:
lb = LabelEncoder()

In [20]:
train.dtypes

ID                   int64
CONSOLE             object
YEAR                 int64
CATEGORY            object
PUBLISHER           object
RATING              object
CRITICS_POINTS     float64
USER_POINTS        float64
SalesInMillions    float64
dtype: object

In [21]:
train['CONSOLE'] = lb.fit_transform(train['CONSOLE'])
train['CATEGORY'] = lb.fit_transform(train['CATEGORY'])
train['PUBLISHER'] = lb.fit_transform(train['PUBLISHER'])
train['RATING'] = lb.fit_transform(train['RATING'])

In [22]:
train.head()

Unnamed: 0,ID,CONSOLE,YEAR,CATEGORY,PUBLISHER,RATING,CRITICS_POINTS,USER_POINTS,SalesInMillions
0,2860,2,2008,7,121,0,2.833333,0.303704,1.779257
1,731,12,2012,9,95,1,13.2,1.64,0.21505
2,495,5,2019,8,7,3,4.5625,0.00641,0.534402
3,2641,7,2002,10,49,0,4.181818,0.326923,1.383964
4,811,8,2013,0,7,3,2.259259,0.032579,0.082671


In [23]:
train.corr()

Unnamed: 0,ID,CONSOLE,YEAR,CATEGORY,PUBLISHER,RATING,CRITICS_POINTS,USER_POINTS,SalesInMillions
ID,1.0,-0.016937,-0.053616,-0.009618,0.01991,0.014772,-0.00524,0.025612,-0.021986
CONSOLE,-0.016937,1.0,0.2074,-0.002504,-0.025594,0.107277,-0.160852,-0.104009,0.039715
YEAR,-0.053616,0.2074,1.0,-0.111089,0.058229,0.037402,-0.040943,-0.330024,0.015476
CATEGORY,-0.009618,-0.002504,-0.111089,1.0,-0.121006,-0.167508,-0.015492,0.014426,0.026411
PUBLISHER,0.01991,-0.025594,0.058229,-0.121006,1.0,-0.0264,0.002519,-0.054574,-0.006928
RATING,0.014772,0.107277,0.037402,-0.167508,-0.0264,1.0,-0.184619,-0.16188,-0.05818
CRITICS_POINTS,-0.00524,-0.160852,-0.040943,-0.015492,0.002519,-0.184619,1.0,0.357396,-0.102378
USER_POINTS,0.025612,-0.104009,-0.330024,0.014426,-0.054574,-0.16188,0.357396,1.0,-0.140307
SalesInMillions,-0.021986,0.039715,0.015476,0.026411,-0.006928,-0.05818,-0.102378,-0.140307,1.0


In [24]:
corr = train.corr().abs()
corr[corr ==1]=0
corr_cols = corr.max().sort_values(ascending=False)

In [25]:
corr_cols

USER_POINTS        0.357396
CRITICS_POINTS     0.357396
YEAR               0.330024
CONSOLE            0.207400
RATING             0.184619
CATEGORY           0.167508
SalesInMillions    0.140307
PUBLISHER          0.121006
ID                 0.053616
dtype: float64

In [26]:
len(list_rating)

6

In [27]:
train.shape

(3506, 9)

In [28]:
train.drop(['ID','PUBLISHER','CATEGORY'],axis=1,inplace=True)

In [29]:
train.head()

Unnamed: 0,CONSOLE,YEAR,RATING,CRITICS_POINTS,USER_POINTS,SalesInMillions
0,2,2008,0,2.833333,0.303704,1.779257
1,12,2012,1,13.2,1.64,0.21505
2,5,2019,3,4.5625,0.00641,0.534402
3,7,2002,0,4.181818,0.326923,1.383964
4,8,2013,3,2.259259,0.032579,0.082671


In [30]:
test.drop(['ID','PUBLISHER','CATEGORY'],axis=1,inplace=True)

In [31]:
test.dtypes

CONSOLE            object
YEAR                int64
RATING             object
CRITICS_POINTS    float64
USER_POINTS       float64
dtype: object

In [32]:
test['CONSOLE'] = lb.fit_transform(test['CONSOLE'])

In [33]:
test['RATING'] = lb.fit_transform(test['RATING'])

In [34]:
X = train.drop('SalesInMillions',axis=1)
y = train['SalesInMillions']

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.20)

In [37]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler

In [38]:
ms = MinMaxScaler()
x_train = ms.fit_transform(x_train)
x_test = ms.fit_transform(x_test)

In [39]:
from sklearn.linear_model import LinearRegression

In [40]:
lr = LinearRegression()
lr.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [41]:
y_pred = lr.predict(x_test)

In [42]:
#y_pred

In [43]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [44]:
r2Score2 = r2_score(y_test,y_pred)

In [45]:
r2Score2

0.03336806145842752

In [46]:
rmse2 = np.sqrt(mean_squared_error(y_test,y_pred))

In [47]:
rmse2

1.9375358356138397

In [48]:
test.head()

Unnamed: 0,CONSOLE,YEAR,RATING,CRITICS_POINTS,USER_POINTS
0,7,2008,4,3.928571,0.482353
1,10,2007,2,5.538462,0.071795
2,7,2004,4,3.034483,0.062044
3,14,2006,1,2.913043,0.88
4,0,2011,2,1.162162,0.183333


In [49]:
test_x = test.iloc[:,:]
test_x = ms.fit_transform(test_x)

In [50]:
final_prediction = lr.predict(test_x)

In [51]:
final_prediction

array([1.71861965, 2.33693258, 2.14935428, ..., 2.28308099, 1.85207254,
       2.30073033])

In [52]:
final_df = pd.DataFrame(final_prediction,columns=['SalesInMillions'])

In [53]:
final_df.to_csv('Submission1.csv',index=False)