### Steps performed in this notebook
---
1. Importing the data
2. Data cleaning
3. Splitting the data into train and test
4. Model building
5. Training the model
6. Predicting the test data
7. Evaluating the model and improving it

Libraries used: pandas, numpy, matplotlib, seaborn, sklearnS

In [1]:
# importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Practicing some basic pandas functions on video game sales dataset

In [3]:
# importing the dataset
df = pd.read_csv('./data/video_games_sales/vgsales.csv')
df.head(4)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0


In [4]:
df.shape

(16598, 11)

In [6]:
df.describe()

Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,16598.0,16327.0,16598.0,16598.0,16598.0,16598.0,16598.0
mean,8300.605254,2006.406443,0.264667,0.146652,0.077782,0.048063,0.537441
std,4791.853933,5.828981,0.816683,0.505351,0.309291,0.188588,1.555028
min,1.0,1980.0,0.0,0.0,0.0,0.0,0.01
25%,4151.25,2003.0,0.0,0.0,0.0,0.0,0.06
50%,8300.5,2007.0,0.08,0.02,0.0,0.01,0.17
75%,12449.75,2010.0,0.24,0.11,0.04,0.04,0.47
max,16600.0,2020.0,41.49,29.02,10.22,10.57,82.74


In [8]:
df.describe(exclude=np.number)

Unnamed: 0,Name,Platform,Genre,Publisher
count,16598,16598,16598,16540
unique,11493,31,12,578
top,Need for Speed: Most Wanted,DS,Action,Electronic Arts
freq,12,2163,3316,1351


In [9]:
df.values

array([[1, 'Wii Sports', 'Wii', ..., 3.77, 8.46, 82.74],
       [2, 'Super Mario Bros.', 'NES', ..., 6.81, 0.77, 40.24],
       [3, 'Mario Kart Wii', 'Wii', ..., 3.79, 3.31, 35.82],
       ...,
       [16598, 'SCORE International Baja 1000: The Official Game', 'PS2',
        ..., 0.0, 0.0, 0.01],
       [16599, 'Know How 2', 'DS', ..., 0.0, 0.0, 0.01],
       [16600, 'Spirits & Spells', 'GBA', ..., 0.0, 0.0, 0.01]],
      dtype=object)

In [10]:
df.columns

Index(['Rank', 'Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales'],
      dtype='object')

In [11]:
df.index

RangeIndex(start=0, stop=16598, step=1)

#### Working with the actual data which we intend to use for Music Recommendation System

In [12]:
# importing the dataset
df = pd.read_csv('./data/music_data/music.csv')
df.head(4)

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz


In [13]:
df.shape

(18, 3)

In [14]:
X = df.drop(columns=['genre'], axis=1)
y = df['genre']
y

0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object

In [15]:
X.head()

Unnamed: 0,age,gender
0,20,1
1,23,1
2,25,1
3,26,1
4,29,1


In [34]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [35]:
# creating the model
model = DecisionTreeClassifier()

In [36]:
# training the model
model.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [37]:
# predicting the test data
y_pred = model.predict(X_test)


In [38]:
# evaluating the model
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_pred)

In [39]:
score

0.75

In [40]:
# saving the model
from joblib import dump, load

dump(model, './models/music_recommender.joblib')

['./models/music_recommender.joblib']

In [41]:
# loading the model
model = load('./models/music_recommender.joblib')


In [42]:
# predicting the test data
y_pred = model.predict(X_test)
y_pred

array(['Jazz', 'Classical', 'Dance', 'Classical'], dtype=object)

In [43]:
X_test

Unnamed: 0,age,gender
4,29,1
16,34,0
12,26,0
15,31,0


In [None]:
# evaluating the model
