##### importing essential libraries:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('dark_background')

### Step 1: Data Preprocessing:

#### Step 1.1: fetching the data from the dataset:

In [2]:
# Load the raw track metadata from the training CSV and preview its first rows.
train_csv_path = 'CSV_Files/train.csv'
df_meta_data = pd.read_csv(train_csv_path)
df_meta_data.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


##### fetching the shape of this dataset:

In [3]:
# (rows, columns) of the full dataset loaded from train.csv
df_meta_data.shape

(114000, 21)

#### Reducing datapoints for faster processing:

In this ML project, we consider only tracks from the top 10 genres, to speed up preprocessing and reduce the processing power required.

##### Finding out all the genres available in this dataset:

In [5]:
# List every distinct genre label present in the full dataset
df_meta_data['track_genre'].unique()

array(['acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient',
       'anime', 'black-metal', 'bluegrass', 'blues', 'brazil',
       'breakbeat', 'british', 'cantopop', 'chicago-house', 'children',
       'chill', 'classical', 'club', 'comedy', 'country', 'dance',
       'dancehall', 'death-metal', 'deep-house', 'detroit-techno',
       'disco', 'disney', 'drum-and-bass', 'dub', 'dubstep', 'edm',
       'electro', 'electronic', 'emo', 'folk', 'forro', 'french', 'funk',
       'garage', 'german', 'gospel', 'goth', 'grindcore', 'groove',
       'grunge', 'guitar', 'happy', 'hard-rock', 'hardcore', 'hardstyle',
       'heavy-metal', 'hip-hop', 'honky-tonk', 'house', 'idm', 'indian',
       'indie-pop', 'indie', 'industrial', 'iranian', 'j-dance', 'j-idol',
       'j-pop', 'j-rock', 'jazz', 'k-pop', 'kids', 'latin', 'latino',
       'malay', 'mandopop', 'metal', 'metalcore', 'minimal-techno', 'mpb',
       'new-age', 'opera', 'pagode', 'party', 'piano', 'pop-film', 'pop',
       'pow

The top 10 music genres are: pop,rock,hip-hop,electronic,jazz,classical,r-n-b,country,blues,reggae

In [6]:
# Genres retained for this project; rows are later filtered against this list.
top_10_genres = [
    'pop', 'rock', 'hip-hop', 'electronic', 'jazz',
    'classical', 'r-n-b', 'country', 'blues', 'reggae',
]

##### Fetching the subset of tracks that belong to the top 10 genres; this subset will be used in our ML project:

In [7]:
# Boolean mask: True for rows whose genre is one of the selected top-10 genres.
is_top_genre = df_meta_data['track_genre'].isin(top_10_genres)
df_meta_data1 = df_meta_data[is_top_genre]
df_meta_data1.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
8000,8000,5MAK1nd8R6PWnle1Q1WJvh,Everybody Loves an Outlaw,I See Red,I See Red,77,230613,False,0.509,0.448,...,-7.552,0,0.0357,0.00713,0.0137,0.244,0.221,156.909,3,blues
8001,8001,2tznHmp70DxMyr2XhWLOW0,Cage The Elephant,Melophobia,Cigarette Daydreams,79,208760,False,0.636,0.676,...,-3.442,1,0.0263,0.0807,0.0,0.0831,0.273,113.98,4,blues
8002,8002,7vguMCv8uVuZLiQJ156u3Z,Sam Tinnesz;Yacht Money,Play with Fire (feat. Yacht Money),Play with Fire (feat. Yacht Money),76,180690,False,0.573,0.539,...,-6.091,0,0.0322,0.0162,1.2e-05,0.105,0.39,75.012,4,blues
8003,8003,3dPQuX8Gs42Y7b454ybpMR,The White Stripes,Elephant,Seven Nation Army,84,232106,False,0.743,0.446,...,-7.807,0,0.0683,0.0039,0.118,0.337,0.278,123.596,4,blues
8004,8004,6zC0mpGYwbNTpk9SKwh08f,Eric Clapton,Slowhand 35th Anniversary (Super Deluxe),Wonderful Tonight,76,225026,False,0.572,0.214,...,-15.625,1,0.0293,0.649,0.129,0.125,0.485,95.542,4,blues


##### Fetching the shape of our dataset that will be used in our ML project:

In [8]:
# (rows, columns) of the subset restricted to the genres in top_10_genres
df_meta_data1.shape

(10000, 21)

#### Step 1.2: Data Cleaning

Removing the instances (rows) that contain null values.

In [9]:
# Drop every row that has a null in any column (these are dropna's defaults, spelled out).
df_meta_data1 = df_meta_data1.dropna(axis=0, how='any')

In [10]:
# Shape after dropping rows with null values; compare it with the shape
# before dropna() to see how many rows were removed.

df_meta_data1.shape

(10000, 21)

##### Resetting indices:

In [12]:
# Replace the index inherited from the full dataset with a fresh 0..n-1 index;
# drop=True discards the old index instead of keeping it as a column.
df_meta_data1=df_meta_data1.reset_index(drop=True)
# Preview the first rows to confirm the new index
df_meta_data1.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,8000,5MAK1nd8R6PWnle1Q1WJvh,Everybody Loves an Outlaw,I See Red,I See Red,77,230613,False,0.509,0.448,...,-7.552,0,0.0357,0.00713,0.0137,0.244,0.221,156.909,3,blues
1,8001,2tznHmp70DxMyr2XhWLOW0,Cage The Elephant,Melophobia,Cigarette Daydreams,79,208760,False,0.636,0.676,...,-3.442,1,0.0263,0.0807,0.0,0.0831,0.273,113.98,4,blues
2,8002,7vguMCv8uVuZLiQJ156u3Z,Sam Tinnesz;Yacht Money,Play with Fire (feat. Yacht Money),Play with Fire (feat. Yacht Money),76,180690,False,0.573,0.539,...,-6.091,0,0.0322,0.0162,1.2e-05,0.105,0.39,75.012,4,blues
3,8003,3dPQuX8Gs42Y7b454ybpMR,The White Stripes,Elephant,Seven Nation Army,84,232106,False,0.743,0.446,...,-7.807,0,0.0683,0.0039,0.118,0.337,0.278,123.596,4,blues
4,8004,6zC0mpGYwbNTpk9SKwh08f,Eric Clapton,Slowhand 35th Anniversary (Super Deluxe),Wonderful Tonight,76,225026,False,0.572,0.214,...,-15.625,1,0.0293,0.649,0.129,0.125,0.485,95.542,4,blues


##### Dropping unnecessary columns:

In [13]:
# Columns removed from the working frame; the remaining columns are the
# track name, the numeric audio attributes, popularity and the genre label.
columns_to_drop = [
    'Unnamed: 0', 'track_id', 'artists', 'album_name', 'duration_ms',
    'explicit', 'key', 'mode', 'time_signature',
]
df = df_meta_data1.drop(columns=columns_to_drop)
df

Unnamed: 0,track_name,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre
0,I See Red,77,0.509,0.448,-7.552,0.0357,0.00713,0.013700,0.2440,0.221,156.909,blues
1,Cigarette Daydreams,79,0.636,0.676,-3.442,0.0263,0.08070,0.000000,0.0831,0.273,113.980,blues
2,Play with Fire (feat. Yacht Money),76,0.573,0.539,-6.091,0.0322,0.01620,0.000012,0.1050,0.390,75.012,blues
3,Seven Nation Army,84,0.743,0.446,-7.807,0.0683,0.00390,0.118000,0.3370,0.278,123.596,blues
4,Wonderful Tonight,76,0.572,0.214,-15.625,0.0293,0.64900,0.129000,0.1250,0.485,95.542,blues
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Christmas All Over Again,0,0.440,0.838,-6.196,0.0423,0.09560,0.000000,0.4730,0.619,138.908,rock
9996,Christmas All Over Again,0,0.440,0.838,-6.196,0.0423,0.09560,0.000000,0.4730,0.619,138.908,rock
9997,Christmas All Over Again,0,0.440,0.838,-6.196,0.0423,0.09560,0.000000,0.4730,0.619,138.908,rock
9998,Christmas All Over Again,0,0.440,0.838,-6.196,0.0423,0.09560,0.000000,0.4730,0.619,138.908,rock


In [14]:
# (rows, columns) after dropping the unneeded columns
df.shape

(10000, 12)

##### Allocating a unique id to each song/track:

In [16]:
# Use the row index (reset earlier in the notebook) as an integer id per track.
df = df.assign(track_id=df.index)
df.head()

Unnamed: 0,track_name,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre,track_id
0,I See Red,77,0.509,0.448,-7.552,0.0357,0.00713,0.0137,0.244,0.221,156.909,blues,0
1,Cigarette Daydreams,79,0.636,0.676,-3.442,0.0263,0.0807,0.0,0.0831,0.273,113.98,blues,1
2,Play with Fire (feat. Yacht Money),76,0.573,0.539,-6.091,0.0322,0.0162,1.2e-05,0.105,0.39,75.012,blues,2
3,Seven Nation Army,84,0.743,0.446,-7.807,0.0683,0.0039,0.118,0.337,0.278,123.596,blues,3
4,Wonderful Tonight,76,0.572,0.214,-15.625,0.0293,0.649,0.129,0.125,0.485,95.542,blues,4


In [17]:
# (rows, columns) after adding the track_id column
df.shape

(10000, 13)

##### Appending track ids to track names to distinguish songs with similar names:

In [18]:
def convert_to_str(x):
    """Return the string representation of ``x``, exactly as ``str(x)`` gives it."""
    as_text = str(x)
    return as_text

In [19]:
# Temporary string copy of track_id, used to build the suffixed track name.
# astype(str) is the vectorised equivalent of calling str() on every element,
# so no per-row Python function call is needed.
df['track_id1'] = df['track_id'].astype(str)
df.head()

Unnamed: 0,track_name,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre,track_id,track_id1
0,I See Red,77,0.509,0.448,-7.552,0.0357,0.00713,0.0137,0.244,0.221,156.909,blues,0,0
1,Cigarette Daydreams,79,0.636,0.676,-3.442,0.0263,0.0807,0.0,0.0831,0.273,113.98,blues,1,1
2,Play with Fire (feat. Yacht Money),76,0.573,0.539,-6.091,0.0322,0.0162,1.2e-05,0.105,0.39,75.012,blues,2,2
3,Seven Nation Army,84,0.743,0.446,-7.807,0.0683,0.0039,0.118,0.337,0.278,123.596,blues,3,3
4,Wonderful Tonight,76,0.572,0.214,-15.625,0.0293,0.649,0.129,0.125,0.485,95.542,blues,4,4


In [20]:
# Suffix every track name with " - <track id>" so that tracks sharing a name
# remain distinguishable.
# NOTE: re-running this cell appends the suffix again; run it once per kernel.
original_names = df['track_name']
id_suffix = ' - ' + df['track_id1']
df['track_name'] = original_names + id_suffix
df.head()

Unnamed: 0,track_name,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre,track_id,track_id1
0,I See Red - 0,77,0.509,0.448,-7.552,0.0357,0.00713,0.0137,0.244,0.221,156.909,blues,0,0
1,Cigarette Daydreams - 1,79,0.636,0.676,-3.442,0.0263,0.0807,0.0,0.0831,0.273,113.98,blues,1,1
2,Play with Fire (feat. Yacht Money) - 2,76,0.573,0.539,-6.091,0.0322,0.0162,1.2e-05,0.105,0.39,75.012,blues,2,2
3,Seven Nation Army - 3,84,0.743,0.446,-7.807,0.0683,0.0039,0.118,0.337,0.278,123.596,blues,3,3
4,Wonderful Tonight - 4,76,0.572,0.214,-15.625,0.0293,0.649,0.129,0.125,0.485,95.542,blues,4,4


In [21]:
# The string id column was only a helper for building track_name; remove it.
df = df.drop(columns=['track_id1'])
df.head()

Unnamed: 0,track_name,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre,track_id
0,I See Red - 0,77,0.509,0.448,-7.552,0.0357,0.00713,0.0137,0.244,0.221,156.909,blues,0
1,Cigarette Daydreams - 1,79,0.636,0.676,-3.442,0.0263,0.0807,0.0,0.0831,0.273,113.98,blues,1
2,Play with Fire (feat. Yacht Money) - 2,76,0.573,0.539,-6.091,0.0322,0.0162,1.2e-05,0.105,0.39,75.012,blues,2
3,Seven Nation Army - 3,84,0.743,0.446,-7.807,0.0683,0.0039,0.118,0.337,0.278,123.596,blues,3
4,Wonderful Tonight - 4,76,0.572,0.214,-15.625,0.0293,0.649,0.129,0.125,0.485,95.542,blues,4


##### Evaluating track_genre column:

In [22]:
# Number of tracks per genre, to check how balanced the genres are
df['track_genre'].value_counts()

blues         1000
classical     1000
country       1000
electronic    1000
hip-hop       1000
jazz          1000
pop           1000
r-n-b         1000
reggae        1000
rock          1000
Name: track_genre, dtype: int64

#### Step 1.3: Using Label Encoder to convert the track_genre's string values to numerical data:

##### importing necessary library and initiating Label Encoder:

In [23]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps each distinct label to an integer code; fitted in the next cell
le=LabelEncoder()

##### Converting the track_genre's string values into numerical values:

In [24]:
# Fit the encoder on the genre labels and store the resulting integer codes
# next to the original string column.
genre_codes = le.fit_transform(df['track_genre'])
df['track_genre_num'] = genre_codes
df.head()

Unnamed: 0,track_name,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre,track_id,track_genre_num
0,I See Red - 0,77,0.509,0.448,-7.552,0.0357,0.00713,0.0137,0.244,0.221,156.909,blues,0,0
1,Cigarette Daydreams - 1,79,0.636,0.676,-3.442,0.0263,0.0807,0.0,0.0831,0.273,113.98,blues,1,0
2,Play with Fire (feat. Yacht Money) - 2,76,0.573,0.539,-6.091,0.0322,0.0162,1.2e-05,0.105,0.39,75.012,blues,2,0
3,Seven Nation Army - 3,84,0.743,0.446,-7.807,0.0683,0.0039,0.118,0.337,0.278,123.596,blues,3,0
4,Wonderful Tonight - 4,76,0.572,0.214,-15.625,0.0293,0.649,0.129,0.125,0.485,95.542,blues,4,0


#### Step 1.4: Using MinMax algorithm to normalize all the values:

In [25]:
from sklearn.preprocessing import MinMaxScaler
# Scaler configured to rescale each fitted column linearly to the range 0..100
scaler=MinMaxScaler(feature_range=(0, 100))

In [26]:
# (rows, columns) after adding track_genre_num, checked before scaling
df.shape

(10000, 14)

##### shortlisting the columns (attributes) that are to be normalized:

In [27]:
# Numeric columns that will be passed through the MinMax scaler.
target_columns = [
    'popularity',
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'track_genre_num',
]

#### Step 1.4.2 Measuring the range of the column (attribute) values that are subjected to normalization: (Data Distribution)

This also shows us how the values of each input variable are distributed across the dataset, giving us a numerical picture of the data distribution.

In [28]:
# Print summary statistics for each column that is about to be scaled,
# with a blank line between columns.
for column_name in target_columns:
    column_summary = df[column_name].describe()
    print(column_summary)
    print()

count    10000.0000
mean        28.0877
std         29.4978
min          0.0000
25%          0.0000
50%         16.0000
75%         56.0000
max        100.0000
Name: popularity, dtype: float64

count    10000.000000
mean         0.593863
std          0.167522
min          0.000000
25%          0.483000
50%          0.606000
75%          0.718000
max          0.975000
Name: danceability, dtype: float64

count    10000.000000
mean         0.574818
std          0.244921
min          0.000071
25%          0.416000
50%          0.610500
75%          0.769000
max          0.997000
Name: energy, dtype: float64

count    10000.000000
mean        -8.710318
std          5.533403
min        -41.531000
25%        -10.115000
50%         -7.145500
75%         -5.222000
max          0.681000
Name: loudness, dtype: float64

count    10000.000000
mean         0.075867
std          0.078859
min          0.000000
25%          0.034700
50%          0.046200
75%          0.077900
max          0.924000
Name

From the above output, it is evident that different attributes have values in different ranges. Hence, feature scaling is necessary for better Machine Learning model performance. We need to normalize the values to a scale of 0 to 100.

##### Normalizing the data using MinMax scale:

In [29]:
# Fit the scaler on the selected columns and overwrite them with the
# rescaled (0..100) values.
scaled_values = scaler.fit_transform(df[target_columns])
df[target_columns] = scaled_values
df.head()

Unnamed: 0,track_name,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre,track_id,track_genre_num
0,I See Red - 0,77.0,52.205128,44.930905,80.496067,3.863636,0.714837,1.395112,23.511294,22.2334,64.472906,blues,0,0.0
1,Cigarette Daydreams - 1,79.0,65.230769,67.80113,90.232635,2.84632,8.101459,0.0,6.991786,27.464789,46.833654,blues,1,0.0
2,Play with Fire (feat. Yacht Money) - 2,76.0,58.769231,54.058933,83.957169,3.484848,1.625489,0.001181,9.240246,39.235412,30.821952,blues,2,0.0
3,Seven Nation Army - 3,84.0,76.205128,44.730289,79.891974,7.391775,0.390536,12.016293,33.059548,27.967807,50.784807,blues,3,0.0
4,Wonderful Tonight - 4,76.0,58.666667,21.458831,61.371174,3.170996,65.160282,13.136456,11.293634,48.792757,39.257597,blues,4,0.0


##### Verifying whether the data is normalized or not:

In [30]:
# Re-print the per-column summary statistics; after scaling, every column's
# min and max should read 0 and 100.
for scaled_column in target_columns:
    print(df[scaled_column].describe(), end='\n\n')

count    10000.0000
mean        28.0877
std         29.4978
min          0.0000
25%          0.0000
50%         16.0000
75%         56.0000
max        100.0000
Name: popularity, dtype: float64

count    10000.000000
mean        60.908993
std         17.181724
min          0.000000
25%         49.538462
50%         62.153846
75%         73.641026
max        100.000000
Name: danceability, dtype: float64

count    10000.000000
mean        57.651721
std         24.567561
min          0.000000
25%         41.721049
50%         61.230956
75%         77.129775
max        100.000000
Name: energy, dtype: float64

count    10000.000000
mean        77.752019
std         13.108601
min          0.000000
25%         74.424334
50%         81.459064
75%         86.015825
max        100.000000
Name: loudness, dtype: float64

count    10000.000000
mean         8.210746
std          8.534570
min          0.000000
25%          3.755411
50%          5.000000
75%          8.430736
max        100.000000
Name

The data is normalized

### End of Step 1:

In this step, we have shortlisted the input variables (model attributes) that will be used in our machine learning models, performed data preprocessing that includes Categorical Encoding and Feature Scaling

#### Saving the resulting dataframe as a pickle file and a .csv file

In [32]:
import os
# NOTE(review): joblib is not used in this cell; it is presumably needed by
# later cells, so confirm before removing the import.
import joblib

# Create the output directories if they are missing, so a fresh checkout can
# run this cell without to_pickle()/to_csv() failing on a non-existent folder.
os.makedirs('Artifacts', exist_ok=True)
os.makedirs('CSV_Files', exist_ok=True)

# Persist the preprocessed frame as a pickle and as a CSV without the index column.
df.to_pickle('Artifacts/unlabelled_dataframe.pkl')
df.to_csv('CSV_Files/unlabelled_data.csv',index=False)