In [1]:
################# Exercise Data Processing #################

import pandas as pd
# Load the songs dataset and report its (rows, columns) size.
csv_path = 'ml-03-data-processing-songs-dataset.csv'
df = pd.read_csv(csv_path)
df.shape

(1994, 16)

In [2]:
# Count the missing entries per column in the freshly loaded data
df.isna().sum()

Index                        0
Title                        3
Artist                       8
Top Genre                    8
Year                         7
Month                        0
Beats Per Minute (BPM)       9
Energy                       8
Danceability              1024
Loudness (dB)                7
Liveness                     8
Valence                   1034
Length (Duration)            9
Acousticness                 8
Speechiness                  4
Popularity                   7
dtype: int64

In [3]:
# Delete rows with 50% or more null values.
# The null count is computed for all rows at once and the offending rows are
# removed in a single drop. This does not depend on the index being 0..n-1,
# so the cell can be re-run safely (a second run simply finds nothing to drop).
null_counts_per_row = df.isna().sum(axis=1)
sparse_rows = df.index[null_counts_per_row >= len(df.columns) / 2]
df = df.drop(index=sparse_rows)
df.shape

(1981, 16)

In [4]:
# Re-check the per-column null counts now that sparse rows are gone
df.isna().sum()

Index                        0
Title                        0
Artist                       0
Top Genre                    0
Year                         0
Month                        0
Beats Per Minute (BPM)       0
Energy                       0
Danceability              1011
Loudness (dB)                0
Liveness                     0
Valence                   1022
Length (Duration)            0
Acousticness                 0
Speechiness                  0
Popularity                   0
dtype: int64

In [5]:
# Delete columns with 50% or more null values.
# The columns are selected from the data rather than hard-coded, so the rule
# in the comment above is what is actually applied, and re-running the cell
# is a no-op instead of a KeyError.
null_fraction_per_column = df.isna().mean()
sparse_columns = null_fraction_per_column[null_fraction_per_column >= 0.5].index
df = df.drop(columns=sparse_columns)
df.isna().sum()

Index                     0
Title                     0
Artist                    0
Top Genre                 0
Year                      0
Month                     0
Beats Per Minute (BPM)    0
Energy                    0
Loudness (dB)             0
Liveness                  0
Length (Duration)         0
Acousticness              0
Speechiness               0
Popularity                0
dtype: int64

In [6]:
# Descriptive Statistics for Each Column
# Rows were deleted above, so renumber the index to a contiguous 0..n-1 range
# (drop=True discards the old labels instead of keeping them as a column).
df = df.reset_index(drop=True)
# describe() summarises the numeric columns only (count, mean, std, quartiles).
df.describe()

Unnamed: 0,Index,Year,Beats Per Minute (BPM),Energy,Loudness (dB),Liveness,Acousticness,Speechiness,Popularity
count,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0
mean,996.035336,1959.439677,120.238768,59.64109,-9.018173,19.009086,28.933872,4.992428,59.499243
std,576.380399,251.024147,28.65264,22.177314,3.65213,16.737453,29.047404,4.411125,14.374101
min,1.0,92.0,15.0,3.0,-27.0,2.0,0.0,2.0,11.0
25%,496.0,1978.0,99.0,42.0,-11.0,9.0,3.0,3.0,49.0
50%,995.0,1994.0,119.0,61.0,-9.0,12.0,18.0,4.0,62.0
75%,1495.0,2007.0,136.0,78.0,-6.0,23.0,50.0,5.0,71.0
max,1994.0,2019.0,305.0,100.0,-2.0,99.0,99.0,55.0,100.0


In [7]:
# Find outliers in each column
import numpy as np
stats_cols = df.describe().columns

# A value is an outlier when it lies more than 3 standard deviations from the
# column mean. Mean and std are taken before any value of that column is
# replaced, and the test is applied to the whole column at once.
for col_name in stats_cols:
    column = df[col_name]
    mean = column.mean()
    three_std = 3 * column.std()
    is_outlier = (column > (mean + three_std)) | (column < (mean - three_std))
    outlier_list = [str(value) for value in column[is_outlier]]
    if outlier_list:
        # Replace outliers with NaN. Series.mask returns a new column (upcast
        # to float when needed), so no NaN is written into an integer column.
        df[col_name] = column.mask(is_outlier)
        outliers = ', '.join(outlier_list)
        print(f'{col_name} has {len(outlier_list)} outliers: {outliers}\n')
    else:
        print(f'{col_name} has no outliers!\n')



Index has no outliers!

Year has 35 outliers: 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0, 92.0

Beats Per Minute (BPM) has 4 outliers: 18.0, 305.0, 15.0, 297.0

Energy has no outliers!

Loudness (dB) has 22 outliers: -21.0, -20.0, -22.0, -22.0, -20.0, -20.0, -24.0, -20.0, -21.0, -22.0, -24.0, -27.0, -21.0, -21.0, -22.0, -21.0, -21.0, -20.0, -21.0, -22.0, -22.0, -21.0

Liveness has 57 outliers: 81.0, 70.0, 93.0, 79.0, 70.0, 97.0, 70.0, 83.0, 72.0, 97.0, 97.0, 87.0, 96.0, 71.0, 99.0, 92.0, 78.0, 95.0, 71.0, 90.0, 81.0, 95.0, 87.0, 91.0, 85.0, 89.0, 76.0, 73.0, 72.0, 85.0, 73.0, 86.0, 76.0, 73.0, 82.0, 77.0, 97.0, 85.0, 99.0, 78.0, 98.0, 96.0, 93.0, 85.0, 94.0, 77.0, 91.0, 96.0, 84.0, 97.0, 76.0, 73.0, 92.0, 80.0, 76.0, 83.0, 76.0

Acousticness has no outliers!

Speechiness has 42 outliers: 25.0, 26.0, 30.0, 23.0, 24.0, 19.0, 24.0,

In [8]:
# Outliers were replaced with NaN, so they now show up as per-column null counts
df.isna().sum()

Index                      0
Title                      0
Artist                     0
Top Genre                  0
Year                      35
Month                      0
Beats Per Minute (BPM)     4
Energy                     0
Loudness (dB)             22
Liveness                  57
Length (Duration)          0
Acousticness               0
Speechiness               42
Popularity                 9
dtype: int64

In [9]:
# Year is treated as categorical, so its gaps are filled with the most frequent year.
from sklearn.impute import SimpleImputer
mode_impute = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
year_frame = df[['Year']]
df['Year'] = mode_impute.fit(year_frame).transform(year_frame)

# Confirm that Year no longer contains NaN
df.isna().sum()

Index                      0
Title                      0
Artist                     0
Top Genre                  0
Year                       0
Month                      0
Beats Per Minute (BPM)     4
Energy                     0
Loudness (dB)             22
Liveness                  57
Length (Duration)          0
Acousticness               0
Speechiness               42
Popularity                 9
dtype: int64

In [10]:
# The remaining gaps are in continuous columns, so they are filled with the column mean.
continuous_cols = ['Beats Per Minute (BPM)','Loudness (dB)','Liveness','Speechiness','Popularity']
mean_impute = SimpleImputer(strategy='mean', missing_values=np.nan)
df[continuous_cols] = mean_impute.fit(df[continuous_cols]).transform(df[continuous_cols])

# Confirm that no NaN values remain in any column
df.isna().sum()

Index                     0
Title                     0
Artist                    0
Top Genre                 0
Year                      0
Month                     0
Beats Per Minute (BPM)    0
Energy                    0
Loudness (dB)             0
Liveness                  0
Length (Duration)         0
Acousticness              0
Speechiness               0
Popularity                0
dtype: int64

In [11]:
# Combine Month and Year Columns into Date Column
import datetime
df['Date'] = df['Month'] + '-' + df['Year'].astype(int).astype(str)
# Let pandas infer the format itself; the infer_datetime_format flag is
# deprecated in pandas 2.x.
df['Date'] = pd.to_datetime(df['Date'])


# Convert Today's Date into a Timestamp object
# (Age therefore depends on the day the notebook is run.)
todays_date_timestamp = pd.to_datetime(datetime.datetime.now())


# Create Age Column by Subtracting df.Date from Today's Date
# .dt.days gives the whole number of days in each difference; dividing by 365
# turns that into an age in years. (.astype('timedelta64[D]') is rejected by
# pandas >= 2.0, which only supports s/ms/us/ns resolutions.)
df['Age'] = (todays_date_timestamp - df['Date']).dt.days / 365


# Drop Every Time-related column except Age
df = df.drop(columns=['Date','Month','Year'])

df.columns

Index(['Index', 'Title', 'Artist', 'Top Genre', 'Beats Per Minute (BPM)',
       'Energy', 'Loudness (dB)', 'Liveness', 'Length (Duration)',
       'Acousticness', 'Speechiness', 'Popularity', 'Age'],
      dtype='object')

In [12]:
# Show the first five rows, including the new Age column
df.head()

Unnamed: 0,Index,Title,Artist,Top Genre,Beats Per Minute (BPM),Energy,Loudness (dB),Liveness,Length (Duration),Acousticness,Speechiness,Popularity,Age
0,1,Sunrise,Norah Jones,adult standards,157.0,30.0,-14.0,11.0,201,94.0,3.0,71.0,17.578082
1,2,Black Night,Deep Purple,album rock,135.0,79.0,-11.0,17.0,207,17.0,7.0,39.0,22.167123
2,3,Clint Eastwood,Gorillaz,alternative hip hop,168.0,69.0,-9.0,7.0,341,2.0,17.0,69.0,20.915068
3,4,The Pretender,Foo Fighters,alternative metal,173.0,96.0,-4.0,3.0,269,0.0,4.0,76.0,15.326027
4,5,Waitin' On A Sunny Day,Bruce Springsteen,classic rock,106.0,82.0,-5.0,10.0,256,1.0,3.0,59.0,19.747945


In [13]:
# One-hot encode the text columns into dummy variables.
# drop_first=True keeps K-1 dummies per encoded column.
categorical_cols = ['Title','Artist','Top Genre']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df.head()

Unnamed: 0,Index,Beats Per Minute (BPM),Energy,Loudness (dB),Liveness,Length (Duration),Acousticness,Speechiness,Popularity,Age,...,Top Genre_reggae,Top Genre_reggae fusion,Top Genre_rock-and-roll,Top Genre_scottish singer-songwriter,Top Genre_soft rock,Top Genre_stomp and holler,Top Genre_streektaal,Top Genre_trance,Top Genre_uk pop,Top Genre_yacht rock
0,1,157.0,30.0,-14.0,11.0,201,94.0,3.0,71.0,17.578082,...,0,0,0,0,0,0,0,0,0,0
1,2,135.0,79.0,-11.0,17.0,207,17.0,7.0,39.0,22.167123,...,0,0,0,0,0,0,0,0,0,0
2,3,168.0,69.0,-9.0,7.0,341,2.0,17.0,69.0,20.915068,...,0,0,0,0,0,0,0,0,0,0
3,4,173.0,96.0,-4.0,3.0,269,0.0,4.0,76.0,15.326027,...,0,0,0,0,0,0,0,0,0,0
4,5,106.0,82.0,-5.0,10.0,256,1.0,3.0,59.0,19.747945,...,0,0,0,0,0,0,0,0,0,0


In [14]:
################# Exercise Model Training #################

# Popularity is the target; every other column is used as a feature
from sklearn.model_selection import train_test_split

# NOTE(review): X keeps the 'Index' row-id column - confirm it is meant to be a feature
y = df['Popularity']
X = df.drop(columns='Popularity')

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.30)

In [15]:
# Summary statistics of the numeric training features
X_train.describe()

Unnamed: 0,Index,Beats Per Minute (BPM),Energy,Loudness (dB),Liveness,Acousticness,Speechiness,Age,Title_(Everything I Do) I Do It For You,Title_(I Can't Get No) Satisfaction - Mono Version,...,Top Genre_reggae,Top Genre_reggae fusion,Top Genre_rock-and-roll,Top Genre_scottish singer-songwriter,Top Genre_soft rock,Top Genre_stomp and holler,Top Genre_streektaal,Top Genre_trance,Top Genre_uk pop,Top Genre_yacht rock
count,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,...,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0
mean,997.313853,119.71475,59.41342,-8.918948,17.13178,29.160173,4.49568,28.595038,0.000722,0.0,...,0.004329,0.002886,0.000722,0.000722,0.000722,0.000722,0.000722,0.000722,0.000722,0.0
std,577.096931,27.553025,22.060928,3.417642,12.529708,29.429331,2.497758,16.215911,0.026861,0.0,...,0.065676,0.053663,0.026861,0.026861,0.026861,0.026861,0.026861,0.026861,0.026861,0.0
min,1.0,54.0,3.0,-19.0,2.0,0.0,2.0,2.40274,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,501.25,99.0,43.0,-11.0,9.0,3.0,3.0,13.994521,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1004.5,119.0,61.0,-8.876978,12.0,18.0,4.0,27.005479,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1504.75,135.0,77.0,-6.0,21.0,51.0,5.0,42.931507,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1994.0,206.0,100.0,-2.0,69.0,98.0,18.0,64.112329,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [16]:
# Summary statistics of the numeric test features
X_test.describe()

Unnamed: 0,Index,Beats Per Minute (BPM),Energy,Loudness (dB),Liveness,Acousticness,Speechiness,Age,Title_(Everything I Do) I Do It For You,Title_(I Can't Get No) Satisfaction - Mono Version,...,Top Genre_reggae,Top Genre_reggae fusion,Top Genre_rock-and-roll,Top Genre_scottish singer-songwriter,Top Genre_soft rock,Top Genre_stomp and holler,Top Genre_streektaal,Top Genre_trance,Top Genre_uk pop,Top Genre_yacht rock
count,595.0,595.0,595.0,595.0,595.0,595.0,595.0,595.0,595.0,595.0,...,595.0,595.0,595.0,595.0,595.0,595.0,595.0,595.0,595.0,595.0
mean,993.057143,121.2,60.171429,-8.779212,16.924913,28.406723,4.457692,28.562712,0.0,0.001681,...,0.003361,0.0,0.0,0.001681,0.0,0.0,0.0,0.0,0.0,0.001681
std,575.181564,28.610916,22.455812,3.345946,11.658059,28.154922,2.406129,16.347556,0.0,0.040996,...,0.057928,0.0,0.0,0.040996,0.0,0.0,0.0,0.0,0.0,0.040996
min,3.0,37.0,5.0,-19.0,3.0,0.0,2.0,2.652055,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,485.0,100.5,42.0,-11.0,9.0,4.0,3.0,14.076712,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,968.0,119.0,61.0,-8.0,13.0,18.0,4.0,27.419178,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1475.0,137.5,80.0,-6.0,21.0,48.5,5.0,42.428767,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1993.0,197.0,99.0,-2.0,69.0,99.0,18.0,65.778082,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [17]:
# Distribution of the Popularity target in the training split
y_train.describe()

count    1386.000000
mean       59.693532
std        13.975619
min        17.000000
25%        50.000000
50%        62.000000
75%        71.000000
max       100.000000
Name: Popularity, dtype: float64

In [18]:
# Distribution of the Popularity target in the test split
y_test.describe()

count    595.000000
mean      59.734652
std       14.214528
min       17.000000
25%       50.000000
50%       62.000000
75%       70.000000
max       87.000000
Name: Popularity, dtype: float64