# Capstone Jupyter Notebook 5 - Inspecting Individual Genres

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

%matplotlib inline

plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['font.size'] = 14

### Add genre to the main dataset by left-joining it on the song track id with a dataset that includes genres. The new set includes all the songs in the main dataset; songs from the other set will not be added. If a song from the main table doesn't match any song in the other table, its genre is left blank.

In [2]:
songs1 = pd.read_csv('./Spotify_Youtube_Clean.csv', index_col='ID')
songs2 = pd.read_csv('./Final_database_Unique.csv')

In [3]:
songs_merged = pd.merge(songs1, songs2, how='left', on='Track_id')

### Save the new table to a CSV file. It will be opened in Excel, where more genres can be added, unnecessary columns can be dropped, and duplicate songs can be removed.

In [4]:
songs_merged.to_csv('Combined_Tables.csv')

### Download the cleaned dataset

In [4]:
songs = pd.read_csv('Spotify_Youtube_Genre_Unique.csv', index_col='ID')

In [5]:
songs.Track_id.nunique(), songs.shape[0]

(11695, 11695)

In [6]:
songs.isnull().sum()

Track                 0
Artist                0
Track_id              0
Url_spotify           0
Album                 0
Album_type            0
Uri                   0
Genre_spotify         0
Genre                 0
Danceability          0
Energy                0
Key                   0
Loudness              0
Speechiness           0
Acousticness          0
Instrumentalness      0
Liveness              0
Valence               0
Tempo                 0
Duration_ms           0
Stream              254
Url_youtube         220
Title               220
Channel             220
Views               220
Likes               255
Comments            276
Description         427
Licensed            220
official_video      220
dtype: int64

In [7]:
drop_columns = list(songs.columns[-9:])
print(drop_columns)

['Url_youtube', 'Title', 'Channel', 'Views', 'Likes', 'Comments', 'Description', 'Licensed', 'official_video']


In [8]:
songs.Key.value_counts()

0     1392
7     1234
1     1220
2     1125
9     1101
5      994
11     947
8      853
4      845
6      824
10     806
3      354
Name: Key, dtype: int64

### Replace the `Key` column with dummy columns, and drop `Key_0` because it the most common key.

In [9]:
key_dummies = pd.get_dummies(songs.Key, prefix='Key', drop_first=True)
songs = pd.concat([songs, key_dummies], axis=1)
songs.drop('Key', axis=1, inplace=True)
songs.columns

Index(['Track', 'Artist', 'Track_id', 'Url_spotify', 'Album', 'Album_type',
       'Uri', 'Genre_spotify', 'Genre', 'Danceability', 'Energy', 'Loudness',
       'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness',
       'Valence', 'Tempo', 'Duration_ms', 'Stream', 'Url_youtube', 'Title',
       'Channel', 'Views', 'Likes', 'Comments', 'Description', 'Licensed',
       'official_video', 'Key_1', 'Key_2', 'Key_3', 'Key_4', 'Key_5', 'Key_6',
       'Key_7', 'Key_8', 'Key_9', 'Key_10', 'Key_11'],
      dtype='object')

In [10]:
songs.Genre.value_counts()

pop                 3191
rock                1446
latin               1423
hip hop             1264
else                1010
indie                482
rap                  417
dance/electronic     393
metal                380
country              354
r&b/soul             345
k-pop                238
funk                 216
house                177
boy band              89
reggaeton             69
reggae                60
trap                  43
opm                   40
jazz                  39
bolero                19
Name: Genre, dtype: int64

### Add dummy columns for each genre, and drop `Genre_pop` because it the most common genre. Leave the original `Genre` column because it will still be needed later.

In [37]:
genre_dummies = pd.get_dummies(songs.Genre, prefix='Genre')
songs = pd.concat([songs, genre_dummies], axis=1)
songs.drop('Genre_pop', axis=1, inplace=True)

In [12]:
songs.isnull().sum()

Track                       0
Artist                      0
Track_id                    0
Url_spotify                 0
Album                       0
Album_type                  0
Uri                         0
Genre_spotify               0
Genre                       0
Danceability                0
Energy                      0
Loudness                    0
Speechiness                 0
Acousticness                0
Instrumentalness            0
Liveness                    0
Valence                     0
Tempo                       0
Duration_ms                 0
Stream                    254
Url_youtube               220
Title                     220
Channel                   220
Views                     220
Likes                     255
Comments                  276
Description               427
Licensed                  220
official_video            220
Key_1                       0
Key_2                       0
Key_3                       0
Key_4                       0
Key_5     

In [13]:
songs_streams = pd.DataFrame(songs.dropna(subset=['Stream']))
songs_streams.drop(list(songs_streams.columns[20:29]), axis=1, inplace=True)
songs_streams.isnull().sum()

Track                     0
Artist                    0
Track_id                  0
Url_spotify               0
Album                     0
Album_type                0
Uri                       0
Genre_spotify             0
Genre                     0
Danceability              0
Energy                    0
Loudness                  0
Speechiness               0
Acousticness              0
Instrumentalness          0
Liveness                  0
Valence                   0
Tempo                     0
Duration_ms               0
Stream                    0
Key_1                     0
Key_2                     0
Key_3                     0
Key_4                     0
Key_5                     0
Key_6                     0
Key_7                     0
Key_8                     0
Key_9                     0
Key_10                    0
Key_11                    0
Genre_bolero              0
Genre_boy band            0
Genre_country             0
Genre_dance/electronic    0
Genre_else          

In [38]:
songs_streams.dtypes

Track                      object
Artist                     object
Track_id                   object
Url_spotify                object
Album                      object
Album_type                 object
Uri                        object
Genre_spotify              object
Genre                      object
Danceability              float64
Energy                    float64
Loudness                  float64
Speechiness               float64
Acousticness              float64
Instrumentalness          float64
Liveness                  float64
Valence                   float64
Tempo                     float64
Duration_ms                 int64
Stream                      int64
Key_1                       uint8
Key_2                       uint8
Key_3                       uint8
Key_4                       uint8
Key_5                       uint8
Key_6                       uint8
Key_7                       uint8
Key_8                       uint8
Key_9                       uint8
Key_10        

In [15]:
# Change Stream data type to int64. (int (int32) is not large enough.)
songs_streams['Stream'] = (songs_streams.Stream).astype('int64')
songs_streams.dtypes

Track                      object
Artist                     object
Track_id                   object
Url_spotify                object
Album                      object
Album_type                 object
Uri                        object
Genre_spotify              object
Genre                      object
Danceability              float64
Energy                    float64
Loudness                  float64
Speechiness               float64
Acousticness              float64
Instrumentalness          float64
Liveness                  float64
Valence                   float64
Tempo                     float64
Duration_ms                 int64
Stream                      int64
Key_1                       uint8
Key_2                       uint8
Key_3                       uint8
Key_4                       uint8
Key_5                       uint8
Key_6                       uint8
Key_7                       uint8
Key_8                       uint8
Key_9                       uint8
Key_10        

### This function takes a DataFrame, a list of input columns, and a regression type as inputs, creates a model to predict the number of streams a song has, and returns the RMSE of the model.

In [16]:
def get_rmse(df, x_cols, y_col, reg):
    X = df[x_cols]
    y = df[y_col]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    
    reg.fit(X_train, y_train)
    
    preds = reg.predict(X_test)
    return np.sqrt(mean_squared_error(y_test, preds))

In [17]:
# Three regression types in the scikit-learn library
lin_reg = LinearRegression()
rand_for_reg = RandomForestRegressor()
kn_reg = KNeighborsRegressor()

In [18]:
feature_columns = list(songs_streams.describe().columns)
feature_columns.remove('Stream')
feature_columns

['Danceability',
 'Energy',
 'Loudness',
 'Speechiness',
 'Acousticness',
 'Instrumentalness',
 'Liveness',
 'Valence',
 'Tempo',
 'Duration_ms',
 'Key_1',
 'Key_2',
 'Key_3',
 'Key_4',
 'Key_5',
 'Key_6',
 'Key_7',
 'Key_8',
 'Key_9',
 'Key_10',
 'Key_11',
 'Genre_bolero',
 'Genre_boy band',
 'Genre_country',
 'Genre_dance/electronic',
 'Genre_else',
 'Genre_funk',
 'Genre_hip hop',
 'Genre_house',
 'Genre_indie',
 'Genre_jazz',
 'Genre_k-pop',
 'Genre_latin',
 'Genre_metal',
 'Genre_opm',
 'Genre_r&b/soul',
 'Genre_rap',
 'Genre_reggae',
 'Genre_reggaeton',
 'Genre_rock',
 'Genre_trap']

### Use the average number of streams in the test data as a baseline

In [19]:
X = songs_streams[feature_columns]
y = songs_streams.Stream

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

y_test_mean = y_test.mean()
print('Average number of streams in test data =', round(y_test_mean))

Average number of streams in test data = 183072091


In [20]:
baseline_series = [y_test_mean] * len(y_test)
baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_series))
best_rmse = baseline_rmse
print('Baseline RMSE =', round(best_rmse))

Baseline RMSE = 284295332


In [21]:
# This function returns and prints the smaller RMSE.
def new_best_rmse(rmse_new, rmse_old):
    print('RMSE for this model =', round(rmse_new))
    if rmse_new < rmse_old:
        print('This is now the best RMSE.')
        return rmse_new
    print('The best RMSE is still', round(rmse_old))
    return rmse_old

In [22]:
rmse = get_rmse(songs_streams, feature_columns, 'Stream', lin_reg)
best_rmse = new_best_rmse(rmse, best_rmse)

RMSE for this model = 277822397
This is now the best RMSE.


In [23]:
rmse = get_rmse(songs_streams, feature_columns, 'Stream', rand_for_reg)
best_rmse = new_best_rmse(rmse, best_rmse)

RMSE for this model = 282857434
The best RMSE is still 277822397


In [24]:
rmse = get_rmse(songs_streams, feature_columns, 'Stream', kn_reg)
best_rmse = new_best_rmse(rmse, best_rmse)

RMSE for this model = 305169985
The best RMSE is still 277822397


In [25]:
kn_reg = KNeighborsRegressor(n_neighbors=50)
rmse = get_rmse(songs_streams, feature_columns, 'Stream', kn_reg)
best_rmse = new_best_rmse(rmse, best_rmse)

RMSE for this model = 283659490
The best RMSE is still 277822397


In [26]:
songs2 = songs_streams[(songs_streams.Stream < 1.5e9) & (songs.Tempo > 0) & (songs.Tempo < 230)]
rmse = get_rmse(songs2, feature_columns, 'Stream', lin_reg)
best_rmse = new_best_rmse(rmse, best_rmse)

RMSE for this model = 231250408
This is now the best RMSE.


  songs2 = songs_streams[(songs_streams.Stream < 1.5e9) & (songs.Tempo > 0) & (songs.Tempo < 230)]


### The best model is a linear regression with some outliers removed and the genre included.

In [27]:
feature_columns_2 = feature_columns[0:21]
feature_columns_2

['Danceability',
 'Energy',
 'Loudness',
 'Speechiness',
 'Acousticness',
 'Instrumentalness',
 'Liveness',
 'Valence',
 'Tempo',
 'Duration_ms',
 'Key_1',
 'Key_2',
 'Key_3',
 'Key_4',
 'Key_5',
 'Key_6',
 'Key_7',
 'Key_8',
 'Key_9',
 'Key_10',
 'Key_11']

In [28]:
rmse = get_rmse(songs2, feature_columns_2, 'Stream', lin_reg)
best_rmse = new_best_rmse(rmse, best_rmse)

RMSE for this model = 234341760
The best RMSE is still 231250408


In [29]:
round(100 * (rmse - best_rmse) / rmse, 2)

1.32

### A regression with genres has a smaller RMSE that one without genres using the same dataset, but by only 1.32%

### Now separate the data into different sets based on genre and create a linear regresion model for each set.

In [30]:
genres = np.sort(songs2.Genre.unique())
genres

array(['bolero', 'boy band', 'country', 'dance/electronic', 'else',
       'funk', 'hip hop', 'house', 'indie', 'jazz', 'k-pop', 'latin',
       'metal', 'opm', 'pop', 'r&b/soul', 'rap', 'reggae', 'reggaeton',
       'rock', 'trap'], dtype=object)

In [36]:
for genre in genres:
    song_genre = songs2[songs.Genre == genre]
    genre_rmse = get_rmse(song_genre, feature_columns_2, 'Stream', lin_reg)
    print(f'{genre}:    \t{song_genre.shape[0]} songs,\tRMSE = {round(genre_rmse)},\t{round(100 * (best_rmse - genre_rmse) / best_rmse, 2)}% better than best RSME')

  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]


bolero:    	18 songs,	RMSE = 130292944,	43.66% better than best RSME
boy band:    	89 songs,	RMSE = 183340415,	20.72% better than best RSME
country:    	354 songs,	RMSE = 138432146,	40.14% better than best RSME
dance/electronic:    	373 songs,	RMSE = 188574379,	18.45% better than best RSME
else:    	958 songs,	RMSE = 126107412,	45.47% better than best RSME
funk:    	211 songs,	RMSE = 110805127,	52.08% better than best RSME
hip hop:    	1251 songs,	RMSE = 226590448,	2.02% better than best RSME
house:    	166 songs,	RMSE = 328642424,	-42.12% better than best RSME
indie:    	473 songs,	RMSE = 251371125,	-8.7% better than best RSME
jazz:    	39 songs,	RMSE = 141777413,	38.69% better than best RSME
k-pop:    	233 songs,	RMSE = 182343920,	21.15% better than best RSME
latin:    	1397 songs,	RMSE = 147609993,	36.17% better than best RSME
metal:    	375 songs,	RMSE = 199992892,	13.52% better than best RSME
opm:    	40 songs,	RMSE = 72916273,	68.47% better than best RSME
pop:    	3098 songs,	RMS

  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]
  song_genre = songs2[songs.Genre == genre]


### The models for some genres have lower RMSEs than the model for the entire set of songs, but other genres have higher RMSEs.