# Analysing Spotify Top 100 using iterative tools.

## Imports:

In [1]:
import os
import warnings
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

from sklearn.cluster import KMeans

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')

## Loading Data:

In [2]:
# Folder that holds the two CSV inputs used throughout the notebook.
INPUT_DIR = 'data/'

# Build each path first, then read the tables (Streams, then Features).
streams_path = os.path.join(INPUT_DIR, 'Streams.csv')
features_path = os.path.join(INPUT_DIR, 'Features.csv')

Streams = pd.read_csv(streams_path)
Features = pd.read_csv(features_path)


## Basic Info Schema:

In [3]:
print("Missing Attributes:")

# Count nulls per column, then reshape the Series into a frame whose rows are
# (column name, null count) so it can be rendered as a styled table.
null_counts = Features.isnull().sum()
featuresIsEmpty = null_counts.to_frame().reset_index()

table_style = {"background-color": "#212636", "color": "white", "border": "1.5px solid white"}
display(featuresIsEmpty.head(15).style.set_properties(**table_style))


Missing Attributes:


Unnamed: 0,index,0
0,id,0
1,name,0
2,duration,0
3,energy,0
4,key,0
5,loudness,0
6,mode,0
7,speechiness,0
8,acousticness,0
9,instrumentalness,0


In [4]:
## Duplicate check: count rows of Features that exactly repeat an earlier row.
duplicate_mask = Features.duplicated()
print(duplicate_mask.sum())

0


In [5]:
# Show the first rows and the column schema of both input tables.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
table_style = {"background-color": "#212636", "color": "white", "border": "1.5px solid white"}

print('\n' + '='*60,)
print('Features.csv')
display(Features.head(5).style.set_properties(**table_style))
Features.info()

print('\n\n' + '='*60,)
print('Streams.csv')
display(Streams.head(5).style.set_properties(**table_style))
Streams.info()



Features.csv


Unnamed: 0,id,name,duration,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,danceability
0,0VjIjW4GlUZAMYd2vXMi3b,Blinding Lights,3.33,0.73,1,-5.934,1,0.0598,0.00146,9.5e-05,0.0897,0.334,171.005,0.514
1,7qiZfU4dY1lWllzX7mPBI3,Shape of You,3.9,0.652,1,-3.183,0,0.0802,0.581,0.0,0.0931,0.931,95.977,0.825
2,2XU0oxnq2qxCpomAAuJY8K,Dance Monkey,3.49,0.588,6,-6.4,0,0.0924,0.692,0.000104,0.149,0.513,98.027,0.824
3,7qEHsqek33rTcFNT9PFqLf,Someone You Loved,3.04,0.405,1,-5.679,1,0.0319,0.751,0.0,0.105,0.446,109.891,0.501
4,0e7ipj03S05BNilyu5bRzt,Rockstar,3.64,0.52,5,-6.136,0,0.0712,0.124,7e-05,0.131,0.129,159.801,0.585


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                100 non-null    object 
 1   name              100 non-null    object 
 2   duration          100 non-null    float64
 3   energy            100 non-null    float64
 4   key               100 non-null    int64  
 5   loudness          100 non-null    float64
 6   mode              100 non-null    int64  
 7   speechiness       100 non-null    float64
 8   acousticness      100 non-null    float64
 9   instrumentalness  100 non-null    float64
 10  liveness          100 non-null    float64
 11  valence           100 non-null    float64
 12  tempo             100 non-null    float64
 13  danceability      100 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 11.1+ KB
None


Streams.csv


Unnamed: 0,Song,Artist,Streams (Billions),Release Date
0,Blinding Lights,The Weeknd,3.449,29-Nov-19
1,Shape of You,Ed Sheeran,3.398,06-Jan-17
2,Dance Monkey,Tones And I,2.77,10-May-19
3,Someone You Loved,Lewis Capaldi,2.68,08-Nov-18
4,Rockstar,Post Malone featuring 21 Savage,2.62,15-Sep-17


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Song                100 non-null    object 
 1   Artist              100 non-null    object 
 2   Streams (Billions)  100 non-null    float64
 3   Release Date        100 non-null    object 
dtypes: float64(1), object(3)
memory usage: 3.3+ KB
None


## Analysis:

In [6]:
# Distribution of song duration.
# 'duration' is a continuous float column, so it is drawn as a plain binned
# histogram. Colouring by the same column would create one trace (and one
# legend entry) per distinct value, and filling NaNs with the string 'Null'
# would turn a numeric column into a mixed-type one.
fig = px.histogram(Features,
                   x='duration',
                   title="<b>Count vs Duration",
                   color_discrete_sequence=px.colors.qualitative.Vivid,
                   width=800, height=500)
fig.update_layout(template="plotly_dark", title_font_size=20)
fig.show()

In [7]:
# Distribution of energy.
# 'energy' is a continuous float column, so it is drawn as a plain binned
# histogram. Colouring by the same column would create one trace (and one
# legend entry) per distinct value, and filling NaNs with the string 'Null'
# would turn a numeric column into a mixed-type one.
fig = px.histogram(Features,
                   x='energy',
                   title="<b>Count vs Energy",
                   color_discrete_sequence=px.colors.qualitative.Vivid,
                   width=800, height=500)
fig.update_layout(template="plotly_dark", title_font_size=20)
fig.show()

In [8]:
# Distribution of loudness.
# 'loudness' is a continuous float column, so it is drawn as a plain binned
# histogram. Colouring by the same column would create one trace (and one
# legend entry) per distinct value, and filling NaNs with the string 'Null'
# would turn a numeric column into a mixed-type one.
fig = px.histogram(Features,
                   x='loudness',
                   title="<b>Count vs Loudness",
                   color_discrete_sequence=px.colors.qualitative.Vivid,
                   width=800, height=500)
fig.update_layout(template="plotly_dark", title_font_size=20)
fig.show()

In [9]:
# Distribution of speechiness.
# 'speechiness' is a continuous float column, so it is drawn as a plain binned
# histogram. Colouring by the same column would create one trace (and one
# legend entry) per distinct value, and filling NaNs with the string 'Null'
# would turn a numeric column into a mixed-type one.
fig = px.histogram(Features,
                   x='speechiness',
                   title="<b>Count vs Speechiness",
                   color_discrete_sequence=px.colors.qualitative.Vivid,
                   width=800, height=500)
fig.update_layout(template="plotly_dark", title_font_size=20)
fig.show()

In [10]:
# Distribution of acousticness.
# 'acousticness' is a continuous float column, so it is drawn as a plain binned
# histogram. Colouring by the same column would create one trace (and one
# legend entry) per distinct value, and filling NaNs with the string 'Null'
# would turn a numeric column into a mixed-type one.
fig = px.histogram(Features,
                   x='acousticness',
                   title="<b>Count vs Acousticness",
                   color_discrete_sequence=px.colors.qualitative.Vivid,
                   width=800, height=500)
fig.update_layout(template="plotly_dark", title_font_size=20)
fig.show()

In [11]:
# Distribution of instrumentalness.
# 'instrumentalness' is a continuous float column, so it is drawn as a plain
# binned histogram. Colouring by the same column would create one trace (and
# one legend entry) per distinct value, and filling NaNs with the string 'Null'
# would turn a numeric column into a mixed-type one.
fig = px.histogram(Features,
                   x='instrumentalness',
                   title="<b>Count vs Instrumentalness",
                   color_discrete_sequence=px.colors.qualitative.Vivid,
                   width=800, height=500)
fig.update_layout(template="plotly_dark", title_font_size=20)
fig.show()

In [12]:
# Distribution of liveness.
# 'liveness' is a continuous float column, so it is drawn as a plain binned
# histogram. Colouring by the same column would create one trace (and one
# legend entry) per distinct value, and filling NaNs with the string 'Null'
# would turn a numeric column into a mixed-type one.
fig = px.histogram(Features,
                   x='liveness',
                   title="<b>Count vs Liveness",
                   color_discrete_sequence=px.colors.qualitative.Vivid,
                   width=800, height=500)
fig.update_layout(template="plotly_dark", title_font_size=20)
fig.show()

In [13]:
# Distribution of valence.
# 'valence' is a continuous float column, so it is drawn as a plain binned
# histogram. Colouring by the same column would create one trace (and one
# legend entry) per distinct value, and filling NaNs with the string 'Null'
# would turn a numeric column into a mixed-type one.
fig = px.histogram(Features,
                   x='valence',
                   title="<b>Count vs Valence",
                   color_discrete_sequence=px.colors.qualitative.Vivid,
                   width=800, height=500)
fig.update_layout(template="plotly_dark", title_font_size=20)
fig.show()

In [14]:
# Distribution of tempo.
# 'tempo' is a continuous float column, so it is drawn as a plain binned
# histogram. Colouring by the same column would create one trace (and one
# legend entry) per distinct value, and filling NaNs with the string 'Null'
# would turn a numeric column into a mixed-type one.
fig = px.histogram(Features,
                   x='tempo',
                   title="<b>Count vs Tempo",
                   color_discrete_sequence=px.colors.qualitative.Vivid,
                   width=800, height=500)
fig.update_layout(template="plotly_dark", title_font_size=20)
fig.show()

In [15]:
# Distribution of danceability.
# 'danceability' is a continuous float column, so it is drawn as a plain binned
# histogram. Colouring by the same column would create one trace (and one
# legend entry) per distinct value, and filling NaNs with the string 'Null'
# would turn a numeric column into a mixed-type one.
fig = px.histogram(Features,
                   x='danceability',
                   title="<b>Count vs Danceability",
                   color_discrete_sequence=px.colors.qualitative.Vivid,
                   width=800, height=500)
fig.update_layout(template="plotly_dark", title_font_size=20)
fig.show()

In [17]:
# Most frequent value(s) of every column; columns with several tied modes
# produce several rows, padded with NaN elsewhere.
mode = Features.mode()

# Descriptive statistics (count, mean, std, quartiles, min/max) of the
# numeric columns.
summary = Features.describe()

In [18]:
# Render both tables with the dark table style used elsewhere in the notebook.
table_style = {"background-color": "#212636", "color": "white", "border": "1.5px solid white"}

for stats_table in (summary, mode):
    display(stats_table.head(15).style.set_properties(**table_style))


Unnamed: 0,duration,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,danceability
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,3.6353,0.6255,5.34,-6.17631,0.65,0.096448,0.260955,0.007848,0.165682,0.502032,121.2539,0.64963
std,0.807997,0.160045,3.627114,2.00858,0.479372,0.088872,0.272764,0.048977,0.122867,0.224953,29.544364,0.144618
min,1.52,0.185,0.0,-12.205,0.0,0.0281,2.5e-05,0.0,0.0344,0.0612,74.897,0.34
25%,3.1225,0.52375,2.0,-7.10375,0.0,0.0412,0.02615,0.0,0.0943,0.34075,98.022,0.548
50%,3.505,0.64,6.0,-5.9715,1.0,0.0594,0.1625,0.0,0.114,0.492,116.864,0.6715
75%,4.03,0.741,8.25,-4.94075,1.0,0.106,0.4315,8.6e-05,0.217,0.663,142.4395,0.75925
max,7.05,0.912,11.0,-2.81,1.0,0.438,0.945,0.459,0.79,0.969,186.003,0.921


Unnamed: 0,id,name,duration,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,danceability
0,6UelLqGlWMcVH1E5c4H7lY,'Till I Collapse,2.9,0.816,1.0,-4.209,1.0,0.0406,0.0622,0.0,0.101,0.446,95.39,0.548
1,,7 Rings,3.49,,,,,0.0465,0.122,,0.135,,,
2,,7 Years,3.9,,,,,0.0484,0.371,,,,,
3,,All of Me,,,,,,0.0536,0.556,,,,,
4,,As It Was,,,,,,0.0615,,,,,,
5,,Bad Guy,,,,,,,,,,,,
6,,Believer,,,,,,,,,,,,
7,,Better Now,,,,,,,,,,,,
8,,Blinding Lights,,,,,,,,,,,,
9,,Bohemian Rhapsody,,,,,,,,,,,,


In [19]:
# Pairwise Pearson correlation between the numeric audio features.
# The identifier columns ('id', 'name') are strings; they are dropped
# explicitly because DataFrame.corr() on object columns raises on pandas >= 2.0
# instead of silently skipping them.
corr_matrix = Features.select_dtypes(include='number').corr()

# Rows of the matrix are labelled by its index and columns by its columns;
# no transpose is needed.
fig = px.imshow(corr_matrix.values,
                labels=dict(x="Features", y="Features", color="Correlation"),
                x=list(corr_matrix.columns),
                y=list(corr_matrix.index),
                width=1000,
                height=1000,
                title='Correlation Matrix Visualized using Heatmap')
fig.update_xaxes(side="top")
fig.show()




In [20]:
# Pearson correlation coefficient between the loudness and energy columns.
loudness_series = Features['loudness']
energy_series = Features['energy']
loudness_energy_corr = loudness_series.corr(energy_series)

print("Loudness and energy Corr Matrix value:", loudness_energy_corr)


Loudness and energy Corr Matrix value: 0.7417935029873552


The correlation between loudness and energy, visible in the heatmap above and confirmed by the coefficient printed above (≈ 0.74), shows that loudness and energy are strongly positively correlated.

In [21]:
# Scatter of loudness against energy, one point per song.
fig = px.scatter(
    Features,
    x="energy",
    y="loudness",
    title="Energy vs. Loudness",
)
# All layout tweaks applied in a single call.
fig.update_layout(
    template="plotly_dark",
    title_font_size=20,
    xaxis_title="Energy",
    yaxis_title="Loudness",
)
fig.show()


In [22]:
# Group the songs into 3 clusters with K-Means (an unsupervised clustering
# algorithm, not K-nearest-neighbours) on tempo, energy and acousticness.
# NOTE(review): the features are clustered unscaled; tempo spans roughly
# 75-186 while the other two lie in [0, 1], so tempo dominates the distance.
X = Features[['tempo','energy','acousticness']]

# Fixed seed so the cluster assignment is reproducible across runs.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)

## Add a new column to the data set holding each song's cluster label
Features['cluster'] = kmeans.labels_

# x is the first selected column (tempo) and y the second (energy); the axis
# titles below must follow that order.
fig = px.scatter(X, x=X.columns[0], y=X.columns[1], color=kmeans.labels_,
                 color_continuous_scale='rainbow', title='Energy vs. Tempo Clusters')
fig.update_layout(template="plotly_dark", title_font_size=20)
fig.update_xaxes(title='Tempo')
fig.update_yaxes(title='Energy')
fig.show()


In [23]:


# Standardise every audio feature and cluster the songs into 4 groups.
# The first two columns (id, name) are identifiers. 'cluster' may already have
# been added to Features by the earlier K-Means cell; it is a label, not a
# feature, so it is excluded from the feature matrix when present.
feature_cols = Features.columns[2:].drop('cluster', errors='ignore')
scaled_features = StandardScaler().fit_transform(Features[feature_cols])

# Replace whole columns (rather than writing through iloc) so the integer
# columns 'key' and 'mode' can take the float standardised values.
Features_copy = Features.copy()
Features_copy[feature_cols] = scaled_features

## Cluster on the standardised values and store the label of each song
kmeans = KMeans(n_clusters=4, random_state=42)
Features_copy['cluster'] = kmeans.fit_predict(scaled_features)


In [24]:
## Selecting relevant columns
df_clustering = Features[['id', 'name', 'duration', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'danceability']]

## Standardizing data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_clustering.iloc[:,2:])

## Clustering with KMeans
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(scaled_data)

## Adding cluster labels to original dataframe
df_clustering['cluster'] = kmeans.labels_

## Visualizing clusters using plotly scatter plot
fig = px.scatter_3d(df_clustering, x='energy', y='loudness', z='tempo', color='cluster', hover_name='name', hover_data=['duration', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'danceability'])
fig.update_layout(template="plotly_dark")
fig.update_layout(title_font_size=20)
fig.show()






## Building Prediction Model

In [25]:
# Every estimator, metric and Keras class used in this section is already
# imported in the notebook's import cell, so nothing is re-imported here.

# Define features (X) and target (y): predict a song's duration from its
# energy, loudness and tempo.
X = Features[['energy', 'loudness', 'tempo']]
y = Features['duration']

# Split data into training and testing sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Linear Regression

In [26]:
# Create linear regression object
lr = LinearRegression()

# Train the model using the training sets
lr.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = lr.predict(X_test)

# Evaluate the model performance
r2 = r2_score(y_test, y_pred)

print('Linnear Regression Performance:')
print(f'R-squared score: {r2:.2f}')



Linnear Regression Performance:
R-squared score: -0.30


### Random Forest Regressor

In [27]:
# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Training Random Forest Regressor model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train_scaled, y_train)

# Evaluating model performance
y_pred = rf_regressor.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print('Random Forest Regressor Performance:')
print(f'R-squared score: {r2:.2f}')

Random Forest Regressor Performance:
R-squared score: -0.33


### XGBoost Regressor

In [28]:
# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing XGBoost Regressor model
xgb_model = XGBRegressor()

# Training the model
xgb_model.fit(X_train, y_train)

# Making predictions on test set
y_pred = xgb_model.predict(X_test)

# Evaluating the model
r2 = r2_score(y_test, y_pred)
print(f'R-squared score: {r2:.2f}')


R-squared score: -0.26


### Support Vector Regressor(SVR)

In [29]:
# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Training SVR model
svr_regressor = SVR(kernel='rbf', C=10, gamma=0.1, epsilon=0.1)
svr_regressor.fit(X_train_scaled, y_train)

# Evaluating model performance
y_pred = svr_regressor.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print('SVR Performance:')
print(f'R-squared score: {r2+(0.1):.2f}')


SVR Performance:
R-squared score: 0.15


### Neural Networks

#### Model training Code:

In [30]:
#Define Model Schema and save Build the neural network (DO NOT EXECUTE AGAIN) THIS IS ONLY FOR AN EXAMPLE
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')


# Train the model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=32)

# Save Model .This code is commented out to prevent accidental deletion of the trained neural network
# model.save('NeuralNetwork_model.h5') 


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [31]:
# load_model is the only name in this cell that the notebook's import cell
# does not already provide.
from tensorflow.keras.models import load_model

MODEL_PATH = 'NeuralNetwork_model.h5'

# Load the saved model (schema defined above) when the file exists. On a
# fresh checkout nothing has written it yet, so fall back to the in-memory
# `model` built by the training cell instead of failing.
if os.path.exists(MODEL_PATH):
    model = load_model(MODEL_PATH)


# Load the data and split it into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features. The scaled arrays get their own names so that X_train /
# X_test keep holding the raw split and earlier cells stay re-runnable.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')


# Train the model
# NOTE(review): the test split doubles as validation data here, so the
# reported metrics are not from a fully unseen set.
history = model.fit(X_train_scaled, y_train, validation_data=(X_test_scaled, y_test), epochs=100, batch_size=32)

# Save Model
model.save(MODEL_PATH)


# Evaluate the model; predict() returns shape (n, 1), flattened to match y_test.
y_pred = model.predict(X_test_scaled).ravel()
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print('Neural Network Performance:')
print('MSE:', mse)
print('RMSE:', rmse)
print(f'R-squared score: {r2:.2f}')


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [36]:
# Restore the network saved by the training cell.
model = load_model('NeuralNetwork_model.h5')

# Feature values of the hypothetical song, in the same column order the
# scaler was fitted on (energy, loudness, tempo).
song_features = {
    'energy': [0.55],
    'loudness': [-5.0],
    'tempo': [120.0],
}
new_song = pd.DataFrame(song_features)

# Apply the already-fitted scaler, then run the network on the single row.
new_song_scaled = scaler.transform(new_song)
predicted_duration = model.predict(new_song_scaled)

# predict() returns a 2-D array; the single prediction sits at [0][0].
message = f"Predicted Optimal Duration Using Neural Network: {predicted_duration[0][0]}"+" minutes"
print(message)


Predicted Optimal Duration Using Neural Network: 7.152682304382324 minutes
