In [70]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib as plt
import sklearn as skl
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np

## Importing data

In [50]:
attributes_df = pd.read_csv("../Resources/songAttributes_1999-2019.csv", encoding='unicode_escape')
attributes_df.drop(columns=["X"], inplace=True)
attributes_df["Popularity"].describe()
attributes_df.head()

Unnamed: 0,Acousticness,Album,Artist,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Name,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,0.000728,Collective Soul (Deluxe Version),Collective Soul,0.52,234947,0.904,False,0.0103,0.0634,-5.03,1,Welcome All Again,35,0.0309,106.022,4,0.365
1,0.0182,Collective Soul (Deluxe Version),Collective Soul,0.581,239573,0.709,False,0.000664,0.174,-4.909,1,Fuzzy,31,0.0282,120.027,4,0.408
2,0.000473,Collective Soul (Deluxe Version),Collective Soul,0.572,198400,0.918,False,0.000431,0.0977,-3.324,0,Dig,30,0.0559,144.061,4,0.37
3,0.00097,Collective Soul (Deluxe Version),Collective Soul,0.596,231453,0.661,False,3.3e-05,0.113,-5.051,1,You,35,0.0254,111.975,4,0.183
4,3.6e-05,Collective Soul (Deluxe Version),Collective Soul,0.52,222520,0.808,False,1e-05,0.08,-4.553,0,My Days,21,0.0318,92.721,4,0.666


In [51]:
attributes_billboard = pd.read_csv("../Resources/attributesBillboard.csv")
attributes_billboard.drop(columns=["Unnamed: 0"], inplace=True)
attributes_billboard.head()

Unnamed: 0,Artist,Name,Weekly.rank,Peak.position,Weeks.on.chart,Week,Date,Genre,Writing.Credits,Lyrics,...,Explicit,Instrumentalness,Liveness,Loudness,Mode,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,jonas brothers,sucker,6,1.0,17.0,2019-07-06,"March 1, 2019","Alternative Pop,Boy Band,Teen Pop,Pop-Rock,Pop","Ryan tedder, Louis bell, Frank dukes, Nick jon...",Sucker \nWe go together\nBetter than birds of ...,...,False,0.0,0.106,-5.065,0,81,0.0588,137.958,4,0.952
1,taylor swift,you need to calm down,13,2.0,2.0,2019-07-06,"June 14, 2019","Synth-Pop,LGBTQ+,Pop","Joel little, Taylor swift",You Need To Calm Down \nYou are somebody that ...,...,False,0.0,0.0637,-5.617,1,89,0.0553,85.026,4,0.714
2,panic! at the disco,"hey look ma, i made it",24,24.0,11.0,2019-07-06,"June 22, 2018","Pop-Rock,Jazz Fusion,Alternative,Alternative P...","Jake sinclair, Michael angelakos, Dillon franc...","Hey Look Ma, I Made It \nAll my life, been hus...",...,False,0.0,0.121,-3.337,1,81,0.0695,107.936,4,0.58
3,lee brice,rumor,26,25.0,16.0,2019-07-06,"November 3, 2017",Country,"Kyle jacobs, Ashley gorley, Lee brice",Rumor \nGirl you know I've known you forever\n...,...,False,0.0,0.115,-6.857,1,79,0.0486,140.975,4,0.599
4,panic! at the disco,high hopes,32,4.0,47.0,2019-07-06,"May 23, 2018","Adult Alternative,Rock,Power Pop,Pop-Rock,Alte...","Sam hollander, Cook classics, Tayla parx, Jake...","High Hopes \nHigh, high hopes\nHad to have hig...",...,False,0.0,0.064,-2.729,1,87,0.0618,82.014,4,0.681


In [52]:
print(attributes_df['Popularity'].describe())

print(attributes_billboard['Popularity'].describe())


count    154931.000000
mean         20.249111
std          16.506651
min           0.000000
25%           6.000000
50%          17.000000
75%          31.000000
max          91.000000
Name: Popularity, dtype: float64
count    4774.000000
mean       39.847926
std        21.823721
min         0.000000
25%        23.000000
50%        42.000000
75%        58.000000
max        89.000000
Name: Popularity, dtype: float64


We are using the 75% to make the cut as 0= popular 1=Most popular 

In [53]:
# Making popularity as a binary variable in both DF
attributes_df['Popularity'] = np.where((attributes_df['Popularity'] > 31) & attributes_df['Popularity']
                                            .notna(), 1, 0)

attributes_billboard['Popularity'] = np.where((attributes_billboard['Popularity'] > 58) & attributes_billboard['Popularity']
                                            .notna(), 1, 0)

# Keras Sequential Model

## Attributes DF 

In [54]:
# Creating a sample for the second model
attributes_sample_df = attributes_df.sample(frac=0.3, random_state=78)

In [55]:
X = attributes_sample_df[['Valence', 'Energy', "Danceability", "Acousticness", "Tempo", "Duration", "Instrumentalness","Liveness","Loudness"]]
y = attributes_sample_df['Popularity']

In [56]:
# Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Scaler instance
X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [57]:
nn_model = tf.keras.models.Sequential()

nn_model.add(tf.keras.layers.Dense(units=20, activation="relu", input_dim=9))

nn_model.add(tf.keras.layers.Dense(units=20, activation="relu"))

nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the Sequential model
nn_model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 20)                200       
                                                                 
 dense_13 (Dense)            (None, 20)                420       
                                                                 
 dense_14 (Dense)            (None, 1)                 21        
                                                                 
Total params: 641 (2.50 KB)
Trainable params: 641 (2.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [58]:
# Compile the model and train
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

fit_model = nn_model.fit(X_train_scaled, y_train, epochs=150)

Epoch 1/150


Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 7

In [61]:
# Evaluate the performance of model using the loss and predictive accuracy of the model on the test dataset.
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

364/364 - 0s - loss: 0.5313 - accuracy: 0.7620 - 256ms/epoch - 703us/step
Loss: 0.5312685370445251, Accuracy: 0.7619621157646179


## Attributes Billboard DF

In [None]:
X = attributes_billboard[['Valence', 'Energy', "Danceability", "Acousticness", "Tempo", "Duration", "Instrumentalness","Liveness","Loudness"]]
y = attributes_billboard['Popularity']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Scaler instance
X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
nn_model = tf.keras.models.Sequential()

nn_model.add(tf.keras.layers.Dense(units=20, activation="relu", input_dim=9))

nn_model.add(tf.keras.layers.Dense(units=20, activation="relu"))

nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the Sequential model
nn_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 20)                200       
                                                                 
 dense_4 (Dense)             (None, 20)                420       
                                                                 
 dense_5 (Dense)             (None, 1)                 21        
                                                                 
Total params: 641 (2.50 KB)
Trainable params: 641 (2.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Compile the model and train
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

fit_model = nn_model.fit(X_train_scaled, y_train, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [None]:
# Evaluate the performance of model using the loss and predictive accuracy of the model on the test dataset.
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

38/38 - 0s - loss: 0.6620 - accuracy: 0.7178 - 80ms/epoch - 2ms/step
Loss: 0.6619716882705688, Accuracy: 0.7177554368972778


# SVC Model


## Attributes Billboard DF

In [62]:
attributes_billboard

Unnamed: 0,Artist,Name,Weekly.rank,Peak.position,Weeks.on.chart,Week,Date,Genre,Writing.Credits,Lyrics,...,Explicit,Instrumentalness,Liveness,Loudness,Mode,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,jonas brothers,sucker,6,1.0,17.0,2019-07-06,"March 1, 2019","Alternative Pop,Boy Band,Teen Pop,Pop-Rock,Pop","Ryan tedder, Louis bell, Frank dukes, Nick jon...",Sucker \nWe go together\nBetter than birds of ...,...,False,0.000000,0.1060,-5.065,0,1,0.0588,137.958,4,0.952
1,taylor swift,you need to calm down,13,2.0,2.0,2019-07-06,"June 14, 2019","Synth-Pop,LGBTQ+,Pop","Joel little, Taylor swift",You Need To Calm Down \nYou are somebody that ...,...,False,0.000000,0.0637,-5.617,1,1,0.0553,85.026,4,0.714
2,panic! at the disco,"hey look ma, i made it",24,24.0,11.0,2019-07-06,"June 22, 2018","Pop-Rock,Jazz Fusion,Alternative,Alternative P...","Jake sinclair, Michael angelakos, Dillon franc...","Hey Look Ma, I Made It \nAll my life, been hus...",...,False,0.000000,0.1210,-3.337,1,1,0.0695,107.936,4,0.580
3,lee brice,rumor,26,25.0,16.0,2019-07-06,"November 3, 2017",Country,"Kyle jacobs, Ashley gorley, Lee brice",Rumor \nGirl you know I've known you forever\n...,...,False,0.000000,0.1150,-6.857,1,1,0.0486,140.975,4,0.599
4,panic! at the disco,high hopes,32,4.0,47.0,2019-07-06,"May 23, 2018","Adult Alternative,Rock,Power Pop,Pop-Rock,Alte...","Sam hollander, Cook classics, Tayla parx, Jake...","High Hopes \nHigh, high hopes\nHad to have hig...",...,False,0.000000,0.0640,-2.729,1,1,0.0618,82.014,4,0.681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4769,mark wills,wish you were here,81,34.0,20.0,1999-07-12,,Country,"Debbie moore, Bill anderson, Skip ewing",Wish You Were Here \nThey kissed goodbye at th...,...,False,0.000000,0.3910,-14.857,1,0,0.0310,136.094,4,0.272
4770,mark wills,wish you were here,81,34.0,20.0,1999-07-12,,Country,"Debbie moore, Bill anderson, Skip ewing",Wish You Were Here \nThey kissed goodbye at th...,...,False,0.000000,0.1170,-10.234,1,0,0.0285,138.045,4,0.256
4771,clay walker,she's always right,85,74.0,6.0,1999-07-12,,Pop,,She's Always Right \nShe's Perfect for a cotto...,...,False,0.000000,0.1430,-10.020,1,0,0.0315,142.685,4,0.480
4772,collective soul,heavy,96,73.0,20.0,1999-07-12,,"Hockey,Gaming,Soundtrack,Rock",Collective soul,Heavy \nComplicate this world you wrapped for ...,...,False,0.000563,0.0839,-5.560,1,0,0.0371,106.923,4,0.548


In [64]:
scaled_data_arr = StandardScaler().fit_transform(
    attributes_billboard[[
        "Danceability",
        "Acousticness",
        "Energy",
        "Instrumentalness",
        "Liveness",
        "Duration",
        "Loudness",
        "Valence",
        "Speechiness"
    ]]
)

attributes_scaled_df= pd.DataFrame(scaled_data_arr, columns=[[
        "Danceability",
        "Acousticness",
        "Energy",
        "Instrumentalness",
        "Liveness",
        "Duration",
        "Loudness",
        "Valence",
        "Speechiness"
    ]])
attributes_scaled_df

Unnamed: 0,Danceability,Acousticness,Energy,Instrumentalness,Liveness,Duration,Loudness,Valence,Speechiness
0,1.640417,-0.598419,0.178807,-0.138798,-0.546187,-1.208851,0.356331,1.926640,-0.322868
1,1.143386,-0.756771,-0.186643,-0.138798,-0.814590,-1.422540,0.122208,0.857526,-0.359151
2,-0.214696,-0.735870,0.753087,-0.138798,-0.451009,-1.459964,1.089238,0.255589,-0.211946
3,0.331337,2.749215,-0.830533,-0.138798,-0.489080,-0.823936,-0.403720,0.340938,-0.428606
4,-0.200696,0.113955,1.164944,-0.138798,-0.812686,-0.989569,1.347112,0.709288,-0.291769
...,...,...,...,...,...,...,...,...,...
4769,-0.522715,0.839125,-2.170519,-0.138798,1.262197,0.470381,-3.796805,-1.127969,-0.611057
4770,-0.662724,0.099735,-1.770263,-0.138798,-0.476390,0.138076,-1.836026,-1.199842,-0.636973
4771,-1.040747,1.213559,-0.952350,-0.138798,-0.311415,-0.748182,-1.745261,-0.193618,-0.605874
4772,-0.424709,-0.796646,1.153342,-0.131373,-0.686416,-1.380386,0.146384,0.111843,-0.547821


In [65]:
# preparing the data for the model
popularity = attributes_billboard["Popularity"]
target_popularity=["Popular","Not popular"]
X = attributes_scaled_df

In [67]:
popularity.value_counts()

0    3641
1    1133
Name: Popularity, dtype: int64

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, popularity, random_state=42, stratify=popularity)

In [71]:
model = SVC(kernel="linear", random_state=42)
model.fit(X_train, y_train)

In [72]:
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.7625698324022346
Testing Data Score: 0.7629815745393634


In [73]:
# Calculate the classification report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_popularity))

              precision    recall  f1-score   support

     Popular       0.76      1.00      0.87       911
 Not popular       0.00      0.00      0.00       283

    accuracy                           0.76      1194
   macro avg       0.38      0.50      0.43      1194
weighted avg       0.58      0.76      0.66      1194



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
