In [36]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt

selected_features = ["duration_ms_x","danceability","energy", "instrumentalness", "liveness", "valence", "speechiness", "tempo"] 

SUCCESS_FEATURE = 'popularity'
SUCCESS_THRESHOLD = 0.75
MILD_SUCCESS_THRESHOLD = 0.4

df = pd.read_csv("chaabi.csv")
df = df.sort_values(by=['popularity'])
df = df.drop_duplicates(subset=['id'])
df = df.drop_duplicates(subset=['name', 'main_artist'], keep='first')
df = df.drop_duplicates(subset=selected_features, keep='first')


In [37]:
df.describe()

Unnamed: 0,key,mode,time_signature,duration_ms_x,danceability,loudness,energy,instrumentalness,liveness,valence,speechiness,tempo,popularity,youtube_views
count,958.0,958.0,958.0,958.0,958.0,958.0,958.0,958.0,958.0,958.0,958.0,958.0,958.0,958.0
mean,5.075157,0.58142,3.757829,460286.4,0.517825,-8.917685,0.601087,0.120644,0.24471,0.775332,0.058222,119.205842,2.834029,206025.7
std,3.628891,0.493584,0.504749,443147.2,0.117708,3.707884,0.185415,0.267462,0.185106,0.156906,0.035857,25.53941,4.714482,871815.7
min,0.0,0.0,1.0,69984.0,0.227,-27.767,0.000246,0.0,0.0216,0.14,0.0269,50.275,0.0,0.0
25%,2.0,0.0,3.0,233389.8,0.427,-10.7205,0.466,1.6e-05,0.113,0.692,0.0381,102.15875,0.0,330.5
50%,5.0,1.0,4.0,340008.0,0.5175,-8.72,0.587,0.000493,0.1805,0.812,0.04705,113.075,1.0,3655.0
75%,8.0,1.0,4.0,474962.8,0.602,-6.49675,0.73675,0.03015,0.333,0.89775,0.064425,136.457,4.0,41771.75
max,11.0,1.0,5.0,4388522.0,0.85,2.917,0.983,0.973,0.952,0.993,0.421,209.148,35.0,14320230.0


## Feature engineering

In [38]:
df['popularity_cat'] = "Unsuccesful"
# TODO Use spotify, Youtube and Last FM 
df.loc[(df[SUCCESS_FEATURE] > df[SUCCESS_FEATURE].quantile(SUCCESS_THRESHOLD)), 'popularity_cat'] = "Succesful"
df.loc[(df[SUCCESS_FEATURE] <= df[SUCCESS_FEATURE].quantile(SUCCESS_THRESHOLD)) & (df[SUCCESS_FEATURE] > df[SUCCESS_FEATURE].quantile(MILD_SUCCESS_THRESHOLD)), 'popularity_cat'] = "Mildly Succesful"

## Split & prepare the data

In [39]:
train, test = train_test_split(df, test_size=0.2)

selected_target = "popularity_cat"

train_x = train[selected_features] 
train_y = train[selected_target]

test_x = test[selected_features] 
test_y = test[selected_target]

## Building the model

In [40]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.dummy import DummyClassifier
forest = RandomForestClassifier(max_depth=5, min_samples_leaf=2, n_estimators=50)
dummy_clf = DummyClassifier(strategy='stratified')

In [41]:
forest.fit(X = train_x, y = train_y)
dummy_clf.fit(X = train_x, y = train_y)

DummyClassifier(strategy='stratified')

In [42]:
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score 
rf_predict = forest.predict(test_x)
train_predict = forest.predict(train_x)
dummy_predict = dummy_clf.predict(test_x)
print("Training accuracy: " + str(accuracy_score(train_y, train_predict)))
print("Testing accuracy: " + str(accuracy_score(test_y, rf_predict)))
print("Dummy accuracy: " + str(accuracy_score(test_y, dummy_predict)))

Training accuracy: 0.6449086161879896
Testing accuracy: 0.5572916666666666
Dummy accuracy: 0.4270833333333333


In [43]:
for name, score in zip(selected_features, forest.feature_importances_):
    print(name, score)

duration_ms_x 0.16451103956462318
danceability 0.09814895736717424
energy 0.15171401081081323
instrumentalness 0.1094485246640321
liveness 0.06986313547785448
valence 0.1022936143761152
speechiness 0.20009168781755254
tempo 0.10392902992183509


In [44]:
test["prediction"] = rf_predict
test.sample(5)

Unnamed: 0,id,name,main_artist,key,mode,time_signature,duration_ms_x,danceability,loudness,energy,instrumentalness,liveness,valence,speechiness,tempo,popularity,youtube_views,popularity_cat,prediction
511,6jlN6txUOtro1Ir6eRVEQw,Klam El Aare,Naima Dziria,0,1,4,164362,0.32,-8.876,0.952,1e-06,0.706,0.858,0.0645,155.429,3,105,Mildly Succesful,Mildly Succesful
174,3rLn08zyJW0nCwodk9dxFU,Ouryenfagh manahder - Kabyle,Kamel Messaoudi,10,1,4,442524,0.66,-8.943,0.626,0.000426,0.131,0.726,0.112,136.461,10,735865,Succesful,Unsuccesful
536,7hVzCDmnxC0EBFbjKLMyWB,Awah ya dmaghi,El Hachemi Guerouabi,11,1,4,662835,0.497,-11.969,0.331,0.0646,0.124,0.674,0.0636,119.234,2,111221,Mildly Succesful,Unsuccesful
1061,1LSc7wU36U9I2Jq30NRTYY,Youm el djemaâ,Cheikh el Hasnaoui,3,0,3,335987,0.444,-8.91,0.476,0.0,0.369,0.671,0.0937,82.088,0,23254,Unsuccesful,Unsuccesful
1398,4kaLtYtExZPOBZ9FBpsifj,El Mektoub,El Hachemi Guerouabi,2,1,4,440293,0.467,-15.563,0.348,9.2e-05,0.246,0.666,0.0562,112.503,0,198451,Unsuccesful,Unsuccesful
