In [81]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split

selected_features = ["duration_ms_x","danceability", "loudness", "energy", "instrumentalness", "liveness", "valence", "speechiness", "tempo"] 

SUCCESS_FEATURE = 'youtube_views'
SUCCESS_THRESHOLD = 0.75
MILD_SUCCESS_THRESHOLD = 0.4

df = pd.read_csv("rai.csv")
df = df.sort_values(by=['popularity'])
df = df.drop_duplicates(subset=['id'])
df = df.drop_duplicates(subset=['name', 'main_artist'], keep='first')
df = df.drop_duplicates(subset=selected_features, keep='first')

## Feature engineering

### Feature scaling and centering

In [82]:
df.youtube_views = np.log1p(df.youtube_views)
df.last_fm_playcount = np.log1p(df.last_fm_playcount)

### Defining popularity

In [83]:
df['popularity_cat'] = "Unsuccesful"
# TODO Use spotify, Youtube and Last FM 
df.loc[(df[SUCCESS_FEATURE] > df[SUCCESS_FEATURE].quantile(SUCCESS_THRESHOLD)), 'popularity_cat'] = "Succesful"
df.loc[(df[SUCCESS_FEATURE] <= df[SUCCESS_FEATURE].quantile(SUCCESS_THRESHOLD)) & (df[SUCCESS_FEATURE] >= df[SUCCESS_FEATURE].quantile(MILD_SUCCESS_THRESHOLD)), 'popularity_cat'] = "Mildly Succesful"

## Split & prepare the data

In [84]:
train, test = train_test_split(df, test_size=0.2)


selected_target = "popularity_cat"

train_x = train[selected_features] 
train_y = train[selected_target]

test_x = test[selected_features] 
test_y = test[selected_target]

## Building the model

In [85]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.dummy import DummyClassifier
forest = RandomForestClassifier(max_depth=5, min_samples_leaf=3, n_estimators=75)
dummy_clf = DummyClassifier()

In [86]:
forest.fit(X = train_x, y = train_y)
dummy_clf.fit(X = train_x, y = train_y)

DummyClassifier()

In [87]:
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score 
rf_predict = forest.predict(test_x)
train_predict = forest.predict(train_x)
dummy_predict = dummy_clf.predict(test_x)
print("Training accuracy: " + str(accuracy_score(train_y, train_predict)))
print("Testing accuracy: " + str(accuracy_score(test_y, rf_predict)))
print("Dummy accuracy: " + str(accuracy_score(test_y, dummy_predict)))

Training accuracy: 0.6586921850079744
Testing accuracy: 0.4554140127388535
Dummy accuracy: 0.3375796178343949


In [88]:
test["prediction"] = rf_predict
test.sample(10)

Unnamed: 0,id,name,main_artist,key,mode,time_signature,duration_ms_x,danceability,loudness,energy,...,liveness,valence,speechiness,tempo,popularity,youtube_views,last_fm_playcount,last_fm_listeners,popularity_cat,prediction
1551,5cORzMWKkLUlwA1OGGcemf,عم سينا,Cheb Hasni,7,1,4,323133,0.664,-9.562,0.689,...,0.121,0.833,0.0523,162.694,2,3.332205,1.609438,3,Unsuccesful,Unsuccesful
2281,6ZTxpl2s2iqtvg4VVPNm3s,Eachqek Historique,Cheb Houssem,7,1,4,195265,0.615,-4.712,0.817,...,0.349,0.894,0.0572,171.886,0,10.341291,0.0,0,Mildly Succesful,Succesful
218,0IcArqqdY0dWDwFTQV3zZE,achekak historique,Cheb Houssem,7,1,4,252052,0.647,-6.497,0.828,...,0.142,0.928,0.0554,172.014,16,15.858514,3.178054,7,Succesful,Succesful
2222,4RW0cU23xktVOvkbLxFmGU,Mafhamte ouallou,Cheb Bilal,4,0,4,313120,0.574,-4.125,0.915,...,0.324,0.89,0.0431,180.535,0,6.584791,1.94591,3,Unsuccesful,Mildly Succesful
1723,1fGGWbvw2Vb8TldpXLuZ2W,Niveaux tah,Cheb Bilal,1,1,4,322640,0.676,-5.639,0.842,...,0.0999,0.594,0.05,110.018,2,14.122646,4.465908,43,Succesful,Mildly Succesful
1836,0auCj2x94K2w2nTWKZ6GG8,Slam Klani,Khaled,1,1,3,376692,0.535,-11.65,0.592,...,0.0785,0.839,0.22,185.077,1,9.500395,3.295837,17,Unsuccesful,Mildly Succesful
253,1OY5epFo92PoddBVO0TJde,Fa9r Ou Saada,Warda,5,0,4,359448,0.627,-2.266,0.866,...,0.0606,0.961,0.0656,93.972,15,7.41698,0.0,0,Unsuccesful,Succesful
1625,6vzkPT7DUjvREkyayogRA6,Chitnouk alia,Khaled,0,1,4,270565,0.206,-14.05,0.371,...,0.141,0.375,0.0626,81.789,2,6.190315,3.688879,33,Unsuccesful,Unsuccesful
2350,4YhhbjDAZ6qBvxlm2Ih2Av,Ezzine icheyeb (Version remasterisée),Cheb Mami,0,1,4,367973,0.576,-3.548,0.884,...,0.208,0.838,0.0467,95.961,0,4.442651,1.94591,5,Unsuccesful,Unsuccesful
1833,1Tzj0bb35XiTE65TGOVg3h,Ya hbabi rani fi hala - Live,Cheb Houssem,11,1,4,120480,0.44,-7.851,0.803,...,0.347,0.79,0.158,96.516,1,5.710427,2.890372,2,Unsuccesful,Unsuccesful


## Feature importance

In [89]:
for name, score in zip(selected_features, forest.feature_importances_):
    print(name, score)

duration_ms_x 0.11661926294505981
danceability 0.11936234176846648
loudness 0.08284508109204071
energy 0.08938294028348398
instrumentalness 0.12260667960711809
liveness 0.09130637610308692
valence 0.1209622123247974
speechiness 0.14388266713599254
tempo 0.11303243873995401


## Hidden gems

In [90]:
hdf = test.loc[(test['popularity_cat'] == 'Unsuccesful') & (test['prediction'] == 'Succesful'), :]
hdf.sample()

Unnamed: 0,id,name,main_artist,key,mode,time_signature,duration_ms_x,danceability,loudness,energy,...,liveness,valence,speechiness,tempo,popularity,youtube_views,last_fm_playcount,last_fm_listeners,popularity_cat,prediction
253,1OY5epFo92PoddBVO0TJde,Fa9r Ou Saada,Warda,5,0,4,359448,0.627,-2.266,0.866,...,0.0606,0.961,0.0656,93.972,15,7.41698,0.0,0,Unsuccesful,Succesful
