In [297]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split

selected_features = ["key", "mode", "time_signature", "duration_ms_x","danceability", "loudness", "energy", "instrumentalness", "liveness", "valence", "speechiness", "tempo"] 

SUCCESS_FEATURE = 'youtube_views'
SUCCESS_THRESHOLD = 0.75

df = pd.read_csv("rai.csv")
df = df.sort_values(by=['popularity'])
df = df.drop_duplicates(subset=['id'])
df = df.drop_duplicates(subset=['name', 'main_artist'], keep='first')
df = df.drop_duplicates(subset=selected_features, keep='first')

## Feature engineering

### Feature scaling and centering

In [298]:
df.youtube_views = np.log1p(df.youtube_views)
df.last_fm_playcount = np.log1p(df.last_fm_playcount)

### Defining popularity

In [299]:
df['popularity_cat'] = "Unsuccesful"
# TODO Use spotify, Youtube and Last FM 
df.loc[(df[SUCCESS_FEATURE] > df[SUCCESS_FEATURE].quantile(SUCCESS_THRESHOLD)), 'popularity_cat'] = "Succesful"

## Split & prepare the data

In [300]:
train, test = train_test_split(df, test_size=0.2)


selected_target = "popularity_cat"

train_x = train[selected_features] 
train_y = train[selected_target]

test_x = test[selected_features] 
test_y = test[selected_target]

## Building the model

In [301]:
from sklearn.ensemble import RandomForestClassifier
tree = RandomForestClassifier(max_depth=4, n_estimators=20, min_samples_split=3, min_samples_leaf=2)

In [302]:
tree.fit(X = train_x, y = train_y)

RandomForestClassifier(max_depth=5, min_samples_leaf=2, min_samples_split=5)

In [303]:
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score 
rf_predict = tree.predict(test_x)
train_predict = tree.predict(train_x)
print("Training accuracy: " + str(accuracy_score(train_y, train_predict)))
print("Testing accuracy: " + str(accuracy_score(test_y, rf_predict)))

Training accuracy: 0.7655502392344498
Testing accuracy: 0.7261146496815286


In [304]:
test["prediction"] = rf_predict
test.sample(10)

Unnamed: 0,id,name,main_artist,key,mode,time_signature,duration_ms_x,danceability,loudness,energy,...,liveness,valence,speechiness,tempo,popularity,youtube_views,last_fm_playcount,last_fm_listeners,popularity_cat,prediction
1327,0DNjyfgcfoA3PzdKNRir3b,Gaa Nabghou Drahem,Cheb Bilal,9,1,4,370311,0.704,-5.096,0.949,...,0.072,0.886,0.0698,108.006,3,14.615519,1.098612,2,Succesful,Unsuccesful
695,0Nl1kHtSCO7bYKWsn49fjC,تجارة,Cheb Bilal,1,1,4,203586,0.692,-3.581,0.881,...,0.134,0.933,0.0701,120.035,6,15.978151,0.0,0,Succesful,Unsuccesful
723,54YyeW3RZGAD5PA0jbakb3,Wali Imigré,Kader Japonais,7,1,4,312586,0.429,-5.321,0.937,...,0.627,0.364,0.36,96.08,6,15.998746,0.0,0,Succesful,Unsuccesful
1688,0gHIteQjluuU9pnzYwfV7V,Charak gataa,Cheikha Rimitti,6,0,4,226146,0.544,-9.628,0.553,...,0.102,0.89,0.0337,107.051,2,11.36842,3.401197,19,Unsuccesful,Unsuccesful
1898,0YPYxAY5jdEHlwWQKKOBuM,Hablatek,Cheb Bilal,6,1,4,374853,0.554,-8.822,0.573,...,0.122,0.452,0.0457,89.0,1,15.192882,2.772589,4,Succesful,Unsuccesful
85,1NpCgWDsQc62AJCHMjqryF,Sabran (Ya Ghali),Cheb Mami,11,0,4,230560,0.806,-5.786,0.834,...,0.213,0.568,0.048,97.002,25,11.814784,7.845024,1019,Unsuccesful,Unsuccesful
473,1gjFJB75Sw71kdEiZC4PeI,Yali goultou sahla,Cheb Bilal,9,0,4,342466,0.729,-7.974,0.725,...,0.115,0.848,0.0363,131.018,9,13.837883,2.484907,5,Succesful,Unsuccesful
573,5zWx0VR3F2s5rH0iNQjRDa,Halaou Laou,Cheb Houssem,7,1,4,631470,0.659,-5.561,0.905,...,0.344,0.916,0.0409,98.025,8,11.246352,4.276666,14,Unsuccesful,Unsuccesful
1812,3Iw5yudaSDoCFqF1yClaxy,Ksemti belleh,Cheb Bilal,1,0,4,321346,0.557,-7.841,0.722,...,0.069,0.786,0.0857,171.988,1,14.414362,2.890372,6,Succesful,Unsuccesful
268,6WQfhbOn5JTUJ4IVy7KZnO,الفقر و السعادة,Warda,5,0,4,300000,0.696,-1.404,0.892,...,0.0541,0.962,0.0617,93.994,15,14.409094,0.0,0,Succesful,Unsuccesful


## Feature importance

In [305]:
for name, score in zip(selected_features, tree.feature_importances_):
    print(name, score)

key 0.05351658779454131
mode 0.019001341362521292
time_signature 0.006645769162628864
duration_ms_x 0.08482956463589594
danceability 0.07708168325625901
loudness 0.08416154539117747
energy 0.14061642243078254
instrumentalness 0.07749308478114333
liveness 0.11454402038792878
valence 0.10522438884852013
speechiness 0.1153189699306076
tempo 0.12156662201799373


## Hidden gems

In [306]:
hdf = test.loc[(test['popularity_cat'] == 'Unsuccesful') & (test['prediction'] == 'Succesful'), :]
hdf.sample(5)

ValueError: a must be greater than 0 unless no samples are taken