In [45]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv("rai.csv")
df = df.sort_values(by=['popularity'])
df = df.drop_duplicates(subset=['id'])
df = df.drop_duplicates(subset=['name', 'main_artist'], keep='first')

In [46]:
df.describe()

Unnamed: 0,key,mode,time_signature,duration_ms_x,danceability,loudness,energy,instrumentalness,liveness,valence,speechiness,tempo,popularity,youtube_views
count,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0
mean,5.311059,0.450621,3.896511,346177.345358,0.56827,-7.630745,0.75308,0.021678,0.206796,0.762262,0.085771,122.057871,5.551153,2075357.0
std,3.689292,0.497703,0.353252,100362.876571,0.135562,3.840073,0.160376,0.111717,0.170393,0.17816,0.065963,35.231541,7.362882,7123416.0
min,0.0,0.0,1.0,49266.0,0.196,-27.008,0.037,0.0,0.0173,0.0363,0.0235,53.985,0.0,0.0
25%,1.0,0.0,4.0,290291.0,0.473,-9.9575,0.6635,0.0,0.08755,0.668,0.044,93.9635,1.0,3735.0
50%,6.0,0.0,4.0,342439.0,0.59,-6.973,0.789,1.2e-05,0.143,0.81,0.062,103.194,3.0,71478.0
75%,9.0,1.0,4.0,388093.0,0.674,-4.9495,0.8735,0.00032,0.29,0.8985,0.09855,158.877,8.0,947635.0
max,11.0,1.0,5.0,945016.0,0.883,1.108,0.994,0.939,0.982,0.983,0.525,206.878,47.0,117588000.0


## Feature engineering

In [47]:
df['popularity_cat'] = 0
df.loc[(df['popularity'] < 8), 'popularity_cat' ] = 0 
df.loc[(df['popularity'] >= 8), 'popularity_cat'] = 1

## Split & prepare the data

In [48]:
train, test = train_test_split(df, test_size=0.2)

selected_features = ["key", "mode", "time_signature", "duration_ms_x","danceability", "loudness", "energy", "instrumentalness", "liveness", "valence", "speechiness", "tempo"] 
selected_target = "popularity_cat"

train_x = train[selected_features] 
train_y = train[selected_target]

test_x = test[selected_features] 
test_y = test[selected_target]

## Building the model

In [49]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 10)

In [50]:
forest.fit(X = train_x, y = train_y)

RandomForestClassifier(n_estimators=10)

In [51]:
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score 
rf_predict = forest.predict(test_x)
accuracy_score(test_y, rf_predict)

0.7286135693215339

In [52]:
test["prediction"] = rf_predict
test.head()

Unnamed: 0,id,name,main_artist,key,mode,time_signature,duration_ms_x,danceability,loudness,energy,instrumentalness,liveness,valence,speechiness,tempo,popularity,youtube_views,popularity_cat,prediction
673,39S81pUG3BUeCpACD7iAXE,Nagouad Nabeghik,Cheb Djalil,9,1,4,418841,0.448,-8.806,0.848,0.00285,0.142,0.913,0.0685,89.93,7,1595,0,0
805,0KdtSvEZFB1wvBNzo9d2bW,لوكان يغيب غير نهار,Warda,8,0,4,455706,0.589,-8.281,0.831,0.000388,0.1,0.951,0.0848,93.899,6,286934,0,0
1452,3thILVL2mKnzDCZafm0OTR,ROUKED EL ACHRA,Cheikha Rimitti,7,0,4,706280,0.533,-5.147,0.92,0.0,0.38,0.541,0.102,171.705,3,9739,0,0
7,0GDjF1aq0W3hZN9whZtcmw,Al Hachwa Hachwa,Cheb Bello,4,1,4,315715,0.638,-5.637,0.883,1e-05,0.0814,0.965,0.0423,89.898,42,50406176,1,1
1957,0TuMU1h5dlqSx3kzL8csf2,Bafana bafana,Cheb Bilal,1,1,4,286217,0.688,-3.727,0.932,3e-06,0.809,0.722,0.044,135.969,1,130121,0,0


## Plotting an estimator

In [53]:
estimator = forest.estimators_[5]

from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = selected_features,
                class_names = selected_target,
                rounded = True, proportion = False, 
                precision = 2, filled = True)

import os
os.system('dot -Tpng tree.dot -o tree.png')


1