In [81]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.datasets import make_regression

In [82]:
#Read data
data = pd.read_csv(Path('./second_dataset.csv'))
df = pd.DataFrame(data)

In [83]:
#Drop insignificant columns
df = df.drop(columns=['artist_name','track_id', 'track_name','key', 'mode', 'time_signature'])

In [84]:
df.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,duration
0,0.112,0.653,0.524,0.0,0.203,-9.016,0.0502,83.97,0.553,97,160.0
1,0.468,0.737,0.802,0.0,0.0931,-4.771,0.0878,144.015,0.682,92,172.0
2,0.779,0.699,0.304,0.000993,0.137,-11.192,0.0993,119.705,0.271,83,160.0
3,0.207,0.774,0.554,0.0,0.132,-7.909,0.0383,99.034,0.349,82,163.0
4,0.737,0.483,0.412,0.0,0.116,-8.461,0.0402,170.163,0.247,81,204.0


In [85]:
df['is_popular'] = df['popularity'].apply(lambda x: 1 if x>60 else 0)

In [86]:
df.sample(5)

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,duration,is_popular
7794,0.441,0.622,0.22,0.719,0.109,-25.284,0.0456,159.935,0.0816,18,168.0,0
4344,0.96,0.465,0.14,0.941,0.0733,-17.304,0.0261,84.109,0.255,4,152.0,0
987,0.323,0.515,0.519,0.000657,0.0566,-11.197,0.0397,89.499,0.582,44,383.0,0
8867,0.787,0.247,0.0257,0.0,0.151,-23.576,0.0355,70.372,0.15,5,225.0,0
5813,2.7e-05,0.487,0.991,0.0,0.177,-1.741,0.0673,110.019,0.369,46,296.0,0


In [87]:
# Define the features set.
X = df.copy()
X = X.drop(columns=["popularity", 'is_popular'])
X.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,duration
0,0.112,0.653,0.524,0.0,0.203,-9.016,0.0502,83.97,0.553,160.0
1,0.468,0.737,0.802,0.0,0.0931,-4.771,0.0878,144.015,0.682,172.0
2,0.779,0.699,0.304,0.000993,0.137,-11.192,0.0993,119.705,0.271,160.0
3,0.207,0.774,0.554,0.0,0.132,-7.909,0.0383,99.034,0.349,163.0
4,0.737,0.483,0.412,0.0,0.116,-8.461,0.0402,170.163,0.247,204.0


In [88]:
# Define the target set
y = df["is_popular"].values
y[:5]

array([1, 1, 1, 1, 1])

In [89]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [90]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(7499, 10)
(2500, 10)
(7499,)
(2500,)


In [91]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [92]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [93]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

In [94]:
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [95]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2250,135
Actual 1,105,10


In [96]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

0.904

In [97]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2250,135
Actual 1,105,10


Accuracy Score : 0.904
Classification Report
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      2385
           1       0.07      0.09      0.08       115

    accuracy                           0.90      2500
   macro avg       0.51      0.52      0.51      2500
weighted avg       0.91      0.90      0.91      2500

