In [1]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
from config import db_password

In [2]:
# Connect to the PostgreSQL database "finalproject" on the local machine (127.0.0.1:5432).
# The password is imported from the local `config` module, so it is not written in this notebook.
# NOTE(review): the password is interpolated into the URL unescaped; if it can contain
# URL-reserved characters (e.g. "@", "/", ":") it must be URL-encoded first -- confirm.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/finalproject"
engine = create_engine(db_string)
# NOTE(review): this connection is not closed anywhere in the visible cells.
dbConnection = engine.connect()

In [3]:
# Read the full "cleansongdata" table and discard the "index" column that was
# stored with it (the original index column), then display the resulting frame.
sdc_df = (
    pd.read_sql("select * from \"cleansongdata\"", dbConnection)
    .drop(columns=["index"])
)
sdc_df

Unnamed: 0,winner,Acousticness,Danceability,Duration,Energy,Explicit,Instrumentalness,Liveness,Loudness,Mode,Popularity,Speechiness,Tempo,TimeSignature,Valence
0,0,0.0332,0.754,198973,0.449,1,8.3e-05,0.552,-9.211,1,86,0.109,77.169,4,0.357
1,0,0.327,0.236,279373,0.578,0,0.00015,0.206,-5.477,1,62,0.0404,75.367,4,0.227
2,0,0.198,0.655,229360,0.797,0,0.0,0.067,-4.787,1,79,0.153,177.928,4,0.839
3,0,0.0742,0.796,219333,0.766,1,0.0,0.0827,-5.974,1,26,0.238,110.034,4,0.558
4,0,0.103,0.76,231827,0.703,0,0.0,0.0913,-5.412,1,60,0.054,95.997,4,0.57
5,0,0.842,0.5,193320,0.225,0,0.0,0.112,-10.22,1,23,0.0302,173.788,3,0.355
6,0,0.0163,0.286,216120,0.784,0,6.5e-05,0.0713,-2.873,1,34,0.0749,173.793,4,0.618
7,0,0.0647,0.647,219200,0.8,0,0.0,0.334,-5.384,1,59,0.165,160.078,4,0.942
8,0,0.00487,0.554,223546,0.772,0,7e-06,0.354,-4.821,0,78,0.0418,179.984,4,0.455
9,0,0.0442,0.564,221947,0.939,0,0.0,0.112,-4.278,1,46,0.0478,116.025,4,0.613


In [4]:
# Split the frame into the feature matrix (every column except "winner")
# and the target vector (the "winner" column).
X = sdc_df.drop(columns=["winner"])
y = sdc_df["winner"]

In [5]:
# Hold out a test set. Stratifying on the target keeps the winner/non-winner
# ratio the same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)
X_train.shape

(41, 14)

In [6]:
# Create the scaler used to standardize the features. Duration and Tempo are on a
# much larger numeric scale than the other columns and may have a disproportionate impact.
scaler = StandardScaler()

In [7]:
# Fit the standard scaler on the training split only.
# fit() returns the scaler itself, so X_scaler and scaler are the same object.
X_scaler = scaler.fit(X_train)

In [8]:
# Standardize the training and test features with the scaler fitted on the training split.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Gradient Boosted Tree

In [9]:
# Sweep a handful of learning rates: fit one gradient-boosted model per rate and
# report its accuracy on the training split and on the held-out split.
# NOTE(review): the "validation" score below is computed on X_test_scaled / y_test,
# i.e. the same test split later used to evaluate the chosen model.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    # fit() returns the fitted estimator, so construction and fitting can be chained.
    classifier = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=5,
        max_depth=3,
        random_state=1,
    ).fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)

    # Mean accuracy on the training data
    train_accuracy = classifier.score(X_train_scaled, y_train)
    print("Accuracy score (training): {0:.3f}".format(train_accuracy))

    # Mean accuracy on the held-out data
    holdout_accuracy = classifier.score(X_test_scaled, y_test)
    print("Accuracy score (validation): {0:.3f}".format(holdout_accuracy))
    print()

Learning rate:  0.05
Accuracy score (training): 0.976
Accuracy score (validation): 0.786

Learning rate:  0.1
Accuracy score (training): 0.976
Accuracy score (validation): 0.786

Learning rate:  0.25
Accuracy score (training): 1.000
Accuracy score (validation): 0.786

Learning rate:  0.5
Accuracy score (training): 1.000
Accuracy score (validation): 0.857

Learning rate:  0.75
Accuracy score (training): 1.000
Accuracy score (validation): 0.929

Learning rate:  1
Accuracy score (training): 1.000
Accuracy score (validation): 0.929



In [10]:
# Build and fit the model with the learning rate chosen from the sweep (0.75);
# all other hyperparameters match the ones used in the sweep.
gbtmodel = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.75,
    max_features=5,
    max_depth=3,
    random_state=1,
).fit(X_train_scaled, y_train)

# Predict the held-out rows and show the predictions next to the true labels.
y_predgbt = gbtmodel.predict(X_test_scaled)
pd.DataFrame({"Prediction": y_predgbt, "Actual": y_test}).head(20)

Unnamed: 0,Prediction,Actual
12,0,0
1,0,0
16,0,0
8,0,0
43,0,0
37,0,0
15,0,0
51,0,1
0,0,0
53,1,1


In [11]:
# Evaluate the first model's test-set predictions.
acc_score = accuracy_score(y_test, y_predgbt)
cm = confusion_matrix(y_test, y_predgbt)
# Label the matrix: rows are the true classes, columns the predicted classes.
cm_df = pd.DataFrame(
    cm,
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)

# Report the confusion matrix, the accuracy and the per-class metrics.
print("Confusion Matrix:")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_predgbt))

Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,11,0
Actual 1,1,2


Accuracy Score : 0.9285714285714286
Classification Report
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       1.00      0.67      0.80         3

    accuracy                           0.93        14
   macro avg       0.96      0.83      0.88        14
weighted avg       0.93      0.93      0.92        14



In [12]:
# Feature importances reported by the fitted GBT model, one value per input column.
importances = gbtmodel.feature_importances_
# Pair each importance with its column name and list them from most to least important.
sorted(zip(importances, X.columns), reverse=True)

[(0.3119335485322745, 'Duration'),
 (0.2554182627318052, 'Energy'),
 (0.1695096951645934, 'Valence'),
 (0.13904476565473944, 'Instrumentalness'),
 (0.11283182970397886, 'Loudness'),
 (0.005321609398107835, 'Acousticness'),
 (0.004370491905124893, 'Popularity'),
 (0.0012166615581492277, 'Liveness'),
 (0.00019930757119071533, 'Speechiness'),
 (7.806482172303943e-05, 'Danceability'),
 (6.675616266811452e-05, 'Tempo'),
 (9.006795644731848e-06, 'Mode'),
 (0.0, 'TimeSignature'),
 (0.0, 'Explicit')]

# Drop every feature with an importance below 0.11 and re-run the model.

In [13]:
# Remove every feature whose importance was below .11 or that would not have a
# rational impact, leaving only the strongest predictors next to the target.
low_importance_features = [
    'Explicit', 'TimeSignature', 'Mode',
    'Tempo', 'Danceability', 'Speechiness',
    'Liveness', 'Popularity', 'Acousticness',
]
important_sdc_df = sdc_df.drop(columns=low_importance_features)
important_sdc_df

Unnamed: 0,winner,Duration,Energy,Instrumentalness,Loudness,Valence
0,0,198973,0.449,8.3e-05,-9.211,0.357
1,0,279373,0.578,0.00015,-5.477,0.227
2,0,229360,0.797,0.0,-4.787,0.839
3,0,219333,0.766,0.0,-5.974,0.558
4,0,231827,0.703,0.0,-5.412,0.57
5,0,193320,0.225,0.0,-10.22,0.355
6,0,216120,0.784,6.5e-05,-2.873,0.618
7,0,219200,0.8,0.0,-5.384,0.942
8,0,223546,0.772,7e-06,-4.821,0.455
9,0,221947,0.939,0.0,-4.278,0.613


In [14]:
# Split the reduced frame into the feature matrix (every column except "winner")
# and the target vector (the "winner" column).
X2 = important_sdc_df.drop(columns=["winner"])
y2 = important_sdc_df["winner"]

In [15]:
# Split the reduced data into training and testing sets.
# Stratify on y2, the target that belongs to this experiment, rather than on `y`
# from the first experiment, so this cell depends only on the reduced data set.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2,
    y2,
    random_state=1,
    stratify=y2,
)
X_train2.shape

(41, 5)

In [16]:
# Scale the data; the Duration feature may have a disproportionate impact.
# Use a fresh StandardScaler here: fit() refits in place and returns the same
# object, so calling scaler.fit(X_train2) on the shared scaler would overwrite
# the statistics held by X_scaler from the first (14-feature) experiment.
X_scaler2 = StandardScaler().fit(X_train2)
X_train_scaled2 = X_scaler2.transform(X_train2)
X_test_scaled2 = X_scaler2.transform(X_test2)

In [17]:
# Build and fit the second model on the reduced feature set, with the same
# hyperparameters as the first model.
gbtmodel2 = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.75,
    max_features=5,
    max_depth=3,
    random_state=1,
).fit(X_train_scaled2, y_train2)

# Predict the held-out rows and show the predictions next to the true labels.
y_predgbt2 = gbtmodel2.predict(X_test_scaled2)
pd.DataFrame({"Prediction": y_predgbt2, "Actual": y_test2})

Unnamed: 0,Prediction,Actual
12,0,0
1,0,0
16,0,0
8,0,0
43,0,0
37,0,0
15,0,0
51,0,1
0,0,0
53,1,1


In [18]:
# Evaluate the second model (reduced feature set) on its test split.
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm2 = confusion_matrix(y_test2, y_predgbt2)
cm_df2 = pd.DataFrame(cm2, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
# Accuracy of the second model
acc_score2 = accuracy_score(y_test2, y_predgbt2)
# Display the results
print("Confusion Matrix:")
display(cm_df2)
# Report acc_score2 (this model's accuracy), not acc_score from the first model.
print(f"Accuracy Score : {acc_score2}")
print("Classification Report")
print(classification_report(y_test2, y_predgbt2))

Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,11,0
Actual 1,1,2


Accuracy Score : 0.9285714285714286
Classification Report
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       1.00      0.67      0.80         3

    accuracy                           0.93        14
   macro avg       0.96      0.83      0.88        14
weighted avg       0.93      0.93      0.92        14



In [21]:
# Feature importances reported by the second fitted GBT model, one value per input column.
importances2 = gbtmodel2.feature_importances_
# Pair each importance with its column name and list them from most to least important.
print("Ordered Importances:")
sorted(zip(importances2, X2.columns), reverse=True)

Ordered Importances:


[(0.3122008663987821, 'Valence'),
 (0.29611562876062675, 'Duration'),
 (0.2793771145228588, 'Instrumentalness'),
 (0.08509594611226944, 'Energy'),
 (0.027210444205462887, 'Loudness')]