In [24]:
import pandas as pd
import matplotlib.pyplot as plt

In [25]:
df = pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,track_ids,track_names,artists,first_artists,danceability,energy,loudness,mode,acousticness,instrumentalness,valence,tempo,mood
0,1k1Bqnv2R0uJXQN4u6LKYt,Ain't No Sunshine,['Bill Withers'],Bill Withers,0.527,0.415,-11.451,0,0.457,1.7e-05,0.515,78.169,Happy
1,3zBhihYUHBmGd2bcQIobrF,(Sittin' On) the Dock of the Bay,['Otis Redding'],Otis Redding,0.768,0.367,-11.226,1,0.683,1.8e-05,0.532,103.621,Happy
2,3SdTKo2uVsxFblQjpScoHy,Stand By Me,['Ben E. King'],Ben E. King,0.65,0.306,-9.443,1,0.57,7e-06,0.605,118.068,Happy
3,3NfxSdJnVdon1axzloJgba,I Say a Little Prayer,['Aretha Franklin'],Aretha Franklin,0.592,0.355,-14.051,1,0.478,0.0,0.499,133.032,Happy
4,4kP69y3GKHi9tXckfgp4bK,For Once In My Life,['Stevie Wonder'],Stevie Wonder,0.524,0.519,-11.903,1,0.195,0.0,0.847,110.121,Happy


In [26]:
labels = df['mood'].copy()

columns_to_drop = ['track_ids', 'track_names', 'artists', 'first_artists', 'mood', 'instrumentalness']
cleaned_df = df.drop(columns=columns_to_drop)

cleaned_df.head()

Unnamed: 0,danceability,energy,loudness,mode,acousticness,valence,tempo
0,0.527,0.415,-11.451,0,0.457,0.515,78.169
1,0.768,0.367,-11.226,1,0.683,0.532,103.621
2,0.65,0.306,-9.443,1,0.57,0.605,118.068
3,0.592,0.355,-14.051,1,0.478,0.499,133.032
4,0.524,0.519,-11.903,1,0.195,0.847,110.121


In [27]:
correlation_matrix = cleaned_df.corr()
print(correlation_matrix)

              danceability    energy  loudness      mode  acousticness  \
danceability      1.000000  0.166548  0.141086 -0.068504     -0.206953   
energy            0.166548  1.000000  0.745147 -0.132598     -0.747508   
loudness          0.141086  0.745147  1.000000 -0.081610     -0.592130   
mode             -0.068504 -0.132598 -0.081610  1.000000      0.095857   
acousticness     -0.206953 -0.747508 -0.592130  0.095857      1.000000   
valence           0.460710  0.438028  0.223395  0.004844     -0.333560   
tempo            -0.169570  0.210749  0.117271 -0.035054     -0.147751   

               valence     tempo  
danceability  0.460710 -0.169570  
energy        0.438028  0.210749  
loudness      0.223395  0.117271  
mode          0.004844 -0.035054  
acousticness -0.333560 -0.147751  
valence       1.000000  0.050325  
tempo         0.050325  1.000000  


In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

clf = DecisionTreeClassifier()
params = {"max_depth": [4, 6, 8],
          "max_features": [4, 6, 8]}
grid_search = GridSearchCV(clf, params, cv = 5, scoring = 'accuracy', n_jobs = -1)
y_pred = cross_val_predict(grid_search, cleaned_df, labels, cv = 5)

accuracy = accuracy_score(labels, y_pred)
print("Accuracy: ", accuracy)
print("Classification Report:")
print(classification_report(labels, y_pred))

conf_matrix = confusion_matrix(labels, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy:  0.7434354485776805
Classification Report:
              precision    recall  f1-score   support

       Happy       0.76      0.74      0.75       933
         Sad       0.73      0.75      0.74       895

    accuracy                           0.74      1828
   macro avg       0.74      0.74      0.74      1828
weighted avg       0.74      0.74      0.74      1828

Confusion Matrix:
[[686 247]
 [222 673]]


In [29]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

scaler = StandardScaler()
pca = PCA()
knn = KNeighborsClassifier(n_neighbors = 7)
pipeline = Pipeline([
    ('scaler', scaler),
    ('pca', pca),
    ('knn', knn)
])

param_grid = {
    'pca__n_components': list(range(2, 8)),
    'knn__n_neighbors': list(range(2, 5))
}

grid_search = GridSearchCV(pipeline, param_grid, cv = 5, n_jobs = -1)
y_pred = cross_val_predict(grid_search, cleaned_df, labels, cv = 5)

accuracy = accuracy_score(labels, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(labels, y_pred))

conf_matrix = confusion_matrix(labels, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.7303063457330415
Classification Report:
              precision    recall  f1-score   support

       Happy       0.74      0.72      0.73       933
         Sad       0.72      0.74      0.73       895

    accuracy                           0.73      1828
   macro avg       0.73      0.73      0.73      1828
weighted avg       0.73      0.73      0.73      1828

Confusion Matrix:
[[670 263]
 [230 665]]


In [30]:
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import warnings

# Define the pipeline
pipeline_mlp = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', MLPClassifier(max_iter = 20, solver = 'adam', random_state = 8))
])

# Define the parameter grid for grid search
param_grid_mlp = {
    'mlp__hidden_layer_sizes': [5, 10, 15, 20],
    'mlp__activation': ['logistic', 'tanh', 'relu']
}

# Perform grid search
grid_search_mlp = GridSearchCV(pipeline_mlp, param_grid_mlp, cv = 5, n_jobs = -1)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    y_pred = cross_val_predict(grid_search_mlp, cleaned_df, labels, cv = 5)

accuracy = accuracy_score(labels, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(labels, y_pred))

conf_matrix = confusion_matrix(labels, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.776258205689278
Classification Report:
              precision    recall  f1-score   support

       Happy       0.79      0.77      0.78       933
         Sad       0.77      0.78      0.77       895

    accuracy                           0.78      1828
   macro avg       0.78      0.78      0.78      1828
weighted avg       0.78      0.78      0.78      1828

Confusion Matrix:
[[720 213]
 [196 699]]


In [31]:
from sklearn.ensemble import RandomForestClassifier

pipeline_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier())
])

param_grid_rf = {
    'rf__max_depth': list(range(2, 8)),
    'rf__max_features': ["sqrt", "log2"]
}

grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, n_jobs=-1)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    y_pred_rf = cross_val_predict(grid_search_rf, cleaned_df, labels, cv=5)

accuracy_rf = accuracy_score(labels, y_pred_rf)
print("Accuracy:", accuracy_rf)
print("Classification Report:")
print(classification_report(labels, y_pred_rf))

print("Confusion Matrix:")
print(confusion_matrix(labels, y_pred_rf))


Accuracy: 0.7664113785557987
Classification Report:
              precision    recall  f1-score   support

       Happy       0.79      0.74      0.76       933
         Sad       0.74      0.80      0.77       895

    accuracy                           0.77      1828
   macro avg       0.77      0.77      0.77      1828
weighted avg       0.77      0.77      0.77      1828

Confusion Matrix:
[[686 247]
 [180 715]]


In [33]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict

clf = AdaBoostClassifier()
params = {
    'n_estimators': [50, 100, 150, 200]
}
grid_search = GridSearchCV(clf, params, cv = 5, scoring = 'accuracy')
y_pred = cross_val_predict(grid_search, cleaned_df, labels, cv = 5)

accuracy = accuracy_score(labels, y_pred)
print("Accuracy: ", accuracy)
print("Classification Report:")
print(classification_report(labels, y_pred))

conf_matrix = confusion_matrix(labels, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy:  0.7532822757111597
Classification Report:
              precision    recall  f1-score   support

       Happy       0.77      0.74      0.75       933
         Sad       0.74      0.77      0.75       895

    accuracy                           0.75      1828
   macro avg       0.75      0.75      0.75      1828
weighted avg       0.75      0.75      0.75      1828

Confusion Matrix:
[[687 246]
 [205 690]]


In [37]:
import pickle

scaler = StandardScaler()
scaler.fit(cleaned_df) # Fit the StandardScaler with feature names to get rid of warning
pipeline_mlp = Pipeline([
    ('scaler', scaler),
    ('mlp', MLPClassifier(max_iter=20, solver='adam', random_state=8))
])

param_grid_mlp = {
    'mlp__hidden_layer_sizes': [5, 10, 15, 20],
    'mlp__activation': ['logistic', 'tanh', 'relu']
}

grid_search_mlp = GridSearchCV(pipeline_mlp, param_grid_mlp, cv=5, n_jobs=-1)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    mlp_model = grid_search_mlp.fit(cleaned_df.values, labels)

filename = 'finalized_model.sav'
pickle.dump(mlp_model, open(filename, 'wb'))
