# Libraries

In [22]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report


# Load Dataset

In [23]:
# Load only the two columns the pipeline uses. The CSV also carries an
# "Unnamed: 0" column (it looks like a row index written out when the file
# was saved); skipping it keeps the frame to text + label only.
df = pd.read_csv(
    "spotify_dataset_final_cleaned.csv",
    usecols=["lyrics", "mood"],
)
print("Total rows:", len(df))
df.head()


Total rows: 176052


Unnamed: 0,lyrics,mood
0,i m goin in for the kill i m doin it for a thr...,anger
1,bow to the king x i was prey praying to the be...,anger
2,you ain t never seen uh how a pimp be rollin s...,anger
3,every time i call you tell me that you soon co...,anger
4,you know it seems to me that everyone is conce...,anger


# Split Data

In [24]:
# Shuffle into a NEW frame instead of rebinding `df`: re-running this cell then
# always starts from the frame as loaded and produces the same split, rather
# than reshuffling an already-shuffled frame.
df_shuffled = df.sample(frac=1, random_state=42)

X = df_shuffled["lyrics"]  # input text
y = df_shuffled["mood"]    # target label

# Hold out 20% for evaluation; stratify on the label so every mood keeps the
# same proportion in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# TF-IDF

In [None]:
# Unigram + bigram TF-IDF features. The vocabulary and IDF weights are learned
# from the training split only and then reused unchanged for the test split.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),    # single words and word pairs
    min_df=2,              # drop terms that occur in fewer than 2 documents
    max_features=250000,   # upper bound on the vocabulary size
    stop_words="english",  # remove the built-in English stop-word list
    sublinear_tf=True,     # tf -> 1 + log(tf): damps long songs and heavily repeated words
    norm="l2",             # scale every document vector to unit L2 length
)

X_train_tfidf = tfidf.fit_transform(X_train)  # learn vocabulary/IDF on train, then vectorize it
X_test_tfidf = tfidf.transform(X_test)        # vectorize test with the already-fitted vocabulary


# Train Models (Multinomial Naive Bayes, Logistic Regression & SVM)

In [None]:
# Fit three linear text classifiers on the same TF-IDF features and score each
# one by accuracy on the held-out test split.

# ---- Naive Bayes ----
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
nb_pred = nb.predict(X_test_tfidf)
nb_acc = accuracy_score(y_test, nb_pred)

# ---- Logistic Regression ----
lr = LogisticRegression(
    max_iter=6000,
    C=3.0,  # C is the INVERSE regularization strength: small C = stronger regularization, large C = weaker
    class_weight="balanced"  # Weight classes inversely to their frequency so frequent moods do not dominate
)
lr.fit(X_train_tfidf, y_train)
lr_pred = lr.predict(X_test_tfidf)
lr_acc = accuracy_score(y_test, lr_pred)


# ---- SVM ----
svm = LinearSVC(
    C=2.0,
    class_weight="balanced",
    loss="hinge",  # Standard SVM loss; encourages separating classes with the widest possible margin
    random_state=42  # Hinge loss is solved in the dual, which shuffles data pseudo-randomly; fix the seed for reproducible results
)
svm.fit(X_train_tfidf, y_train)
svm_pred = svm.predict(X_test_tfidf)
svm_acc = accuracy_score(y_test, svm_pred)


print("Naive Bayes Accuracy:", nb_acc)
print("Logistic Regression Accuracy:", lr_acc)
print("SVM Accuracy:", svm_acc)


Naive Bayes Accuracy: 0.4292692624463946
Logistic Regression Accuracy: 0.6007213654823776
SVM Accuracy: 0.5821760245377865




# Save Best Model

In [29]:
# Candidate models keyed by short name -> (fitted estimator, test accuracy).
accs = {
    "NB":  (nb, nb_acc),
    "LR":  (lr, lr_acc),
    "SVM": (svm, svm_acc)
}

# Pick the entry with the highest accuracy (on a tie, the first key wins).
# NOTE(review): selection uses the same test split that the reported accuracy
# comes from, so that number is slightly optimistic; a separate validation
# split would give an unbiased estimate.
best_name = max(accs, key=lambda k: accs[k][1])
best_model, best_acc = accs[best_name]

print("\nBEST MODEL:", best_name)
print("Accuracy:", best_acc)


# Persist the winning model together with the fitted vectorizer (both are
# needed at inference time). Context managers guarantee the files are flushed
# and closed even if pickling raises.
with open("moodify_model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)
with open("tfidf.pkl", "wb") as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

print("Saved model + TF-IDF.")




BEST MODEL: LR
Accuracy: 0.6007213654823776
Saved model + TF-IDF.


# Best Model's Classification Report

In [30]:

# Per-class precision / recall / F1 of the selected model on the held-out split.
best_pred = best_model.predict(X_test_tfidf)
report = classification_report(y_test, best_pred)
print(report)

              precision    recall  f1-score   support

       anger       0.61      0.64      0.63      8000
        fear       0.62      0.61      0.61      5176
         joy       0.56      0.55      0.56      8000
        love       0.63      0.66      0.65      5044
     sadness       0.59      0.58      0.59      8000
    surprise       0.58      0.51      0.55       991

    accuracy                           0.60     35211
   macro avg       0.60      0.59      0.60     35211
weighted avg       0.60      0.60      0.60     35211

