In [1]:
import pandas as pd

path = '/Users/brunobarbieri/Library/CloudStorage/OneDrive-UniversityofPisa/TA_Project/data/'
df = pd.read_csv(path + "lab_lem_merge.csv")

In [2]:
import ast

df['lemmatized_stanzas'] = df['lemmatized_stanzas'].apply(ast.literal_eval)
df

Unnamed: 0.1,Unnamed: 0,id,title,artist,year,views,features,is_country,is_pop,is_rap,is_rb,is_rock,stanza_number,is_chorus,lemmatized_stanzas,label
0,0,0,Something in the Water,Pokey LaFarge,2015,10902,{''},True,False,False,False,False,0,False,"[she, get, a, broke, down, el, camino, in, the...",anger
1,1,0,Something in the Water,Pokey LaFarge,2015,10902,{''},True,False,False,False,False,1,True,"[something, in, the, water, something, in, the...",anger
2,2,0,Something in the Water,Pokey LaFarge,2015,10902,{''},True,False,False,False,False,2,False,"[she, do, her, makeup, and, hair, to, cook, fr...",anticipation
3,3,0,Something in the Water,Pokey LaFarge,2015,10902,{''},True,False,False,False,False,4,False,"[my, hoosi, girl, be, so, fine, shake, the, wa...",fear
4,4,0,Something in the Water,Pokey LaFarge,2015,10902,{''},True,False,False,False,False,5,True,"[something, in, the, water, something, in, the...",fear
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66910,66910,15437,PRETTY,Kennie J.D.,2020,978,{''},False,True,False,False,False,4,True,"[do, she, know, you, call, I, after, hour, whi...",fear
66911,66911,15437,PRETTY,Kennie J.D.,2020,978,{''},False,True,False,False,False,5,False,"[red, dress, red, everything, black, silk, wha...",joy
66912,66912,15438,Crawling Back,Alex Goot,2013,65,{''},False,True,False,False,False,0,False,"[I, try, to, leave, you, once, but, I, could, ...",anticipation
66913,66913,15439,Jersey,Naive New Beaters,2015,454,{''},False,True,False,False,False,0,False,"[ride, cadillac, just, like, dr, dre, dre, dre...",joy


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
# import numpy as np


# Step 1: Convert token lists back into space-separated strings
# (needed for vectorizer)
# texts_str = df['lemmatized_stanzas'].apply(
#     lambda tokens: " ".join(tokens)
# )
# print(texts_str)
df['text_str'] = df['lemmatized_stanzas'].apply(lambda x: ' '.join(x))

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df[[
        'text_str', 'stanza_number', 'is_country',
        'is_pop', 'is_rap', 'is_rb', 'is_rock', 'is_chorus'
    ]],
    df['label'], test_size=0.3, random_state=42
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FunctionTransformer

def convert_bool_to_int(x):
    return x.astype(int)

preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), 'text_str'),
        ('scaler', StandardScaler(), ['stanza_number']),
        (
            'bools', FunctionTransformer(
                convert_bool_to_int, validate=False
            ), [
                'is_country', 'is_pop', 'is_rap',
                'is_rb', 'is_rock', 'is_chorus'
            ]
        )

])


# Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Define the pipeline
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

rf_param_distributions = {
    'preprocessor__text__max_features': [500, 1000, 5000, None],  # Max features for TF-IDF
    'preprocessor__text__ngram_range': [(1, 1), (1, 2)],          # Unigrams or bigrams
    'classifier__n_estimators': [50, 100, 200, 300],              # Number of trees
    'classifier__max_depth': [None, 10, 20, 30],                  # Tree depth
    'classifier__min_samples_split': [2, 5, 10],                  # Min samples per split
    'classifier__min_samples_leaf': [1, 2, 4],                    # Min samples per leaf
    'classifier__bootstrap': [True, False],                       # Bootstrap sampling
}

# RandomizedSearchCV setup
random_search_rf = RandomizedSearchCV(
    estimator= rf_pipeline,
    param_distributions= rf_param_distributions,
    n_iter=20,                                  # Number of random combinations to try
    cv=5,                                       # 5-fold cross-validation
    scoring='accuracy',                         # Metric to optimize
    verbose=2,
    random_state=42,
    n_jobs=-1                                   # Use all available cores
)

In [7]:
# Fit RandomizedSearchCV to the data
random_search_rf.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END classifier__bootstrap=True, classifier__max_depth=20, classifier__min_samples_leaf=4, classifier__min_samples_split=10, classifier__n_estimators=300, preprocessor__text__max_features=5000, preprocessor__text__ngram_range=(1, 1); total time=  37.3s
[CV] END classifier__bootstrap=True, classifier__max_depth=20, classifier__min_samples_leaf=4, classifier__min_samples_split=10, classifier__n_estimators=300, preprocessor__text__max_features=5000, preprocessor__text__ngram_range=(1, 1); total time=  37.4s
[CV] END classifier__bootstrap=True, classifier__max_depth=20, classifier__min_samples_leaf=4, classifier__min_samples_split=10, classifier__n_estimators=300, preprocessor__text__max_features=5000, preprocessor__text__ngram_range=(1, 1); total time=  37.6s
[CV] END classifier__bootstrap=True, classifier__max_depth=20, classifier__min_samples_leaf=4, classifier__min_samples_split=10, classifier__n_estimators=300, preproce

In [8]:
# Best parameters and cross-validation accuracy
print(f"Best Parameters: {random_search_rf.best_params_}")
print(f"Best Cross-Validation Accuracy: {random_search_rf.best_score_}")

# Predict on the test set
y_pred = random_search_rf.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred))

Best Parameters: {'preprocessor__text__ngram_range': (1, 1), 'preprocessor__text__max_features': None, 'classifier__n_estimators': 200, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': None, 'classifier__bootstrap': False}
Best Cross-Validation Accuracy: 0.3541204099060632
              precision    recall  f1-score   support

       anger       0.43      0.68      0.53      4012
anticipation       0.52      0.09      0.16      1820
     disgust       0.47      0.07      0.12      1234
        fear       0.31      0.40      0.35      3615
         joy       0.31      0.55      0.39      3126
     sadness       0.48      0.18      0.27      2424
    surprise       0.45      0.08      0.14      1295
       trust       0.35      0.24      0.29      2549

    accuracy                           0.37     20075
   macro avg       0.41      0.29      0.28     20075
weighted avg       0.40      0.37      0.33     20075



In [9]:
# saves the model
import joblib
from datetime import datetime

model_path = '/Users/brunobarbieri/Library/CloudStorage/OneDrive-UniversityofPisa/TA_Project/models/'

import joblib
joblib.dump(
    random_search_rf.best_estimator_,
    model_path + 'RF_' + datetime.now().strftime("%d-%m-%Y_%H-%M-%S") + '.pkl'
)

['/Users/brunobarbieri/Library/CloudStorage/OneDrive-UniversityofPisa/TA_Project/models/RF_04-12-2024_23-45-10.pkl']

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt

# Binarizzazione delle etichette per il supporto multi-classe
classes = df['label'].unique()
y_train_bin = label_binarize(y_train, classes=classes)
y_test_bin = label_binarize(y_test, classes=classes)

# Calcolo delle probabilità del modello
# (Nota: LinearSVC non fornisce probabilità, ma possiamo usare la decision_function)
y_score = random_search_rf.decision_function(X_test)

# Plot delle ROC curve per ogni classe
plt.figure(figsize=(10, 8))
for i, class_name in enumerate(classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"Class {class_name} (AUC = {roc_auc:.2f})")

# Aggiungi la linea di riferimento (y=x)
plt.plot([0, 1], [0, 1], 'k--', lw=2)

# Configura il grafico
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for Random")
plt.legend(loc="lower right")
plt.grid(alpha=0.3)
plt.show()

# SVM

In [None]:
from sklearn.svm import LinearSVC

svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC(random_state=42))
])

svm_param_distributions = {
    'preprocessor__text__max_features': [500, 1000, 5000, None],
    'preprocessor__text__ngram_range': [(1, 1), (1, 2)],
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'classifier__degree': [2, 3, 4],
    'classifier__gamma': ['scale', 'auto'],
    'classifier__class_weight': [None, 'balanced'],
}



# RandomizedSearchCV setup
random_search_svm = RandomizedSearchCV(
    estimator= svm_pipeline,
    param_distributions= svm_param_distributions,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    verbose=2,
    random_state=42,
    n_jobs=-1
)

In [11]:
# Fit RandomizedSearchCV to the data
random_search_svm.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


Python(51187) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(51188) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(51189) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV] END classifier__C=0.1, classifier__class_weight=None, classifier__degree=3, classifier__gamma=scale, classifier__kernel=poly, preprocessor__text__max_features=5000, preprocessor__text__ngram_range=(1, 1); total time=24.8min
[CV] END classifier__C=0.1, classifier__class_weight=None, classifier__degree=3, classifier__gamma=scale, classifier__kernel=poly, preprocessor__text__max_features=5000, preprocessor__text__ngram_range=(1, 1); total time=24.9min
[CV] END classifier__C=0.1, classifier__class_weight=None, classifier__degree=3, classifier__gamma=scale, classifier__kernel=poly, preprocessor__text__max_features=5000, preprocessor__text__ngram_range=(1, 1); total time=25.3min
[CV] END classifier__C=0.1, classifier__class_weight=None, classifier__degree=3, classifier__gamma=scale, classifier__kernel=poly, preprocessor__text__max_features=5000, preprocessor__text__ngram_range=(1, 1); total time=24.4min
[CV] END classifier__C=0.1, classifier__class_weight=None, classifier__degree=3, cla

KeyboardInterrupt: 

In [None]:
# Best parameters and cross-validation accuracy
print(f"Best Parameters: {random_search_svm.best_params_}")
print(f"Best Cross-Validation Accuracy: {random_search_svm.best_score_}")

# Predict on the test set
y_pred = random_search_svm.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred))

Best Parameters: {'preprocessor__text__ngram_range': (1, 1), 'preprocessor__text__max_features': None, 'classifier__n_estimators': 200, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': None, 'classifier__bootstrap': False}
Best Cross-Validation Accuracy: 0.4204524154195076
              precision    recall  f1-score   support

       anger       0.42      0.93      0.58      2979
anticipation       0.44      0.19      0.26       921
     disgust       0.37      0.06      0.11       524
        fear       0.51      0.25      0.33      1667
         joy       0.48      0.11      0.18       445
     sadness       0.42      0.22      0.29      1110
    surprise       0.48      0.10      0.16       494
       trust       0.44      0.08      0.13       623

    accuracy                           0.43      8763
   macro avg       0.45      0.24      0.25      8763
weighted avg       0.44      0.43      0.36      8763



In [None]:
joblib.dump(
    random_search_rf.best_estimator_,
    model_path + 'SVM_' + datetime.now().strftime("%d-%m-%Y_%H-%M-%S") + '.pkl'
)

['/Users/brunobarbieri/Library/CloudStorage/OneDrive-UniversityofPisa/TA_Project/models/best_svm_pipeline.pkl']

In [None]:
# Binarizzazione delle etichette per il supporto multi-classe
classes = df['label'].unique()
y_train_bin = label_binarize(y_train, classes=classes)
y_test_bin = label_binarize(y_test, classes=classes)

# Calcolo delle probabilità del modello
# (Nota: LinearSVC non fornisce probabilità, ma possiamo usare la decision_function)
y_score = random_search_svm.decision_function(X_test)

# Plot delle ROC curve per ogni classe
plt.figure(figsize=(10, 8))
for i, class_name in enumerate(classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"Class {class_name} (AUC = {roc_auc:.2f})")

# Aggiungi la linea di riferimento (y=x)
plt.plot([0, 1], [0, 1], 'k--', lw=2)

# Configura il grafico
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for Multi-Class SVM (One-vs-Rest)")
plt.legend(loc="lower right")
plt.grid(alpha=0.3)
plt.show()