In [None]:
# Install the required libraries
!pip install transformers
!pip install tensorflow



In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, TFBertModel, RobertaTokenizer, TFRobertaModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('/content/tcc_ceds_music.csv')

In [None]:
df.columns

Index(['Unnamed: 0', 'artist_name', 'track_name', 'release_date', 'genre',
       'lyrics', 'len', 'dating', 'violence', 'world/life', 'night/time',
       'shake the audience', 'family/gospel', 'romantic', 'communication',
       'obscene', 'music', 'movement/places', 'light/visual perceptions',
       'family/spiritual', 'like/girls', 'sadness', 'feelings', 'danceability',
       'loudness', 'acousticness', 'instrumentalness', 'valence', 'energy',
       'topic', 'age'],
      dtype='object')

In [None]:
X = df.drop(['genre','Unnamed: 0','topic','artist_name', 'track_name', 'release_date', 'len', 'movement/places', 'light/visual perceptions',
       'family/spiritual', 'like/girls', 'age'], axis=1)
y = df['genre']

In [None]:
# BERT
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# DistilBERT
# model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')


# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

# RoBERTa
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# model = TFRobertaModel.from_pretrained('roberta-base')

In [None]:
# Encode the genre labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(y)

In [None]:
tokenized = X['lyrics'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True))

In [None]:
tokenized.shape

(28372,)

In [None]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [None]:
np.array(padded).shape

(28372, 512)

In [None]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(28372, 512)

In [None]:
input_ids = torch.tensor(padded)

In [None]:
# attention_mask = torch.tensor(np.where(padded != 0, 1, 0))

# with torch.no_grad():
#     last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
# features = last_hidden_states[0][:,0,:].numpy()

In [None]:
train_features, test_features, train_labels, test_labels = train_test_split(input_ids, labels)

In [None]:
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)
print('best scores: ', grid_search.best_score_)

KeyboardInterrupt: ignored

In [None]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

In [None]:
lr_clf.score(test_features, test_labels)

0.2413647257859862

In [None]:
# Evaluate the model on the test set
y_pred = lr_clf.predict(test_features)
accuracy = accuracy_score(test_labels, y_pred)
report = classification_report(test_labels, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)

Accuracy: 0.2413647257859862
Classification Report:
               precision    recall  f1-score   support

           0       0.20      0.07      0.10      1133
           1       0.24      0.37      0.29      1386
           2       0.06      0.02      0.03       226
           3       0.18      0.04      0.07       935
           4       0.26      0.57      0.36      1775
           5       0.16      0.07      0.10       624
           6       0.16      0.02      0.04      1014

    accuracy                           0.24      7093
   macro avg       0.18      0.17      0.14      7093
weighted avg       0.21      0.24      0.19      7093



In [None]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.246 (+/- 0.00)


In [None]:
# #hide_output
# from transformers import pipeline

# # Change `transformersbook` to your Hub username
# model_id = "transformersbook/distilbert-base-uncased-finetuned-emotion"
# classifier = pipeline("text-classification", model=model_id)