In [29]:
import pandas as pd
import numpy as np

In [30]:
# Load the exported Spotify lyrics dataset from the working directory.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index that was written out on export — confirm whether it is needed.
df = pd.read_csv(filepath_or_buffer="Spotify Million Song Dataset_exported.csv")
df.head(n=5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [31]:
# The relative URL in `link` is not used by any later step, so discard it.
df = df.drop(columns=['link'])
df.head(n=5)

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


Removing line breaks and tabs, words containing apostrophes, punctuation, and extra whitespace

In [32]:
import re
import string

def clean_text(text):
    """Normalise one lyrics string for tokenization.

    Processing steps, in order:
      1. lowercase the text;
      2. delete every word that contains an apostrophe (e.g. "it's", "she's"),
         i.e. the whole word is removed, not only the apostrophe;
      3. delete every remaining character that is neither a word character
         nor whitespace (punctuation);
      4. collapse all whitespace runs (spaces, tabs, "\\r", "\\n") into single
         spaces and strip leading/trailing whitespace.

    Parameters
    ----------
    text : str or None or float
        Raw lyrics. ``None`` and ``NaN`` (how pandas represents a missing
        cell) are treated as empty text; any other non-string value is
        converted with ``str()``.

    Returns
    -------
    str
        The cleaned, single-line, lowercase text ('' for missing input).
    """
    # Missing cells arrive as None or float NaN; NaN is the only float that is
    # not equal to itself. Without this guard `.lower()` raises AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    # Remove contractions/possessives entirely (word + apostrophe + word).
    text = re.sub(r'\b\w+\'\w+\b', '', text)
    # Remove punctuation: anything that is not a word character or whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    # split() without arguments splits on any whitespace run, so newlines,
    # carriage returns and tabs are handled here without separate replace calls.
    return ' '.join(text.split())

In [33]:
# Show the raw lyrics of the first row (index label 0) before cleaning.
df.loc[0, 'text']

"Look at her face, it's a wonderful face  \r\nAnd it means something special to me  \r\nLook at the way that she smiles when she sees me  \r\nHow lucky can one fellow be?  \r\n  \r\nShe's just my kind of girl, she makes me feel fine  \r\nWho could ever believe that she could be mine?  \r\nShe's just my kind of girl, without her I'm blue  \r\nAnd if she ever leaves me what could I do, what could I do?  \r\n  \r\nAnd when we go for a walk in the park  \r\nAnd she holds me and squeezes my hand  \r\nWe'll go on walking for hours and talking  \r\nAbout all the things that we plan  \r\n  \r\nShe's just my kind of girl, she makes me feel fine  \r\nWho could ever believe that she could be mine?  \r\nShe's just my kind of girl, without her I'm blue  \r\nAnd if she ever leaves me what could I do, what could I do?\r\n\r\n"

In [34]:
# Run the cleaner over every lyrics entry, overwriting the raw text in place.
df['text'] = df['text'].apply(clean_text)

In [35]:
# Show the first row's lyrics again to inspect the effect of the cleaning step.
df.loc[0, 'text']

'look at her face a wonderful face and it means something special to me look at the way that she smiles when she sees me how lucky can one fellow be just my kind of girl she makes me feel fine who could ever believe that she could be mine just my kind of girl without her blue and if she ever leaves me what could i do what could i do and when we go for a walk in the park and she holds me and squeezes my hand go on walking for hours and talking about all the things that we plan just my kind of girl she makes me feel fine who could ever believe that she could be mine just my kind of girl without her blue and if she ever leaves me what could i do what could i do'

In [36]:
# Concatenate artist, song title and cleaned lyrics into one space-separated
# string per row; this is the text handed to the tokenizer later on.
df['combined'] = df['artist'].str.cat([df['song'], df['text']], sep=' ')
df.loc[0, 'combined']

"ABBA Ahe's My Kind Of Girl look at her face a wonderful face and it means something special to me look at the way that she smiles when she sees me how lucky can one fellow be just my kind of girl she makes me feel fine who could ever believe that she could be mine just my kind of girl without her blue and if she ever leaves me what could i do what could i do and when we go for a walk in the park and she holds me and squeezes my hand go on walking for hours and talking about all the things that we plan just my kind of girl she makes me feel fine who could ever believe that she could be mine just my kind of girl without her blue and if she ever leaves me what could i do what could i do"

Tokenization (BERT tokenizer)

In [37]:
import tensorflow as tf

In [38]:
from transformers import BertTokenizer
# Load the uncased BERT word-piece tokenizer.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

# Tokenize the whole corpus in a single call and return TensorFlow tensors.
encoded_inputs = tokenizer(
    df['combined'].to_list(),
    padding=True,       # pad shorter sequences so all rows share one length
    truncation=True,    # cut sequences that exceed max_length
    max_length=512,
    return_tensors='tf',
)

In [39]:
# for key, value in encoded_inputs.items():
#     print( '{} : {}'.format( key, value ) )

Removing stopwords (currently disabled: the code below is commented out)

In [40]:
# import nltk
# nltk.download('stopwords')
# from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords

# stop_words = set(stopwords.words('english'))

Encoding the combined text with LabelEncoder

In [41]:
from sklearn.preprocessing import LabelEncoder
# Map each distinct value of `combined` to an integer class id.
# NOTE(review): `combined` contains the full lyrics, so presumably almost every
# row ends up as its own class; the reported accuracy of 0 during training is
# consistent with that. Confirm what the intended prediction target is.
label_encoder = LabelEncoder()
label_encoder.fit(df['combined'])
df['combined_encoded'] = label_encoder.transform(df['combined'])

In [42]:
# Print the first row's combined text followed by its integer label.
print(df.loc[0, 'combined'], df.loc[0, 'combined_encoded'], sep='\n')

ABBA Ahe's My Kind Of Girl look at her face a wonderful face and it means something special to me look at the way that she smiles when she sees me how lucky can one fellow be just my kind of girl she makes me feel fine who could ever believe that she could be mine just my kind of girl without her blue and if she ever leaves me what could i do what could i do and when we go for a walk in the park and she holds me and squeezes my hand go on walking for hours and talking about all the things that we plan just my kind of girl she makes me feel fine who could ever believe that she could be mine just my kind of girl without her blue and if she ever leaves me what could i do what could i do
93


Train/test split

In [44]:
# Pull the token-id tensor out of the tokenizer output and convert it to a
# NumPy array so scikit-learn's train_test_split can slice it.
# Only `input_ids` is extracted here; the other tokenizer outputs are not used.
encoded_inputs_np = encoded_inputs['input_ids'].numpy()

In [45]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible between runs.
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    encoded_inputs_np,
    df['combined_encoded'],
    test_size=0.2,
    random_state=42,
)

Creating TensorFlow datasets for the training and test sets

In [46]:
def _as_model_inputs(input_ids):
    """Build the feature dict BERT expects from a 2-D array of token ids.

    The sequences were padded by the tokenizer, so the model needs an
    attention mask to ignore the padding positions; when no mask is supplied
    the model attends to every position, padding included. The mask is
    rebuilt here as 1 wherever the token is not the tokenizer's pad token.
    NOTE(review): assumes the pad id never occurs as a real token — confirm
    for tokenizers other than BERT.
    """
    attention_mask = (input_ids != tokenizer.pad_token_id).astype('int32')
    return {'input_ids': input_ids, 'attention_mask': attention_mask}


# Each dataset element is (feature dict, integer label).
train_dataset = tf.data.Dataset.from_tensor_slices(
    (_as_model_inputs(train_inputs), train_labels)
)
test_dataset = tf.data.Dataset.from_tensor_slices(
    (_as_model_inputs(test_inputs), test_labels)
)

Shuffle (training set only) and batch

In [47]:
# Shuffle with a buffer as large as the training set: a buffer smaller than
# the dataset only mixes nearby elements, whereas a full-size buffer gives a
# uniform reshuffle on every epoch. Prefetching lets the input pipeline
# prepare the next batch while the current one is being trained on.
train_dataset = (
    train_dataset
    .shuffle(buffer_size=len(train_inputs))
    .batch(8)
    .prefetch(tf.data.AUTOTUNE)
)
# The evaluation set keeps its order; it is only batched and prefetched.
test_dataset = test_dataset.batch(8).prefetch(tf.data.AUTOTUNE)

In [48]:
from transformers import TFBertForSequenceClassification

# Load pretrained BERT with a new classification head that has one output
# unit per class known to the label encoder.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
# The loss is computed from raw logits (from_logits=True) with integer class
# ids as targets, matching the LabelEncoder output.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=[metric],
)

Fitting the model

In [50]:
# Fine-tune for three epochs, evaluating on the held-out set after each one.
# The returned History object is kept for later inspection.
History_log = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=3,
)

Epoch 1/3
 319/5765 [>.............................] - ETA: 22:05:28 - loss: 10.9864 - accuracy: 0.0000e+00

KeyboardInterrupt: 