# Full Processing

This notebook reads the original dataset of ~3.5 million English songs released between 1900 and 2023 and transforms it into training and testing sets with all of the data fully processed.

1) Views are converted into 20 discrete categories \
2) Features are dropped \
3) Artists are label-encoded (post-sorting => based on view count) \
4) Tags are one-hot encoded \
5) Titles and lyrics are tokenized into fixed-length token-ID vectors \
6) Views are one-hot encoded for classification task \
7) Data is split into training and testing sets

In [None]:
import numpy as np
import pandas as pd
import pyarrow.feather as feather
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tqdm.notebook import tqdm
from transformers import AutoTokenizer

In [None]:
# Load the song table from the feather file 'data_feather_EncodedViews'
# (path is relative to the notebook's working directory).
data = pd.read_feather('data_feather_EncodedViews')

### Views are discretized

In [None]:
# Sort ascending by view count and reset the index to 0..n-1, so the
# positional bin codes computed below line up with the rows of `data`.
data.sort_values(by=['views'], inplace=True, ignore_index=True)

# `data` is already sorted, so this list is ascending too: the first element
# is the minimum and the last one the maximum.
views_list = data['views'].to_list()
print(len(views_list))
max_views = views_list[-1]
min_views = views_list[0]
print(max_views)
print(min_views)
print(np.log10(min_views))

# 20 logarithmically spaced upper bin edges between 10**0 and max_views
# (21 points, with the leading edge at 1 dropped), rounded up to whole numbers.
upper_bounds = np.logspace(0, np.log10(max_views), num=21)[1:]
upper_bounds = np.ceil(upper_bounds)
print(upper_bounds)

# Upper bound -> category code (1-based). If rounding produced duplicate
# bounds, the later code wins, exactly as with sequential dict assignment.
mappings = {bound: code for code, bound in enumerate(upper_bounds, start=1)}
print(mappings)

# For every row, find the first bound that is >= its view count and look up
# that bound's code. `searchsorted(..., side='left')` returns precisely that
# index, replacing a Python-level nested loop over every row and bound.
bound_codes = np.array([mappings[bound] for bound in upper_bounds])
first_bound_idx = np.searchsorted(
    upper_bounds, data['views'].to_numpy(), side='left'
)
# NOTE: an IndexError here means some view count exceeds the last bound
# (or is NaN); that is not expected because the last bound is ceil(max_views).
encoded_views = bound_codes[first_bound_idx]

data['encoded_views'] = encoded_views

### Features dropped

In [None]:
# The 'features' column is not used by the rest of the pipeline; discard it.
data = data.drop('features', axis=1)

### Years converted, tags encoded, artists encoded

In [None]:
# Replace the release year with the song's age relative to 2023
# (rsub computes 2023 - year element-wise).
data['age'] = data['year'].rsub(2023)
data = data.drop('year', axis=1)

In [None]:
# One-hot encode the 'tag' column. The encoder expects a 2-D input, hence the
# reshape to a single column; its sparse output is densified for the DataFrame.
tag_ohe = OneHotEncoder()
tag_column = data['tag'].values.reshape(-1, 1)
tagOHE = tag_ohe.fit_transform(tag_column)
tagOH = pd.DataFrame(tagOHE.todense())

In [None]:
def rename_columns(df, s):
    """Relabel the columns of ``df`` as ``s0``, ``s1``, ... and return it.

    The frame is modified in place; the returned object is the same ``df``
    that was passed in, so the call can be used in an assignment.

    Args:
        df: DataFrame whose column labels are replaced.
        s: Prefix placed in front of each column's positional number.

    Returns:
        The same DataFrame, now with positional ``<prefix><n>`` column names.
    """
    df.columns = [f"{s}{position}" for position, _ in enumerate(df.columns)]
    return df

# Label the one-hot tag columns tag0, tag1, ... (rename_columns relabels the
# frame in place and returns it).
tagOH = rename_columns(tagOH, "tag")

In [None]:
# Re-sort by view count. A stable algorithm is requested explicitly: the
# default sort may reorder rows with equal views, which would silently break
# the positional alignment with frames built earlier from `data` (e.g. the
# one-hot tag frame) once the index is reset.
data.sort_values(by=['views'], inplace=True, ignore_index=True, kind='mergesort')

# Label-encoding, fitting: each artist gets the next free integer the first
# time it is seen while walking the rows in ascending view order.
# (Named `artist_codes` rather than `dict` so the builtin is not shadowed.)
artist_codes = {}
for artist in data['artist']:
    artist_codes.setdefault(artist, len(artist_codes))

# Label-encoding, transforming: replace every artist name by its integer code.
data['artist'] = data['artist'].map(artist_codes)

### Tokenizing the titles and lyrics

In [None]:
# Load the tokenizer that belongs to the pretrained model
# "juliensimon/autonlp-song-lyrics-18753417"; it is used below for both the
# lyrics and the titles.
tokenizerNLP = AutoTokenizer.from_pretrained("juliensimon/autonlp-song-lyrics-18753417")

In [None]:
# Tokenize the lyrics: every song is truncated or padded to exactly 512 token
# IDs, and only the "input_ids" part of the tokenizer output is kept.
lyrics = data['lyrics'].tolist()
lyrics_tokens = tokenizerNLP(
    lyrics,
    truncation=True,
    padding='max_length',
    max_length=512,
)["input_ids"]

In [None]:
# Store each song's list of lyric token IDs as a new column.
data['lyricsTok'] = lyrics_tokens

In [None]:
# Tokenize the titles: every title is truncated or padded to exactly 35 token
# IDs, and only the "input_ids" part of the tokenizer output is kept.
titles = data['title'].tolist()
tokens_titles = tokenizerNLP(
    titles,
    truncation=True,
    padding='max_length',
    max_length=35,
)["input_ids"]

In [None]:
# Store each song's list of title token IDs as a new column.
data['titleTok'] = tokens_titles

In [None]:
# Stack the per-song title token lists into one 2-D array (one row per song),
# wrap it in a DataFrame and name its columns title0, title1, ...
title_embeddings = rename_columns(
    pd.DataFrame(np.vstack(data["titleTok"].values)),
    "title",
)

In [None]:
# Stack the per-song lyric token lists into one 2-D array (one row per song),
# wrap it in a DataFrame and name its columns lyrics0, lyrics1, ...
lyrics_embeddings = rename_columns(
    pd.DataFrame(np.vstack(data["lyricsTok"].values)),
    "lyrics",
)

### Views are one-hot encoded

In [None]:
# One-hot encode the discrete view categories. The encoder expects a 2-D
# input, hence the reshape to a single column; the sparse result is densified.
ohe = OneHotEncoder()
view_codes = data['encoded_views'].values.reshape(-1, 1)
df_views = pd.DataFrame(ohe.fit_transform(view_codes).todense())

# rename_columns relabels in place, so `df_view` and `df_views` end up
# referring to the same frame with columns view0, view1, ...
df_view = rename_columns(df_views, "view")

### Split into train/test and write to feather

In [None]:
# Split the row positions 0..n-1 into 80% training / 20% testing; the fixed
# random_state keeps the split reproducible between runs.
row_positions = list(range(len(data)))
train_idx, test_idx = train_test_split(
    row_positions,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Columns of the "everything else" feature table written next to the token
# tables: age, the discrete view label and the one-hot tag indicators.
else_columns = ['age', 'encoded_views', 'tag0', 'tag1', 'tag2', 'tag3', 'tag4', 'tag5']

# The one-hot tag columns live in `tagOH`, not in `data`, so selecting them
# from `data` alone raises a KeyError. Take every column from `data` when it
# is present there and from `tagOH` otherwise, joined on the shared row index.
# NOTE(review): this assumes the rows of `tagOH` are still in the same order
# as the rows of `data` - verify, since `data` is re-sorted after `tagOH` is
# created.
columns_in_data = [col for col in else_columns if col in data.columns]
columns_in_tags = [col for col in else_columns if col not in data.columns]
data_else = pd.concat(
    [data[columns_in_data], tagOH[columns_in_tags]], axis=1
)[else_columns]

data_else_training = data_else.loc[train_idx]
data_else_testing = data_else.loc[test_idx]

# Token tables are split with the same row labels so all three stay aligned.
data_lyrics_training = lyrics_embeddings.loc[train_idx]
data_lyrics_testing = lyrics_embeddings.loc[test_idx]

data_titles_training = title_embeddings.loc[train_idx]
data_titles_testing = title_embeddings.loc[test_idx]

In [None]:
# Persist every split as its own feather file; each key is the destination
# path (relative to the working directory) of the frame it maps to.
split_outputs = {
    'data_else_training': data_else_training,
    'data_else_testing': data_else_testing,
    'data_lyrics_training': data_lyrics_training,
    'data_lyrics_testing': data_lyrics_testing,
    'data_titles_training': data_titles_training,
    'data_titles_testing': data_titles_testing,
}
for destination, frame in split_outputs.items():
    feather.write_feather(frame, destination)