# DACS Elevator Radio Producer

In [None]:
# One-time environment setup; the commands are left commented out.
# NOTE(review): `!` commands run in a temporary subshell, so `conda activate`
# issued from a cell does not switch the environment of the running kernel.
# Run the conda lines in a terminal before launching Jupyter instead.

# Create a new conda virtual environment
#!conda create -n nlp_project7 python=3.9
#!conda activate nlp_project7

# Install all required libraries
#!pip install -r requirements.txt

## Get data from the Billboard top 100
If the data for this year has already been stored, it will not be downloaded again; you can use the stored copy directly.

In [None]:
# Billboard chart year to fetch; edit this value to start with a different year.
address = "2019"

In [None]:
from data.base import store_lyrics

# Store the lyrics for the chart year selected in `address`.
store_lyrics(year=address)

## Get data from a Spotify playlist
### How to find playlist ID:
![playlist id](imgs/playlist_id.png)

Alternatively, if you share the playlist as a link, the ID is the string just before `?si=`:  
https://open.spotify.com/playlist/**37i9dQZF1DX5Ejj0EkURtP**?si=a1e0243dd67c4cc3


In [None]:
# Spotify playlist ID to fetch; edit this value to start with a different playlist.
# Note: this overwrites the `address` value set for the Billboard year.
address = "4E4kp49bDhaSjyGFOyMKuz"

In [None]:
# Imported again here, so this cell does not depend on the Billboard cell having run.
from data.base import store_lyrics

# Same helper as in the Billboard section, called with a playlist ID instead of a year.
store_lyrics(playlist_id=address)

## Pre-process:

In [None]:
from data.base import read_cleaned_data

# Uncomment the next line to override the year / playlist chosen earlier.
#address = "2019"
lyrics_tokens = read_cleaned_data(
    address,
    remove_stopwords=True,
    stem_words=True,
)

In [None]:
# Sanity check: print the name and cleaned tokens of one song.
SAMPLE_INDEX = 15  # position of the song to preview

song_names = list(lyrics_tokens.keys())
if song_names:
    # Clamp the index so a playlist with fewer than SAMPLE_INDEX + 1 songs
    # previews its last song instead of raising IndexError.
    name = song_names[min(SAMPLE_INDEX, len(song_names) - 1)]
    print(name)
    print(lyrics_tokens[name])
else:
    print("lyrics_tokens is empty - nothing to preview.")

## Get topics:

In [None]:
from model.base import get_keywords

# Extract keywords from the cleaned lyrics using get_keywords' default settings.
result = get_keywords(lyrics_tokens)
# Alternative (disabled): choose the model and extraction options explicitly.
#result = get_keywords(lyrics_tokens, model = 'bert', n_gram=(1,1), word_no=5)

### Store the extraction result

In [None]:
from data.base import store_extraction_result

# The output file is named after the current `address` (year or playlist ID).
# NOTE(review): later cells read "./result/{address}.csv", so the helper
# presumably writes into ./result/ - confirm in data/base.py.
df = store_extraction_result(result, f"{address}.csv")
# Bare last expression so the notebook displays the returned object.
df

## Cluster and visualization
### First: we take the first keyword of every song and visualize them, so we have 100 keywords for 100 songs.    
The size of each circle reflects how often the word appears among the 100 keywords.  

### Use lyrics as training data to train the word2vec model

In [None]:
from itertools import chain

# Imported here as well so this cell does not rely on the pre-processing cell.
from data.base import read_cleaned_data

# Train our own word2vec
# Build the training corpus from several years of cleaned, unstemmed lyrics.
TRAIN_YEARS = range(2018, 2023)  # 2018 through 2022 inclusive

train_data = []
for year in TRAIN_YEARS:
    lyrics_by_song = read_cleaned_data(year, stem_words=False)
    # Flatten one level: append the items of every per-song value.
    # chain.from_iterable does this in linear time, whereas
    # sum(list_of_lists, []) re-copies the accumulated list on every addition.
    train_data.extend(chain.from_iterable(lyrics_by_song.values()))

# Show the first training item as a quick sanity check.
print(train_data[0])

In [None]:
from model import w2v

# Train a word2vec model on `train_data` (built in the previous cell), then persist it.
# NOTE(review): the next cell loads "result/our_model.model" - presumably the
# path save_wvmodel writes to; confirm in model/w2v.py.
model = w2v.train_wvmodel(train_data)
w2v.save_wvmodel(model)

In [None]:
from gensim.models import Word2Vec

# Reload the saved model from disk and keep only its word vectors.
# Note: `model` now refers to the `.wv` attribute, not the full Word2Vec object.
saved_model = Word2Vec.load("result/our_model.model")
model = saved_model.wv

### Use pre-trained word2vec model

In [None]:
import pandas as pd

from model import cluster

# Load the keywords stored for the current `address` and plot them.
keyword_csv = f"./result/{address}.csv"
df = pd.read_csv(keyword_csv)
cluster.plot_one_per_song(df)
# Alternative (disabled): set the cluster count and pass the word vectors loaded above.
#cluster.plot_one_per_song(df,cluster_no=8, wv_model = model)

Another way:  
We take the first 5 keywords of every song and sort them by frequency. This gives 500 keywords for 100 songs, of which the first 100 are shown in the plot.    
The size of each circle reflects how often the word appears among the 100 keywords.

In [None]:
# Uses `cluster` and `df` from the previous plotting cell.
cluster.plot_five_per_song(df, cluster_no=8)
# Alternative (disabled): additionally pass the word vectors loaded earlier.
#cluster.plot_five_per_song(df,cluster_no=8, wv_model = model)

### Other visualization

In [None]:
import pandas as pd

from data import analysis
from model import cluster

# Build a word list from the stored keywords and render it as a word cloud.
# NOTE(review): the 5 is presumably the number of keywords taken per song - confirm.
keyword_table = pd.read_csv(f"./result/{address}.csv")
words = cluster.generate_word_list(keyword_table, 5)
analysis.plot_word_cloud(words)

## Evaluation

In [None]:
import pandas as pd

from evaluation.title_compare import get_score

# Score a stored experiment result; the bare call displays the returned value.
df = pd.read_csv("result/experiments/bert_ff.csv")
get_score(df)

In [None]:
import pandas as pd

from data.base import read_cleaned_data
from evaluation.cv_umass import evaluate

# Note: this overwrites the notebook-wide `address` with the integer year 2018.
address = 2018

# Experiment result to evaluate against our own corpus.
model_df = pd.read_csv("./result/experiments/tfidf_ff.csv")

# Corpus for the evaluation: lyrics with stop words kept and no stemming.
lyrics_tokens = read_cleaned_data(
    address,
    remove_stopwords=False,
    stem_words=False,
)

score = evaluate(model_df, lyrics_tokens)
print(score)
# Alternative (disabled): request the u_mass method explicitly.
# evaluate(model_df,lyrics_tokens,method='umass')