In [27]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from sklearn.neighbors import NearestNeighbors
# from joblib import dump, load

import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

In [3]:
# Load data
file = '../../data/spotify_songs.csv'
df = pd.read_csv(file)
df.shape

(18454, 25)

In [4]:
# Get only English songs
songs = df[df.language == 'en']
songs.shape

(15405, 25)

In [5]:
# Check for null values
songs.isnull().sum().sum() == 0

True

In [6]:
def clean_data(data):
    # Remove non-alphanumeric characters
    data = data.apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))

    # Remove extra whitespace and lowercase text 
    data = data.apply(lambda x: ' '.join(x.lower().split()))

    # Remove short words
    data = data.apply(lambda x: ' '.join(x for x in x.split() if len(x) > 2))
    
    # Stop words will be removed in vectorizer
    return data

In [7]:
# Add cleaned lyrics to df
songs['clean_lyrics'] = clean_data(songs['lyrics'])
songs.clean_lyrics.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


1    the trees are singing the wind the sky blue on...
2    yeah spyderman and freeze full effect huh you ...
3    really can stay baby cold outside got away bab...
4    get out business you don keep from turning wit...
5    hold your breath don look down keep trying dar...
Name: clean_lyrics, dtype: object

In [8]:
# Tokenizer function
def tokenizer(song):
    # Create a list of tokens
    tokens = []
    # Split song into words
    words = song.split()
    # Iterate through the words in the song
    for word in words:
        tokens.append(word)
          
    return tokens

In [9]:
# Tokenize clean lyrics
songs['tokens'] = songs.clean_lyrics.apply(tokenizer)
songs['tokens'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


1    [the, trees, are, singing, the, wind, the, sky...
2    [yeah, spyderman, and, freeze, full, effect, h...
3    [really, can, stay, baby, cold, outside, got, ...
4    [get, out, business, you, don, keep, from, tur...
5    [hold, your, breath, don, look, down, keep, tr...
Name: tokens, dtype: object

In [10]:
def count(tokens):
    """
    Calculates some basic statistics about tokens in our corpus (i.e. corpus means collections text data)
    """
    # stores the count of each token
    word_counts = Counter()
    
    # stores the number of docs that each token appears in 
    appears_in = Counter()

    total_docs = len(tokens)

    for token in tokens:
        # stores count of every appearance of a token 
        word_counts.update(token)
        # use set() in order to not count duplicates, thereby count the num of docs that each token appears in
        appears_in.update(set(token))

    # build word count dataframe
    temp = zip(word_counts.keys(), word_counts.values())
    wc = pd.DataFrame(temp, columns = ['word', 'count'])

    # rank the the word counts
    wc['rank'] = wc['count'].rank(method='first', ascending=False)
    total = wc['count'].sum()

    # calculate the percent total of each token
    wc['pct_total'] = wc['count'].apply(lambda token_count: token_count / total)

    # calculate the cumulative percent total of word counts 
    wc = wc.sort_values(by='rank')
    wc['cul_pct_total'] = wc['pct_total'].cumsum()

    # create dataframe for document stats
    t2 = zip(appears_in.keys(), appears_in.values())
    ac = pd.DataFrame(t2, columns=['word', 'appears_in'])
    
    # merge word count stats with doc stats
    wc = ac.merge(wc, on='word')

    wc['appears_in_pct'] = wc['appears_in'].apply(lambda x: x / total_docs)

    return wc.sort_values(by='rank')

In [11]:
wc  = count(songs['tokens'])
wc.head()

Unnamed: 0,word,appears_in,count,rank,pct_total,cul_pct_total,appears_in_pct
43,you,14166,278601,1.0,0.057869,0.057869,0.919572
21,the,14535,222028,2.0,0.046118,0.103988,0.943525
34,and,13634,119728,3.0,0.024869,0.128857,0.885037
28,that,11362,73654,4.0,0.015299,0.144156,0.737553
13,your,10233,58122,5.0,0.012073,0.156229,0.664265


In [12]:
# TFIDF vectorizer
tfidf = TfidfVectorizer(
    stop_words='english', ngram_range=(1,2),
    min_df=5, max_df=0.2,
    max_features=1000,
    tokenizer=tokenizer)

# Create a vocabulary and get word counts per document
dtm = tfidf.fit_transform(songs.clean_lyrics)
dtm = pd.DataFrame(data=dtm.toarray(), columns=tfidf.get_feature_names())
print(dtm.shape)
dtm.head()

(15405, 1000)


Unnamed: 0,act,act like,actin,afraid,ahead,ahh,ain gonna,ain got,air,alive,...,yeah know,yeah love,yeah ooh,yeah yeah,year,years,yes,yesterday,young,yuh
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.399813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.104242,0.0,0.0,0.223496,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Nearest neighbors model 
nn = NearestNeighbors(n_neighbors=6, algorithm="kd_tree")

# Fit on DTM
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', n_neighbors=6)

In [15]:
# sample a doc from dtm to use as our query point 
n = 4795
doc_vector = [dtm.iloc[n]]

# Query Using kneighbors 
neigh_dist, neigh_ind = nn.kneighbors(doc_vector)

In [None]:
# Display test song and nearest neighbors
print('Test song:', songs.iloc[n]['track_name'])
print(f'https://open.spotify.com/track/{songs.iloc[n].track_id}')
print(songs.iloc[n]['lyrics'])
print('\nPredictions:')

for i in range(6):
    ind = neigh_ind[0][i]
    if ind != n:
        track_name = songs.iloc[ind]['track_name']
        artist = songs.iloc[ind]['track_artist']
        lyrics = songs.iloc[ind]['lyrics']
        print(f'{track_name} by {artist}')
        print(f'https://open.spotify.com/track/{songs.iloc[ind].track_id}')
        print(lyrics, '\n')
    # print('\n')

In [16]:
dtm.shape

(15405, 1000)

In [32]:
# Autoencoder

encoded_dim = 32
layer_1 = 64
layer_2 = 128

input_doc = Input(shape = (dtm.shape[1], ))

x = Dense(layer_1, activation = 'relu')(input_doc)
x = Dropout(0.25)(x)
x = Dense(layer_2, activation = 'relu')(x)
encoded = Dense(encoded_dim, activation = 'relu')(x)
x = Dense(layer_2, activation='sigmoid')(encoded)
x = Dense(layer_1, activation = 'relu')(x)
x = Dropout(0.25)(x)
decoded = Dense(dtm.shape[1], activation='sigmoid')(x)

autoencoder = Model(input_doc, decoded)

encoder = Model(input_doc, encoded)

autoencoder.compile(optimizer='nadam', loss='mean_squared_error', metrics=['accuracy'])

In [31]:
autoencoder.summary()

Model: "model_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 1000)]            0         
_________________________________________________________________
dense_16 (Dense)             (None, 64)                64064     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_17 (Dense)             (None, 128)               8320      
_________________________________________________________________
dense_18 (Dense)             (None, 32)                4128      
_________________________________________________________________
dense_19 (Dense)             (None, 128)               4224      
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0   

In [29]:
stop = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3)

autoencoder.fit(dtm, # input image to encoder
                dtm, # provide input image to decoder so the model learns how to reconstruct the input image 
                batch_size=32,
                epochs=100,
                validation_split=.2,
                callbacks=[stop])

Train on 12324 samples, validate on 3081 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


<tensorflow.python.keras.callbacks.History at 0x7fb095148e48>