In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
from sklearn.neighbors import NearestNeighbors
from joblib import dump, load

# import spacy
# nlp = spacy.load("en_core_web_md", disable=['parser', 'tagger', 'ner'])

In [3]:
# Load data
file = '../../data/spotify_songs.csv'
df = pd.read_csv(file)
df.shape

(18454, 25)

In [4]:
# Get only English songs
songs = df[df.language == 'en']
songs.shape

(15405, 25)

In [5]:
# Check for null values
songs.isnull().sum().sum() == 0

True

In [10]:
def clean_data(data):
    # Remove non-alphanumeric characters
    data = data.apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))

    # Remove extra whitespace and lowercase text 
    data = data.apply(lambda x: ' '.join(x.lower().split()))

    # Remove short words
    data = data.apply(lambda x: ' '.join(x for x in x.split() if len(x) > 2))
    
    # Stop words will be removed in vectorizer
    return data

In [14]:
# Add cleaned lyrics to df
songs['clean_lyrics'] = clean_data(songs['lyrics'])
songs.clean_lyrics.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


1    the trees are singing the wind the sky blue on...
2    yeah spyderman and freeze full effect huh you ...
3    really can stay baby cold outside got away bab...
4    get out business you don keep from turning wit...
5    hold your breath don look down keep trying dar...
Name: clean_lyrics, dtype: object

In [15]:
# Tokenizer function
def tokenizer(song):
    # Create a list of tokens
    tokens = []
    # Split song into words
    words = song.split()
    # Iterate through the words in the song
    for word in words:
        tokens.append(word)
          
    return tokens

In [16]:
# Tokenize clean lyrics
songs['tokens'] = songs.clean_lyrics.apply(tokenizer)
songs['tokens'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


1    [the, trees, are, singing, the, wind, the, sky...
2    [yeah, spyderman, and, freeze, full, effect, h...
3    [really, can, stay, baby, cold, outside, got, ...
4    [get, out, business, you, don, keep, from, tur...
5    [hold, your, breath, don, look, down, keep, tr...
Name: tokens, dtype: object

In [17]:
def count(tokens):
    """
    Calculates some basic statistics about tokens in our corpus (i.e. corpus means collections text data)
    """
    # stores the count of each token
    word_counts = Counter()
    
    # stores the number of docs that each token appears in 
    appears_in = Counter()

    total_docs = len(tokens)

    for token in tokens:
        # stores count of every appearance of a token 
        word_counts.update(token)
        # use set() in order to not count duplicates, thereby count the num of docs that each token appears in
        appears_in.update(set(token))

    # build word count dataframe
    temp = zip(word_counts.keys(), word_counts.values())
    wc = pd.DataFrame(temp, columns = ['word', 'count'])

    # rank the the word counts
    wc['rank'] = wc['count'].rank(method='first', ascending=False)
    total = wc['count'].sum()

    # calculate the percent total of each token
    wc['pct_total'] = wc['count'].apply(lambda token_count: token_count / total)

    # calculate the cumulative percent total of word counts 
    wc = wc.sort_values(by='rank')
    wc['cul_pct_total'] = wc['pct_total'].cumsum()

    # create dataframe for document stats
    t2 = zip(appears_in.keys(), appears_in.values())
    ac = pd.DataFrame(t2, columns=['word', 'appears_in'])
    
    # merge word count stats with doc stats
    wc = ac.merge(wc, on='word')

    wc['appears_in_pct'] = wc['appears_in'].apply(lambda x: x / total_docs)

    return wc.sort_values(by='rank')

In [18]:
wc  = count(songs['tokens'])
wc.head()

Unnamed: 0,word,appears_in,count,rank,pct_total,cul_pct_total,appears_in_pct
50,you,14166,278601,1.0,0.057869,0.057869,0.919572
4,the,14535,222028,2.0,0.046118,0.103988,0.943525
24,and,13634,119728,3.0,0.024869,0.128857,0.885037
35,that,11362,73654,4.0,0.015299,0.144156,0.737553
6,your,10233,58122,5.0,0.012073,0.156229,0.664265


In [19]:
# TFIDF vectorizer
tfidf = TfidfVectorizer(
    stop_words='english', ngram_range=(1,2),
    min_df=5, max_df=0.2,
    max_features=1000,
    tokenizer=tokenizer)

# Create a vocabulary and get word counts per document
dtm = tfidf.fit_transform(songs.clean_lyrics)
dtm = pd.DataFrame(data=dtm.toarray(), columns=tfidf.get_feature_names())

In [21]:
dtm.head()

Unnamed: 0,act,act like,actin,afraid,ahead,ahh,ain gonna,ain got,air,alive,...,yeah know,yeah love,yeah ooh,yeah yeah,year,years,yes,yesterday,young,yuh
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.399813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.104242,0.0,0.0,0.223496,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
print(len(songs))
songs['tokens'].apply(len).mean()


15405


312.51567672833494

In [25]:
# Nearest neighbors model 
nn = NearestNeighbors(n_neighbors=6, algorithm="kd_tree")

# Fit on DTM
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=6, p=2,
                 radius=1.0)

In [26]:
# sample a doc from dtm to use as our query point 
n = 795
doc_vector = [dtm.iloc[n]]

# Query Using kneighbors 
neigh_dist, neigh_ind = nn.kneighbors(doc_vector)

In [28]:
# Display test song and nearest neighbors
print('Test song:', songs.iloc[n]['track_name'])
print(f'https://open.spotify.com/track/{songs.iloc[n].track_id}')
print('\nPredictions:')

for i in range(6):
    ind = neigh_ind[0][i]
    if ind != n:
        track_name = songs.iloc[ind]['track_name']
        artist = songs.iloc[ind]['track_artist']
        lyrics = songs.iloc[ind]['lyrics']
        print(f'{track_name} by {artist}')
        print(f'https://open.spotify.com/track/{songs.iloc[ind].track_id}')
    print(lyrics)
    # print('\n')

Test song: We Start Fires
https://open.spotify.com/track/0hsMhZA3gguNuHOcDsGmOf

Predictions:
High, high hopes Had to have high, high hopes for a living Shooting for the stars when I couldn't make a killing Didn't have a dime but I always had a vision Always had high, high hopes (High, high hopes) Had to have high, high hopes for a living Didn't know how but I always had a feeling I was gonna be that one in a million Always had high, high hopes Mama said, fulfill the prophecy Be something greater, go make a legacy Manifest destiny, back in the days We wanted everything, wanted everything Mama said, burn your biographies Rewrite your history, light up your wildest dreams Museum victories, every day We wanted everything, wanted everything Mama said don't give up, it's a little complicated All tied up, no more love and I'd hate to see you waiting Had to have high, high hopes for a living Shooting for the stars when I couldn't make a killing Didn't have a dime but I always had a vision Alw