In [4]:
import multiprocessing
import os
import pickle
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

This notebook cleans the lyric data and creates the TF-IDF vectorizer and Latent Semantic Indexing (LSI) objects.
This notebook creates pickle files that the application uses.

Pickle files are used to reduce runtime during search: because they are pre-generated, the application can load them and go directly into searching.
This also allows us to store the concept vectors instead of the full lyrics, greatly reducing memory usage.

NOTE: Regenerating the saved pickle files is not recommended unless you have new track data that will replace the original pickle files.


In [5]:
# Functions for generating the LSI object, the TF-IDF vectorizer, and the lyric vectors.
def load_lyrics_data(data_path=None):
    """
    Load the lyrics CSV and return a dataframe with cleaned lyrics.

    Parameters
    ----------
    data_path : str, optional
        Path of the CSV file to read. When omitted, the path is built from
        the current working directory: everything before the first
        '/rocchio_records' is kept and
        '/rocchio_records/lyric_genius_api/practice_data.csv' is appended.

    Returns
    -------
    pandas.DataFrame
        Columns 'track_name', 'artist_name', 'track_id' and 'raw_lyrics',
        with duplicate lyrics and rows containing missing values removed,
        a fresh 0..n-1 index, newlines replaced by spaces and the
        '<digits>Embed' text removed from the lyrics.
    """
    if data_path is None:
        root_path = os.getcwd().split('/rocchio_records')[0]
        data_path = root_path + '/rocchio_records/lyric_genius_api/practice_data.csv'

    # step 1: import the lyrical data and keep only the columns we need.
    lyric_df = pd.read_csv(data_path)
    lyric_df = lyric_df[['track_name', 'artist_name', 'track_id', 'raw_lyrics']]

    # step 2: drop duplicate instances of the same track and incomplete rows.
    # The index is reset last so the result is contiguous after both filters.
    lyric_df = (
        lyric_df
        .drop_duplicates(subset='raw_lyrics')
        .dropna()
        .reset_index(drop=True)
    )

    # step 3: clean the lyrics. The result is assigned back explicitly rather
    # than calling replace(..., inplace=True) on a column selection, which
    # modifies a temporary (and therefore nothing) under pandas Copy-on-Write.
    cleaned_lyrics = (
        lyric_df['raw_lyrics']
        # replace new line character
        .replace('\n', ' ', regex=True)
        # remove embed text from lyric genius API
        .replace('[0-9]{1,3}Embed', '', regex=True)
    )
    return lyric_df.assign(raw_lyrics=cleaned_lyrics)

def remove_non_english_tracks(df):
    """
    Detect the language of every track and keep only the English ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'raw_lyrics' column of strings. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` whose lyrics were detected as English ('en'), with
        an added 'language' column. Tracks whose language cannot be detected
        are dropped along with the non-English ones.
    """
    # langdetect is non-deterministic unless the factory seed is pinned;
    # pin it so the same input always yields the same set of tracks.
    DetectorFactory.seed = 0

    def _detect_language(lyrics):
        try:
            return detect(lyrics)
        except LangDetectException:
            # Raised when the text has no usable features (e.g. empty or
            # digits only); treat such tracks as "not English".
            return None

    # assign() builds a new frame, so the caller's dataframe is left untouched.
    tagged = df.assign(language=df['raw_lyrics'].apply(_detect_language))
    return tagged[tagged['language'] == 'en']

def create_lyric_tfidf(lyric_df, min_df):
    """
    Create and fit a TF-IDF vectorizer on the track lyrics.

    Parameters
    ----------
    lyric_df : pandas.DataFrame
        Must contain a 'raw_lyrics' column of strings.
    min_df : int or float
        Forwarded to TfidfVectorizer(min_df=...).

    Returns
    -------
    TfidfVectorizer
        The vectorizer fitted on lyric_df['raw_lyrics'].
    """
    # Lyrics are tokenized with str.split. token_pattern is set to None
    # because it is unused once a tokenizer is supplied; leaving the default
    # makes scikit-learn emit a warning on every fit.
    tfidf = TfidfVectorizer(tokenizer=str.split, token_pattern=None, min_df=min_df)
    # fit the TFIDF vectorizer
    tfidf.fit(lyric_df['raw_lyrics'])
    return tfidf

def lsi_lyrics(lyric_df, tfidf, num_concepts):
    """
    Fit a TruncatedSVD (LSI) model on the TF-IDF matrix of the lyrics.

    lyric_df     -- dataframe with a 'raw_lyrics' column
    tfidf        -- fitted vectorizer used to transform the lyrics
    num_concepts -- number of SVD components to keep

    Returns the fitted TruncatedSVD object.
    """
    term_matrix = tfidf.transform(lyric_df['raw_lyrics'])
    # fixed random_state keeps the decomposition reproducible between runs
    svd_model = TruncatedSVD(random_state=15, n_components=num_concepts)
    svd_model.fit(term_matrix)
    return svd_model

def create_lyric_vecs(lyric_df, lsiObj, tfidf):
    """
    Project the lyrics into the concept space.

    The lyrics in lyric_df['raw_lyrics'] are first transformed with `tfidf`
    and the resulting matrix is then transformed with `lsiObj`; the output of
    that second transform is returned.
    """
    return lsiObj.transform(tfidf.transform(lyric_df['raw_lyrics']))

def create_lyric_dictionary(lyric_df, lyric_vecs):
    """
    Map every track id to its LSI lyric vector.

    Parameters
    ----------
    lyric_df : pandas.DataFrame
        Must contain a 'track_id' column.
    lyric_vecs : sequence
        One vector per row of `lyric_df`, in the same order.

    Returns
    -------
    dict
        {track_id: vector}. If a track id occurs more than once, the vector
        of its last occurrence is kept.

    Raises
    ------
    ValueError
        If the number of vectors differs from the number of track ids; the
        two inputs would otherwise be paired up incorrectly.
    """
    track_ids = lyric_df['track_id'].values
    if len(track_ids) != len(lyric_vecs):
        raise ValueError(
            'lyric_df has {} track ids but lyric_vecs has {} vectors'.format(
                len(track_ids), len(lyric_vecs)
            )
        )
    # pair the ids and vectors positionally
    return dict(zip(track_ids, lyric_vecs))

def simulate_user_input(user_playlist):
    """
    Simulate user feedback by attaching a random score to every playlist row.

    Each row receives an independent draw from {0, 1, 2}:
    0 = neutral
    1 = dislike
    2 = love

    The scores are stored in a 'feedback' column on `user_playlist` itself,
    and that same object is returned.
    """
    n_tracks = len(user_playlist)
    scores = []
    for _ in range(n_tracks):
        # upper bound of randint is exclusive, so this yields 0, 1 or 2
        scores.append(np.random.randint(0, 3))
    user_playlist['feedback'] = scores
    return user_playlist


# WARNING: USING THESE FUNCTIONS CAN OVERWRITE THE ORIGINAL FILES THE APPLICATION USES.
# ONLY USE THESE FUNCTIONS IF YOU HAVE NEW TRAINING DATA.
def create_artist_track_id_df(test_df):
    """
    Save the unique (track_name, artist_name, track_id) rows of `test_df`.

    The rows are written as CSV, without the index, to a file named
    'track_artist_id_df' in the current working directory. Nothing is returned.
    """
    id_columns = ['track_name', 'artist_name', 'track_id']
    unique_tracks = test_df[id_columns].drop_duplicates()
    unique_tracks.to_csv('track_artist_id_df', index=False)
def create_lsi_dict_pickle(track_vec_dict):
    """
    Pickle the track-id -> lyric-vector dictionary to 'lsi_vec_dict.p'.

    The file is written in the current working directory and replaces any
    existing file of that name.
    """
    # the context manager guarantees the file is flushed and closed
    with open('lsi_vec_dict.p', 'wb') as dict_file:
        pickle.dump(track_vec_dict, dict_file)
def create_lsi_obj_pickle(lsiObj):
    """
    Pickle the fitted LSI object to 'lsi_obj.p'.

    The file is written in the current working directory and replaces any
    existing file of that name.
    """
    # the context manager guarantees the file is flushed and closed
    with open('lsi_obj.p', 'wb') as lsi_file:
        pickle.dump(lsiObj, lsi_file)
def create_tfidf_obj_pickle(tfidf):
    """
    Pickle the fitted TFIDF vectorizer to 'tfidf_obj.p'.

    The file is written in the current working directory and replaces any
    existing file of that name.
    """
    # the context manager guarantees the file is flushed and closed
    with open('tfidf_obj.p', 'wb') as tfidf_file:
        pickle.dump(tfidf, tfidf_file)
def load_lsi_pickle():
    """
    Load and return the object pickled in 'lsi_obj.p'
    (the fitted LSI object) from the current working directory.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('lsi_obj.p', 'rb') as handle:
        fitted_lsi = pickle.load(handle)
    return fitted_lsi
def load_tfidf_pickle():
    """
    Load and return the object pickled in 'tfidf_obj.p'
    (the fitted TFIDF object) from the current working directory.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('tfidf_obj.p', 'rb') as handle:
        fitted_tfidf = pickle.load(handle)
    return fitted_tfidf
def load_lsi_dict_pickle():
    """
    Load and return the object pickled in 'lsi_vec_dict.p'
    (the file written by create_lsi_dict_pickle) from the current
    working directory.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('lsi_vec_dict.p', 'rb') as handle:
        vec_dict = pickle.load(handle)
    return vec_dict


In [6]:
# Call the functions above to show how the LSI vector generation works.

# Tunable settings for this run (previously inline magic numbers).
# Passed as min_df to the TF-IDF vectorizer: terms found in fewer tracks
# than this are left out of the vocabulary.
MIN_DOC_FREQ = 10
# Number of LSI concepts, i.e. the length of every lyric vector.
NUM_CONCEPTS = 100

# load in the lyrics dataframe
lyric_df = load_lyrics_data()
# remove non-English tracks
lyric_df = remove_non_english_tracks(lyric_df)
# create a tfidf vectorizer object
tfidf = create_lyric_tfidf(lyric_df, MIN_DOC_FREQ)
# create a latent semantic indexing object
lsiObj = lsi_lyrics(lyric_df, tfidf, NUM_CONCEPTS)
# convert the lyrics into concept vectors
lyric_vecs = create_lyric_vecs(lyric_df, lsiObj, tfidf)
# create a dictionary mapping track id to concept vector
track_vec_dict = create_lyric_dictionary(lyric_df, lyric_vecs)

