### Dataloader

In [30]:
import pandas as pd
from random import seed, sample
import pickle
import glob
import os

class Loader():

    def __init__(self, in_path, out_path):
        """
        Args:
            in_path (str): csv input path.
            out_path (str): Output directory path to store the pickles.
            chunksize (int, optional): Chunksize for DataFrame reader. Defaults to 10**6. 
        """

        self.__in_path = in_path
        self.__out_path = out_path
        self.__chunksize = 10**6

    def __produce_pickles(self):
        """produce pickles by reading csv by chunksize
        """
        with pd.read_csv(self.__in_path, chunksize = self.__chunksize) as reader:
            try:
                os.makedirs(self.__out_path)
            except FileExistsError:
                # directory already exists
                pass
            for i, chunk in enumerate(reader):
                out_file = self.__out_path + "/data_{}.pkl".format(i+1)
                with open(out_file, "wb") as f:
                    pickle.dump(chunk, f, pickle.HIGHEST_PROTOCOL)
    
    def load_pickle(self, pickle_id):
        """load a pickle file by id

        Args:
            pickle_id (int): pickle id.

        Raises:
            Exception: The path of the given id isn't a file

        Returns:
            obj: DataFrame
        """
        # produce the pickles if the directory not exists or
        # if the directory is empty 
        if (not os.path.exists(self.__out_path)) or \
              (len(os.listdir(self.__out_path)) == 0):
            self.__produce_pickles()
        
        # get the file path following the pickle_id
        # given in parameter
        file_path = self.__out_path + \
            "/data_" + str(pickle_id) + ".pkl"

        if os.path.isfile(file_path):
            df = pd.read_pickle(file_path)
        else:
            raise Exception("The pickle file data_{}.pkl doesn't exist".format(pickle_id))
        return df
        

    def random_pickles(self, n_pickles = 3, init = 42, verbose = True):
        """random reader over pickles files

        Args:
            n_pickles (int, optional): number of pickles to load. Defaults to 3.
            init (int, optional): Integer given to the random seed. Defaults to 42.
            verbose (bool, optional): Print the loaded files. Defaults to True

        Raises:
            Exception: Stop the process if n_pickles exceed pickle files number.

        Returns:
            obj: pd.Dataframe
        """

        # produce the pickles if the directory not exists or
        # if the directory is empty 
        if (not os.path.exists(self.__out_path)) or \
              (len(os.listdir(self.__out_path)) == 0):
            self.__produce_pickles()

        pickle_files = [name for name in
                        glob.glob(self.__out_path + "/data_*.pkl")]
        # draw p_files        
        seed(init)

        if n_pickles <= 6:
            random_p_files = sample(pickle_files, n_pickles)
        else:
            raise Exception("The parameter n_pickles (" +
                            "{}) exceed the numbers of pickle files ({})"\
                                .format(n_pickles, len(pickle_files)))
        # print the drawed files
        if verbose:
            print("Loaded pickles:")
            for p in random_p_files:
                print(p)

        # load random pickles file
        df_list = [pd.read_pickle(p) for p in random_p_files]

        # create the dataframe by concatenate the previous
        # dataframes list
        df = pd.concat(df_list, ignore_index = True)
        return df

In [51]:
loader = Loader(in_path = './input/song_lyrics.csv', out_path = './data')
df_orig = loader.load_pickle(1)
print('Data rows number: ', len(df_orig))

Data rows number:  1000000


### Sample 1% of the dataset (10 000 songs)

In [96]:
df = df_orig.sample(frac=0.01, random_state=1)
print('Data rows number: ', len(df))
df.head()

Data rows number:  10000


Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
276826,Etap,rap,Der Plot,2014,124,{},"[Part I - Conny:]\nGuten Morgen fremdes Bett, ...",383522,de,de,de
849425,Toothpick,pop,Biting Elbows,2012,8873,{},Some folks got the patience of the angels\nNot...,1166787,en,en,en
504499,6 Feet Under,pop,Ana Johnsson,2004,60,{},You just left me 6 feet under ground I'm burni...,803057,en,en,en
601054,Ir Al Baile,pop,Onda Vaga,2015,731,{},Cuando a los doce llevé la bandera en el hombr...,905848,es,es,es
980221,Prudenza mai,pop,Ivan Graziani,1989,35,{},"Prudenza mai, mai...\nMai neanche da bambino\n...",1304379,it,it,it


### Drop useless columns and keep only english songs

In [97]:
# consider only english songs
df = df[df.language == 'en']
df = df.dropna()
df = df.drop(['language_cld3', 'language_ft','language','features','views'], axis=1)
df

Unnamed: 0,title,tag,artist,year,lyrics,id
849425,Toothpick,pop,Biting Elbows,2012,Some folks got the patience of the angels\nNot...,1166787
504499,6 Feet Under,pop,Ana Johnsson,2004,You just left me 6 feet under ground I'm burni...,803057
409364,The Poetaster Act 4. Scene 2,misc,Ben Jonson,1601,"A Room in Lupus's House.\n\nEnter Lupus, HISTR...",674438
653769,Hes Gone,pop,Phil Lesh & Friends,2015,"Rat in a drain ditch, caught on a limb, you kn...",961823
846412,Ill Never Say,pop,Helen Ward,2015,"I'll never say ""never again"" again\nCause here...",1163619
...,...,...,...,...,...,...
54592,Fastest Rap Song,rap,Bone Thugs-N-Harmony,2011,Yeah. runaway statue. Here we gooooo\n\nWay 2 ...,58074
581240,The Sleepless Sailor,pop,Kate Rusby,1999,"I once was a sailor, a young man and brave\nDa...",885191
662322,Locked,pop,Scritti Politti,2006,And when the day is done\nIt's little games ar...,970831
639271,Valvoline,pop,Scout Niblett,2005,I am a driver\nI am a driver\nI am a driver\nI...,946730


### Dataset preprocessing

In [98]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


def preprocess_text(text):
    # remove \n
    text = text.replace('\n', ' ')
    # remove punctuation
    text = re.sub(r'[,\.!?]', '', text)
    #removing text in square braquet
    text = re.sub(r'\[.*?\]', ' ', text)
    #removing numbers
    text = re.sub(r'\w*\d\w*',' ', text)
    #removing bracket
    text = re.sub(r'[()]', ' ', text)
    # convert all words in lower case
    text = text.lower()
    # tokenize
    tokens = word_tokenize(text)
    # remove stop words
    stop_words = stopwords.words('english')
    new_stop_words = ['ooh','yeah','hey','whoa','woah', 'ohh', 'was', 'mmm', 'oooh','yah','yeh','mmm', 'hmm','deh','doh','jah','wa']
    stop_words.extend(new_stop_words)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    #remove tokens with lenght < 3
    final_tokens = [token for token in lemmatized_tokens if len(token) > 2 and not token.isnumeric()]

    return final_tokens

cleaned_text = df["lyrics"].apply(preprocess_text)
df["lyrics_proc"] = cleaned_text
df

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alessandro/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alessandro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alessandro/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,title,tag,artist,year,lyrics,id,lyrics_proc
849425,Toothpick,pop,Biting Elbows,2012,Some folks got the patience of the angels\nNot...,1166787,"[folk, got, patience, angel, heart, well, year..."
504499,6 Feet Under,pop,Ana Johnsson,2004,You just left me 6 feet under ground I'm burni...,803057,"[left, foot, ground, burning, sight, light, fo..."
409364,The Poetaster Act 4. Scene 2,misc,Ben Jonson,1601,"A Room in Lupus's House.\n\nEnter Lupus, HISTR...",674438,"[room, lupus, house, enter, lupus, histrio, li..."
653769,Hes Gone,pop,Phil Lesh & Friends,2015,"Rat in a drain ditch, caught on a limb, you kn...",961823,"[rat, drain, ditch, caught, limb, know, better..."
846412,Ill Never Say,pop,Helen Ward,2015,"I'll never say ""never again"" again\nCause here...",1163619,"['ll, never, say, never, cause, love, head, he..."
...,...,...,...,...,...,...,...
54592,Fastest Rap Song,rap,Bone Thugs-N-Harmony,2011,Yeah. runaway statue. Here we gooooo\n\nWay 2 ...,58074,"[runaway, statue, gooooo, way, strong-bizzy, b..."
581240,The Sleepless Sailor,pop,Kate Rusby,1999,"I once was a sailor, a young man and brave\nDa...",885191,"[sailor, young, man, brave, dum, day, dum, dee..."
662322,Locked,pop,Scritti Politti,2006,And when the day is done\nIt's little games ar...,970831,"[day, done, little, game, lost, maybe, 'll, cl..."
639271,Valvoline,pop,Scout Niblett,2005,I am a driver\nI am a driver\nI am a driver\nI...,946730,"[driver, driver, driver, driver, driver, drive..."


In [93]:
df.iloc[1]['lyrics_proc']

['say',
 'hello',
 'sweetest',
 'melody',
 'know',
 'sound',
 'angel',
 'singing',
 'soft',
 'low',
 'make',
 'music',
 'laughter',
 'echo',
 'breeze',
 'hush',
 'lark',
 'thrush',
 'tree',
 'calm',
 'wave',
 'rush',
 'sea',
 'make',
 'music',
 'time',
 'breathes',
 'sigh',
 'symphony',
 'begin',
 'every',
 'time',
 'say',
 'goodbye',
 'million',
 'violin',
 'start',
 'cry',
 'song',
 'sad',
 'meet',
 'kiss',
 'song',
 'sweet',
 'whether',
 'far',
 'away',
 'near',
 'make',
 'music',
 'hear',
 'every',
 'time',
 'say',
 'goodbye',
 'million',
 'violin',
 'start',
 'cry',
 'song',
 'sad',
 'meet',
 'kiss',
 'song',
 'sweet',
 'whether',
 'far',
 'away',
 'near',
 'make',
 'music',
 'hear']

In [94]:
df_sample.to_csv("song_lyrics_proc.csv" ,header='true', index=False)
