In [33]:
import numpy as np
import pandas as pd

#for top-5-similar songs recommender
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity


#for text preprocessing:
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Make sure the NLTK resources requested by this notebook are available locally.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# NOTE: this rebinds the name `stopwords` from the imported corpus reader to the
# English stop-word collection it returns. `remove_stopwords` below reads this
# module-level name, so the import above must be re-run before this line is
# executed a second time.
stopwords = stopwords.words('english')

import warnings

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [34]:
# Load the labeled lyrics dataset (path is relative to the notebook's working directory).
# NOTE(review): the preview below shows three leftover index columns
# ('Unnamed: 0.2', 'Unnamed: 0', 'Unnamed: 0.1'), presumably written by earlier
# to_csv calls that kept the index; none of them is selected for the export later on.
df = pd.read_csv('../data/labeled_lyrics_w_genres.csv')

In [35]:
# Quick health check of the raw frame: null cells, duplicated rows, shape and genre balance.
null_count = df.isna().sum().sum()
duplicate_count = df.duplicated().sum()

print("Number of nulls: ", null_count)
print("Number of duplicates: ", duplicate_count)
print("df shape: ", df.shape)
print("\n")
print("df value counts: \n")
print(df['genre'].value_counts())
df.head()

Number of nulls:  0
Number of duplicates:  0
df shape:  (145250, 7)


df value counts: 

Pop          57357
No_genre     42789
Rock         26756
Country       7440
Rap           5959
R&B           4773
Non-Music      176
Name: genre, dtype: int64


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,artist,seq,song,label,genre
0,0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626,R&B
1,1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63,Pop
2,2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24,R&B
3,3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536,R&B
4,4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371,R&B


In [36]:
# Remove the rows whose genre is one of the two placeholder categories.
# `df_dropped` holds the index labels of the rows being removed; `df` is modified in place.
excluded_genres = ['No_genre', 'Non-Music']
df_dropped = df.index[df['genre'].isin(excluded_genres)]
df.drop(index=df_dropped, inplace=True)

In [37]:
# Confirm the filter: new shape, remaining genre counts, and a look at the first rows.
print(df.shape)
print(df['genre'].value_counts())
df.head(15)

(102285, 7)
Pop        57357
Rock       26756
Country     7440
Rap         5959
R&B         4773
Name: genre, dtype: int64


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,artist,seq,song,label,genre
0,0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626,R&B
1,1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63,Pop
2,2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24,R&B
3,3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536,R&B
4,4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371,R&B
5,5,5,Elijah Blake,I just want to ready your mind\r\n'Cause I'll ...,Uno,0.321,R&B
7,7,7,Elis,Dieses ist lange her.\r\nDa ich deine schmalen...,Abendlied,0.333,Pop
8,8,8,Elis,A child is born\r\nOut of the womb of a mother...,Child,0.506,Pop
9,9,9,Elis,Out of the darkness you came \r\nYou looked so...,Come to Me,0.179,Pop
10,10,10,Elis,Each night I lie in my bed \r\nAnd I think abo...,Do You Believe,0.209,Pop


In [38]:
# 1. Lowercasing helper.
def make_lowercase(test_string):
    """Return `test_string` with all cased characters converted to lowercase."""
    lowered = test_string.lower()
    return lowered

# 2. Punctuation-stripping helper.
def remove_punc(test_string):
    """Delete every character that is neither a word character nor whitespace.

    The underscore is a word character for the regex, so it is kept; line
    breaks and other whitespace are kept as well.
    """
    return re.sub(r'[^\w\s]', '', test_string)

# 3. function that removes all stopwords.
def remove_stopwords(test_string):
    """Return `test_string` without the tokens found in the module-level `stopwords`.

    The text is split with `word_tokenize`; every token that is not an exact
    (case-sensitive) match for a stop word is kept, and the kept tokens are
    joined back together with single spaces.
    """
    # Build the lookup once per call: set membership is constant time, whereas
    # testing each token against the stop-word list scans the whole list.
    stop_set = set(stopwords)

    valid_words = [word for word in word_tokenize(test_string) if word not in stop_set]

    return ' '.join(valid_words)

# 4. Stemming helper.
def stem_words(a_string):
    """Return `a_string` with each token replaced by its Porter stem.

    The text is split with `word_tokenize` and the stemmed tokens are joined
    back together with single spaces.
    """
    stemmer = PorterStemmer()
    tokens = word_tokenize(a_string)
    return ' '.join(stemmer.stem(token) for token in tokens)

In [39]:
# Pipeline function

def text_processing_pipeline(a_string):
    """Run the cleaning steps on one lyric string and return the cleaned text.

    Steps, in order: lowercase, strip punctuation, remove stop words.

    NOTE(review): punctuation is stripped before stop-word removal, so
    contractions lose their apostrophe first; the sample output in this
    notebook still contains tokens such as "dont". Confirm this is intended.
    """
    # stem_words is deliberately left out for now: stemming made the lyrics gibberish.
    steps = (make_lowercase, remove_punc, remove_stopwords)
    for step in steps:
        a_string = step(a_string)
    return a_string

In [40]:
# Clean every lyric in `seq` and store the result in a new `clean_lyrics` column.

df['clean_lyrics'] = df['seq'].map(text_processing_pipeline)

In [41]:
# Re-run the health check now that `clean_lyrics` has been added.
null_count = df.isna().sum().sum()
duplicate_count = df.duplicated().sum()

print("Number of nulls: ", null_count)
print("Number of duplicates: ", duplicate_count)
print("df shape: ", df.shape)
print("\n")
print("df value counts: \n")
print(df['genre'].value_counts())
df.head()

Number of nulls:  0
Number of duplicates:  0
df shape:  (102285, 8)


df value counts: 

Pop        57357
Rock       26756
Country     7440
Rap         5959
R&B         4773
Name: genre, dtype: int64


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,artist,seq,song,label,genre,clean_lyrics
0,0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626,R&B,aint ever trapped bando oh lord dont get wrong...
1,1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63,Pop,drinks go smoke goes feel got let go cares get...
2,2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24,R&B,dont live planet earth found love venus thats ...
3,3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536,R&B,trippin grigio mobbin lights low trippin grigi...
4,4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371,R&B,see midnight panther gallant brave found found...


In [56]:
# Build a "<song> by <artist>" label column for the results dashboard.
# Column-wise string concatenation replaces the earlier row-by-row
# `agg(' by '.join, axis=1)`, which called Python's join once per row.
# Behaviour note: a missing song or artist now yields NaN in the new column
# instead of raising; the null checks further down still surface such rows.

df['song_by_artist'] = df['song'] + ' by ' + df['artist']

In [57]:
# Simplified dataframe: keep only the columns needed downstream.

keep_columns = ['song_by_artist', 'artist', 'song', 'label', 'genre', 'clean_lyrics']
df_new = df[keep_columns]

In [58]:
# Preview the first rows of the simplified frame.
df_new.head()

Unnamed: 0,song_by_artist,artist,song,label,genre,clean_lyrics
0,Everyday by Elijah Blake,Elijah Blake,Everyday,0.626,R&B,aint ever trapped bando oh lord dont get wrong...
1,Live Till We Die by Elijah Blake,Elijah Blake,Live Till We Die,0.63,Pop,drinks go smoke goes feel got let go cares get...
2,The Otherside by Elijah Blake,Elijah Blake,The Otherside,0.24,R&B,dont live planet earth found love venus thats ...
3,Pinot by Elijah Blake,Elijah Blake,Pinot,0.536,R&B,trippin grigio mobbin lights low trippin grigi...
4,Shadows & Diamonds by Elijah Blake,Elijah Blake,Shadows & Diamonds,0.371,R&B,see midnight panther gallant brave found found...


In [59]:
# Genre balance of the simplified frame.
df_new['genre'].value_counts()

Pop        57357
Rock       26756
Country     7440
Rap         5959
R&B         4773
Name: genre, dtype: int64

In [60]:
# Null count per column of the simplified frame.
df_new.isna().sum()

song_by_artist    0
artist            0
song              0
label             0
genre             0
clean_lyrics      0
dtype: int64

In [61]:
# Display the rows of `df` that contain at least one null; nothing is shown when there are none.
missing_values = df.isna().to_numpy().any()
if missing_values:
    rows_with_nulls = df[df.isna().any(axis=1)]
    display(rows_with_nulls)

In [62]:
# Final look at the simplified frame before it is exported.
df_new.head()

Unnamed: 0,song_by_artist,artist,song,label,genre,clean_lyrics
0,Everyday by Elijah Blake,Elijah Blake,Everyday,0.626,R&B,aint ever trapped bando oh lord dont get wrong...
1,Live Till We Die by Elijah Blake,Elijah Blake,Live Till We Die,0.63,Pop,drinks go smoke goes feel got let go cares get...
2,The Otherside by Elijah Blake,Elijah Blake,The Otherside,0.24,R&B,dont live planet earth found love venus thats ...
3,Pinot by Elijah Blake,Elijah Blake,Pinot,0.536,R&B,trippin grigio mobbin lights low trippin grigi...
4,Shadows & Diamonds by Elijah Blake,Elijah Blake,Shadows & Diamonds,0.371,R&B,see midnight panther gallant brave found found...


In [64]:
# Export the simplified frame to the notebook's working directory.
# NOTE(review): to_csv writes the row index by default, which adds an unnamed
# leading column to the file (the input CSV already carries several such
# 'Unnamed: 0*' columns). Consider index=False if downstream readers do not
# rely on that column — confirm before changing.

df_new.to_csv('preprocessed_dataset.csv')