In [1]:
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd 
import re
import datetime

In [26]:
# Unicode, Regex, json for text digestion
import unicodedata
import re
import json

# nltk: natural language toolkit -> tokenization, stopwords
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# Pandas dataframe manipulation
import pandas as pd
# Time formatting
from time import strftime

# Suppress all warnings so they do not clutter the notebook output.
import warnings
warnings.filterwarnings('ignore')

import numpy as np

################### BASIC CLEAN ###################

def basic_clean(string):
    '''
    Normalize a piece of text.

    The text is NFKD-normalized, any character that cannot be represented in
    ASCII is dropped, every character that is neither a word character nor
    whitespace is removed, and the result is lowercased.
    '''
    # Decompose accented characters, then discard whatever is not plain ASCII.
    ascii_bytes = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')

    # Strip punctuation/symbols and fold to lowercase.
    return re.sub(r'[^\w\s]', '', ascii_text).lower()

################### TOKENIZE ###################

def tokenize(string):
    '''
    Tokenize a string with NLTK's ToktokTokenizer.

    The tokenizer is asked for a string (return_str = True), so the result is
    a single string rather than a list of tokens.
    '''
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str = True)

################### FUNCTIONS ###################

def remove_stopwords(string, extra_words=[], exclude_words=[]):
    '''
    Remove stopwords from a whitespace-separated string.

    The stopword set starts from NLTK's English list; anything in
    exclude_words is taken out of that set (so it stays in the text) and
    anything in extra_words is added to it (so it is removed from the text).
    Returns the remaining words joined by single spaces.
    '''
    # Build the final set of words to drop in one expression.
    blocked = (set(stopwords.words('english')) - set(exclude_words)) | set(extra_words)

    # Keep only the words that are not blocked, preserving their order.
    kept = (token for token in string.split() if token not in blocked)
    return ' '.join(kept)

################### STEM ###################

def stem(string):
    '''
    Stem every whitespace-separated word of a string with the Porter stemmer
    and return the stems joined by single spaces.
    '''
    stemmer = nltk.porter.PorterStemmer()

    # Stem word by word, then glue the stems back into one string.
    return ' '.join(map(stemmer.stem, string.split()))

################### LEMMATIZE ###################

def lemmatize(string):
    '''
    Lemmatize every whitespace-separated word of a string with the WordNet
    lemmatizer and return the lemmas joined by single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # Lemmatize word by word, then glue the lemmas back into one string.
    return ' '.join(lemmatizer.lemmatize(token) for token in string.split())

################### CLEAN DATAFRAME ###################

def clean_df(df, extra_words = [], exclude_words = []):
    '''
    This function takes in a dataframe with a 'lyrics' column plus optional
    extra_words and exclude_words lists, which are forwarded to
    remove_stopwords, and returns a new dataframe with 'clean', 'stemmed',
    'lemmatized', 'character_count' and 'word_count' columns added.

    Rows containing a null in any column are dropped. The dataframe passed in
    is not modified. If df is None the data is read from 'data.json', which is
    what this function previously did regardless of its argument.
    '''
    # only pull the data from disk when the caller did not supply a dataframe
    if df is None:
        df = pd.read_json('data.json')
    # drops nulls; dropna() returns a new frame, so the caller's df is untouched
    df = df.dropna()
    # add clean column that applies basic clean function, then removes
    # stopwords honouring the caller's extra_words / exclude_words
    df['clean'] = df.lyrics.apply(basic_clean).apply(
        lambda text: remove_stopwords(text, extra_words, exclude_words))
    # tokenized lyrics (a Series) feed both the stemmed and lemmatized columns
    tokenized = df.clean.apply(tokenize)
    # stemmed column created from stem function
    df['stemmed'] = tokenized.apply(stem)
    # lemmatized column created from lemmatize function
    df['lemmatized'] = tokenized.apply(lemmatize)
    # create columns with character and word counts of the lemmatized text
    df = df.assign(character_count = df.lemmatized.str.len(),
                   word_count = df.lemmatized.str.split().apply(len))
    return df

In [3]:
# Load the songs from 'songs.csv' into df and preview the first rows.
df = pd.read_csv('songs.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,artist,date,status,lyrics
0,0,"""B"" Girls",Young And Restless,1990-05-05,,
1,1,"""Cherry Cherry"" from Hot August Night",Neil Diamond,1973-03-17,an error ocurred,
2,2,"""Having A Party"" Medley",The Ovations (Featuring Louis Williams),1973-10-13,an error ocurred,
3,3,"""Joy"" Pt. I",Isaac Hayes,1973-12-22,lyrics acquired,"12/12 Songs LyricsAAlex Turner - ""Stuck on the..."
4,4,"""Roots"" Medley",Quincy Jones,1977-03-05,lyrics acquired,Ulysses (Chap. 15 - Circe) LyricsThe Mabbot st...


### Preparing the data:

In [4]:
# Drop every row that has a null in any column (mutates df in place).
df.dropna(inplace = True)


In [5]:
# Add a 'clean' column: normalize the lyrics with basic_clean, then strip
# stopwords with remove_stopwords (default extra/exclude word lists).
df['clean'] = df.lyrics.apply(basic_clean).apply(remove_stopwords)


In [6]:
# Tokenize the cleaned lyrics. Despite its name, tokenized_df is a Series
# (the result of applying tokenize to the 'clean' column), not a DataFrame.
tokenized_df = df.clean.apply(tokenize)


In [7]:
# Add a 'stemmed' column by applying stem to the tokenized lyrics.
df['stemmed'] = tokenized_df.apply(stem)


In [8]:
# lemmatized column created from lemmatize function
# (later cells read df.lemmatized, so this has to run on a fresh kernel)
df['lemmatized'] = tokenized_df.apply(lemmatize)


In [10]:
# Add character and word counts computed from the 'lemmatized' column.
# df.lemmatized must already exist, otherwise this raises AttributeError.
df = df.assign(character_count= df.lemmatized.str.len(), 
         word_count=df.lemmatized.str.split().apply(len))

In [11]:
# Preview the first rows of df after the preparation steps.
df.head()

Unnamed: 0.1,Unnamed: 0,title,artist,date,status,lyrics,clean,stemmed,lemmatized,character_count,word_count
3,3,"""Joy"" Pt. I",Isaac Hayes,1973-12-22,lyrics acquired,"12/12 Songs LyricsAAlex Turner - ""Stuck on the...",1212 songs lyricsaalex turner stuck puzzle alv...,1212 song lyricsaalex turner stuck puzzl alvva...,1212 song lyricsaalex turner stuck puzzle alvv...,4406,667
4,4,"""Roots"" Medley",Quincy Jones,1977-03-05,lyrics acquired,Ulysses (Chap. 15 - Circe) LyricsThe Mabbot st...,ulysses chap 15 circe lyricsthe mabbot street ...,ulyss chap 15 circ lyricsth mabbot street entr...,ulysses chap 15 circe lyricsthe mabbot street ...,155071,22910
6,6,#1,Nelly,2001-10-20,lyrics acquired,#1 LyricsUh uh uh I just gotta bring it to the...,1 lyricsuh uh uh gotta bring attention dirty t...,1 lyricsuh uh uh gotta bring attent dirti that...,1 lyricsuh uh uh gotta bring attention dirty t...,2174,396
7,7,#1 Dee Jay,Goody Goody,1978-11-18,lyrics acquired,Lbo3d l’akhar LyricsVerse 1: (Omar Cravate) B...,lbo3d lakhar lyricsverse 1 omar cravate badin ...,lbo3d lakhar lyricsvers 1 omar cravat badin bf...,lbo3d lakhar lyricsverse 1 omar cravate badin ...,3140,555
8,8,#9 Dream,John Lennon,1974-12-21,lyrics acquired,#9 Dream Lyrics[Verse 1] So long ago Was it in...,9 dream lyricsverse 1 long ago dream dream kno...,9 dream lyricsvers 1 long ago dream dream know...,9 dream lyricsverse 1 long ago dream dream kno...,942,152


In [25]:
# Keep only rows whose artist is exactly 'Michael Jackson', then call
# DataFrame.value_counts() on the result.
# NOTE(review): DataFrame.value_counts() counts identical full rows, not songs
# per artist; confirm this is what was intended.
df[df['artist'] == 'Michael Jackson'].value_counts()

Unnamed: 0  title                           artist           date        status           lyrics                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        