In [22]:
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd 
import re
import datetime

In [2]:
# Unicode, Regex, json for text digestion
import unicodedata
import re
import json

# nltk: natural language toolkit -> tokenization, stopwords
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# Pandas dataframe manipulation
import pandas as pd
# Time formatting
from time import strftime

# Quieeet!!! Y'all can't stop me now...
import warnings
warnings.filterwarnings('ignore')

import numpy as np

################### BASIC CLEAN ###################

def basic_clean(string):
    '''
    This function takes in a string and
    returns the string normalized.
    '''
    string = unicodedata.normalize('NFKD', string)\
            .encode('ascii', 'ignore')\
            .decode('utf-8', 'ignore')
    string = re.sub(r'[^\w\s]', '', string).lower()
    return string

################### TOKENIZE ###################

def tokenize(string):
    '''
    This function takes in a string and
    returns a tokenized string.
    '''
    # Create tokenizer.
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    # Use tokenizer
    string = tokenizer.tokenize(string, return_str = True)    
    return string

################### FUNCTIONS ###################

def remove_stopwords(string, extra_words=[], exclude_words=[]):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)
    
    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))

    # Split words in string.
    words = string.split()
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)    
    return string_without_stopwords

################### STEM ###################

def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.a
    '''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    
    # Use the stemmer to stem each word in the list of words we created by using split.
    stems = [ps.stem(word) for word in string.split()]
    
    # Join our lists of words into a string again and assign to a variable.
    string = ' '.join(stems)    
    return string

################### LEMMATIZE ###################

def lemmatize(string):
    '''
    This function takes in string for and
    returns a string with words lemmatized.
    '''
    # Create the lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Use the lemmatizer on each word in the list of words we created by using split.
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    # Join our list of words into a string again and assign to a variable.
    string = ' '.join(lemmas)    
    return string

################### CLEAN DATAFRAME ###################

def clean_df(df, extra_words = [], exclude_words = []):
#     # pull the data
#     df = pd.read_json('data.json')
    # drops nulls
    df.dropna(inplace = True)
    # add clean column that applies basic clean function
    df['clean'] = df.lyrics.apply(basic_clean).apply(remove_stopwords)
    # tokenize df applied after running tokenize function
    tokenized_df = df.clean.apply(tokenize)
    # stemmed column created from stem function
    df['stemmed'] = tokenized_df.apply(stem)
    # lemmatized column created from lemmatize function
#     df['lemmatized'] = tokenized_df.apply(lemmatize)
#     # create columns with character and word counts
    df = df.assign(character_count= df.stemmed.str.len(), 
             word_count=df.stemmed.str.split().apply(len))
    return df

In [3]:
df = pd.read_csv('songs.csv')
df.shape

(29681, 6)

### Preparing the data:

In [4]:
df = clean_df(df, extra_words = [], exclude_words = [])

In [27]:
df.to_datetime(df['date'])
df.head()

AttributeError: 'DataFrame' object has no attribute 'to_datetime'

In [17]:
(df[df['artist']=='Bobby Darin'])

Unnamed: 0.1,Unnamed: 0,title,artist,date,status,lyrics,clean,stemmed,character_count,word_count
300,300,18 Yellow Roses,Bobby Darin,1963-05-11,lyrics acquired,18 Yellow Roses LyricsEighteen yellow roses ca...,18 yellow roses lyricseighteen yellow roses ca...,18 yellow rose lyricseighteen yellow rose came...,487,85
1741,1741,Artificial Flowers,Bobby Darin,1960-09-26,lyrics acquired,Artificial Flowers LyricsAlone in the world Wa...,artificial flowers lyricsalone world poor litt...,artifici flower lyricsalon world poor littl an...,941,151
1931,1931,Baby Face,Bobby Darin,1962-09-29,lyrics acquired,Baby Face LyricsBaby face You've got the cutes...,baby face lyricsbaby face youve got cutest lit...,babi face lyricsbabi face youv got cutest litt...,465,86
2335,2335,Be Mad Little Girl,Bobby Darin,1963-11-23,lyrics acquired,"Be Mad Little Girl LyricsAh, ... be mad little...",mad little girl lyricsah mad little girl mad l...,mad littl girl lyricsah mad littl girl mad lit...,468,92
2663,2663,Beyond The Sea,Bobby Darin,1960-01-18,lyrics acquired,Beyond the Sea LyricsSomewhere beyond the sea ...,beyond sea lyricssomewhere beyond sea somewher...,beyond sea lyricssomewher beyond sea somewher ...,486,89
4131,4131,Child Of God,Bobby Darin,1960-12-26,lyrics acquired,Child of God LyricsIf anybody asked you who I ...,child god lyricsif anybody asked anybody asked...,child god lyricsif anybodi ask anybodi ask tel...,610,111
4179,4179,Christmas Auld Lang Syne,Bobby Darin,1960-12-19,lyrics acquired,Christmas Auld Lang Syne LyricsWhen mistletoe ...,christmas auld lang syne lyricswhen mistletoe ...,christma auld lang syne lyricswhen mistleto ti...,464,78
4244,4244,Clementine,Bobby Darin,1960-03-21,lyrics acquired,Clementine LyricsIn a cavern down by a canyon ...,clementine lyricsin cavern canyon excavatin mi...,clementin lyricsin cavern canyon excavatin min...,727,124
5142,5142,Darling Be Home Soon,Bobby Darin,1967-07-29,lyrics acquired,Darling Be Home Soon LyricsCome and talk of al...,darling home soon lyricscome talk things today...,darl home soon lyricscom talk thing today hear...,622,113
6259,6259,Dream Lover,Bobby Darin,1959-04-20,lyrics acquired,Dream Lover Lyrics[Verse 1] Every night I hope...,dream lover lyricsverse 1 every night hope pra...,dream lover lyricsvers 1 everi night hope pray...,938,169
