In [1]:
import pandas as pd
import numpy as np
import re
from nltk import wordpunct_tokenize, word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
# Load the per-genre lyric CSVs, one DataFrame per genre.
# The Country and Rap files have extra columns that are discarded right after
# loading ('Unnamed: 0' is presumably a saved index column - TODO confirm).

Rock = pd.read_csv('Rock.csv')

Country = pd.read_csv('Country.csv')
Country = Country.drop(['Unnamed: 0', 'Song'], axis='columns')

Rap = pd.read_csv('Rap.csv')
Rap = Rap.drop(['Unnamed: 0'], axis='columns')

Pop = pd.read_csv('Pop.csv')
RnB = pd.read_csv('RnB.csv')

In [3]:
# Combine genres into one comprehensive dataframe, reset index.
#
# pd.concat is used instead of chained DataFrame.append calls: append was
# deprecated in pandas 1.4 and removed in pandas 2.0, so the chained form
# raises AttributeError on current pandas. A single concat also avoids
# building four intermediate copies of the growing frame.

Combined_Lyrics = (
    pd.concat([Rock, Country, Rap, Pop, RnB], ignore_index=True)
    .drop_duplicates()        # remove rows that are identical in every column
    .dropna()                 # remove rows with a missing value in any column
    .reset_index(drop=True)   # renumber rows 0..n-1 after the removals
)

In [4]:
# Resources shared by the cleaning step:
#   wn         - WordNet lemmatizer instance
#   sw         - NLTK's English stop-word list
#   sw_special - extra words filtered out of the lyrics (interjections,
#                number words, common filler, and a leftover scrape tag)

wn = WordNetLemmatizer()

sw = stopwords.words('english')

sw_special = [
    "oh", "yeah", "got", "go", "get",
    "one", "two", "three", "four", "five",
    "six", "seven", "eight", "nine", "ten",
    "let", "way", "cause", "like", "know", "back", "uh", "ooh",
    "urlcopyembedcopy",
]

In [7]:
# Function to tokenize, lemmatize and clean lyrics - the kept tokens are
# returned as one space-joined string. Also removes the scraped
# "EmbedShare URLCopyEmbedCopy" footer.

# Footer in its usual form, preceded by any number of digits. The previous
# pattern required exactly two digits, so a one-digit count was never removed
# and a three-digit count left a stray digit glued to the last word.
_EMBED_FOOTER_RE = re.compile(r"\d*EmbedShare URLCopyEmbedCopy")

# Fallback for any other spelling/case of the footer: drop everything from
# "<digits>embedshare" to the end of the text. The previous pattern escaped the
# '*' (matching a literal asterisk) and was lower-case while running before
# .lower(), so it could never match the real tag.
_EMBED_TAIL_RE = re.compile(r"\d*embedshare.*$", re.IGNORECASE)


def clean_data(lyrics):
    """Normalise one lyrics string into a space-separated string of lemmas.

    Steps, in order:
      1. Replace the special spaces U+2005 / U+205F and newlines with ' '.
      2. Strip the "EmbedShare URLCopyEmbedCopy" footer (with leading digits).
      3. Lower-case the text.
      4. Tokenize with ``wordpunct_tokenize``.
      5. Keep only alphanumeric tokens that are in neither ``sw`` nor
         ``sw_special``.
      6. Lemmatize each remaining token with ``wn``.

    Parameters
    ----------
    lyrics : str
        Raw lyrics text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing remains).
    """
    lyrics = lyrics.replace('\u2005', ' ').replace('\n', ' ').replace('\u205f', ' ')
    lyrics = _EMBED_FOOTER_RE.sub("", lyrics)
    lyrics = _EMBED_TAIL_RE.sub("", lyrics)
    lyrics = lyrics.lower()  # coerce data to lower case

    # One set lookup per token instead of two linear scans over the lists.
    stop_words = set(sw).union(sw_special)

    tokens = [
        tok
        for tok in wordpunct_tokenize(lyrics)   # tokenize individual words
        if tok.isalnum()                        # drop punctuation tokens
        and tok not in stop_words               # drop standard + lyric-specific stop words
    ]

    # Lemmatizing - reduce each remaining token to its base word.
    return " ".join(wn.lemmatize(tok) for tok in tokens)

In [None]:
# Run clean_data over every entry of the 'Lyrics' column and store the result
# back in that same column.
# NOTE: the column is overwritten in place, so re-running this cell feeds
# already-cleaned text through clean_data again.

Combined_Lyrics['Lyrics'] = Combined_Lyrics['Lyrics'].apply(clean_data)

In [7]:
# Write the cleaned dataframe to 'Combined_Lyrics.csv', leaving out the row index.

Combined_Lyrics.to_csv(path_or_buf='Combined_Lyrics.csv', index=False)