In [12]:
#Imports
import re
import pandas as pd
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from itertools import islice
#nltk.download()

In [13]:
#Loading the data
df = pd.read_excel('next_stop_is_vietnam.xlsx')

#Seeing it
df.head()

Unnamed: 0,CD,Song,Artist,Year,Lyrics,Notes
0,1,Where Have All The Flowers Gone,The Kingston Trio,1962,"Where have all the flowers gone, long time pas...",
1,1,Soldier's Plea,Marvin Gaye,1962,"While I'm away, darling\nHow often do you thin...",
2,1,Letter to a Buddie,Joe Medwick,1963,[speech],
3,1,Soldiers Who Want To Be Heroes,Rod McKuen,1971,Soldiers who wanna be heroes\nNumber practical...,
4,1,The Cruel War,"Peter, Paul & Mary",1962,"The Cruel War is raging, Johnny has to fight\n...",


In [14]:
#Creating a copy with only the lyrics
df2 = df.copy()
df2.Lyrics = df2.Lyrics.replace('-', '')
df2 = df2.dropna(subset=['Lyrics'])

df2.shape

(127, 6)

## Step 1: Working on the entire corpus
We will count frequencies on the entire corpus in order to have an idea on what the most common words are.

In [58]:
#Punctuation and numbers to be removed
punctuation = re.compile(r'[-.?!,":;()|0-9]')

#Creating list of all lower case words
word_lists = [re.split('\s+', x) for x in df2.Lyrics.str.lower()]

full_word_list = [item for sublist in word_lists for item in sublist]

#Removing Punctuation
words = (punctuation.sub("", word).strip() for word in full_word_list)

#Removing stopwords
words = (word for word in words if word not in stopwords.words('english'))

In [59]:
# create dictionary of word:frequency pairs
frequencies = Counter(words)


#bigrams = (Counter(zip(re.findall("\w+", x), islice(re.findall("\w+", x), 1, None))) for x in df2.Lyrics)

In [60]:
display(pd.DataFrame(frequencies.most_common(20)))

Unnamed: 0,0,1
0,vietnam,145
1,war,136
2,home,117
3,i'm,108
4,one,107
5,oh,94
6,,91
7,know,88
8,bring,88
9,got,84


## Step 2: Fine graining to song level
Now that we looked at the entire document, we must work at song level to detect specificities

In [9]:
#Counting the bigrams
df2['Bigram_freq'] = [Counter(zip(re.findall("\w+", x), islice(re.findall("\w+", x), 1, None))) for x in df2.Lyrics]

#Making a list of words
df2['Word_list'] = [item for item in [re.split('\s+', x) for x in df2.Lyrics.str.lower()]]


#Removing punctuation
df2['Word_list'] = [[punctuation.sub("", word).strip() for word in x] for x in df2.Word_list]

#Removing stemmed words
df2['Word_list'] = [w for w in df2.Word_list if w not in stopwords.words('english')]

#Counting frequencies
df2['Frequencies'] = [Counter(x) for x in df2.Word_list]

In [10]:
df2.head()

Unnamed: 0,CD,Song,Artist,Year,Lyrics,Notes,Bigram_freq,Word_list,Frequencies
0,1,Where Have All The Flowers Gone,The Kingston Trio,1962,"Where have all the flowers gone, long time pas...",,"{('Where', 'have'): 13, ('have', 'all'): 15, (...","[where, have, all, the, flowers, gone, long, t...","{'where': 15, 'have': 15, 'all': 15, 'the': 15..."
1,1,Soldier's Plea,Marvin Gaye,1962,"While I'm away, darling\nHow often do you thin...",,"{('While', 'I'): 1, ('I', 'm'): 3, ('m', 'away...","[while, i'm, away, darling, how, often, do, yo...","{'while': 1, 'i'm': 3, 'away': 1, 'darling': 2..."
2,1,Letter to a Buddie,Joe Medwick,1963,[speech],,{},[[speech]],{'[speech]': 1}
3,1,Soldiers Who Want To Be Heroes,Rod McKuen,1971,Soldiers who wanna be heroes\nNumber practical...,,"{('Soldiers', 'who'): 11, ('who', 'wanna'): 11...","[soldiers, who, wanna, be, heroes, number, pra...","{'soldiers': 11, 'who': 22, 'wanna': 22, 'be':..."
4,1,The Cruel War,"Peter, Paul & Mary",1962,"The Cruel War is raging, Johnny has to fight\n...",,"{('The', 'Cruel'): 1, ('Cruel', 'War'): 1, ('W...","[the, cruel, war, is, raging, johnny, has, to,...","{'the': 2, 'cruel': 1, 'war': 1, 'is': 3, 'rag..."


## Step 3: Grouping by CD
Because each CD of the corpus has a specific theme, we will group the lyrics per CD in order to conduct a more meaningfull analysis.