In [110]:
#Imports
import re
import pandas as pd
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from itertools import islice
#nltk.download()

In [111]:
#Loading the data
df = pd.read_excel('next_stop_is_vietnam.xlsx')

#Seeing it
df.head()

Unnamed: 0,CD,CD_Title,Song,Artist,Year,Lyrics,Notes
0,1,Mr. Where Is Viet Nam,,The Kingston Trio,1962,"Where have all the flowers gone, long time pas...",
1,1,Mr. Where Is Viet Nam,Soldier's Plea,Marvin Gaye,1962,"While I'm away, darling\nHow often do you thin...",
2,1,Mr. Where Is Viet Nam,Letter to a Buddie,Joe Medwick,1963,[speech],
3,1,Mr. Where Is Viet Nam,Soldiers Who Want To Be Heroes,Rod McKuen,1971,Soldiers who wanna be heroes\nNumber practical...,
4,1,Mr. Where Is Viet Nam,The Cruel War,"Peter, Paul & Mary",1962,"The Cruel War is raging, Johnny has to fight\n...",


In [112]:
#Creating a copy with only the lyrics
df2 = df.copy()
df2.Lyrics = df2.Lyrics.replace('-', '')
df2 = df2.dropna(subset=['Lyrics'])

df2.shape

(127, 7)

## Step 1: Working on the entire corpus
We will count frequencies on the entire corpus in order to have an idea on what the most common words are.

In [113]:
#Punctuation and numbers to be removed
punctuation = re.compile(r'[-.?!,":;()|0-9]')

#Creating list of all lower case words
word_lists = [re.split('\s+', x) for x in df2.Lyrics.str.lower()]

full_word_list = [item for sublist in word_lists for item in sublist]

#Removing Punctuation
words = (punctuation.sub("", word).strip() for word in full_word_list)

#Removing stopwords
words = (word for word in words if word not in stopwords.words('english'))

In [114]:
# create dictionary of word:frequency pairs
frequencies = Counter(words)


#bigrams = (Counter(zip(re.findall("\w+", x), islice(re.findall("\w+", x), 1, None))) for x in df2.Lyrics)

In [147]:
display(pd.DataFrame(frequencies.most_common(20)))

Unnamed: 0,0,1
0,vietnam,145
1,war,136
2,home,117
3,i'm,108
4,one,107
5,oh,94
6,,91
7,know,88
8,bring,88
9,got,84


## Step 2: Fine graining to song level
Now that we looked at the entire document, we must work at song level to detect specificities

In [116]:
#Counting the bigrams
df2['Bigram_freq'] = [Counter(zip(re.findall("\w+", x), islice(re.findall("\w+", x), 1, None))) for x in df2.Lyrics]

#Making a list of words
df2['Word_list'] = [item for item in [re.split('\s+', x) for x in df2.Lyrics.str.lower()]]


#Removing punctuation
df2['Word_list'] = [[punctuation.sub("", word).strip() for word in x] for x in df2.Word_list]

#Removing stopwords 
df2['Word_list'] = [w for w in df2.Word_list if w not in stopwords.words('english')]

#Counting frequencies
df2['Frequencies'] = [Counter(x) for x in df2.Word_list]

In [146]:
df2

Unnamed: 0,CD,CD_Title,Song,Artist,Year,Lyrics,Notes,Bigram_freq,Word_list,Frequencies
0,1,Mr. Where Is Viet Nam,,The Kingston Trio,1962,"Where have all the flowers gone, long time pas...",,"{('Where', 'have'): 13, ('have', 'all'): 15, (...","[where, have, all, the, flowers, gone, long, t...","{'where': 15, 'have': 15, 'all': 15, 'the': 15..."
1,1,Mr. Where Is Viet Nam,Soldier's Plea,Marvin Gaye,1962,"While I'm away, darling\nHow often do you thin...",,"{('While', 'I'): 1, ('I', 'm'): 3, ('m', 'away...","[while, i'm, away, darling, how, often, do, yo...","{'while': 1, 'i'm': 3, 'away': 1, 'darling': 2..."
2,1,Mr. Where Is Viet Nam,Letter to a Buddie,Joe Medwick,1963,[speech],,{},[[speech]],{'[speech]': 1}
3,1,Mr. Where Is Viet Nam,Soldiers Who Want To Be Heroes,Rod McKuen,1971,Soldiers who wanna be heroes\nNumber practical...,,"{('Soldiers', 'who'): 11, ('who', 'wanna'): 11...","[soldiers, who, wanna, be, heroes, number, pra...","{'soldiers': 11, 'who': 22, 'wanna': 22, 'be':..."
4,1,Mr. Where Is Viet Nam,The Cruel War,"Peter, Paul & Mary",1962,"The Cruel War is raging, Johnny has to fight\n...",,"{('The', 'Cruel'): 1, ('Cruel', 'War'): 1, ('W...","[the, cruel, war, is, raging, johnny, has, to,...","{'the': 2, 'cruel': 1, 'war': 1, 'is': 3, 'rag..."
6,1,Mr. Where Is Viet Nam,"President Eisenhower speaks (January 17, 1961)",-,1961,,,{},[],{'': 1}
7,1,Mr. Where Is Viet Nam,Masters Of War,Bob Dylan,1963,Masters of war\nBuild to destroy\nYou play wit...,,"{('Masters', 'of'): 3, ('of', 'war'): 3, ('war...","[masters, of, war, build, to, destroy, you, pl...","{'masters': 3, 'of': 6, 'war': 4, 'build': 3, ..."
8,1,Mr. Where Is Viet Nam,Distant Drums,Jim Reeves,1966,"I hear the sound of distant drums\nFar away, f...",,"{('I', 'hear'): 2, ('hear', 'the'): 2, ('the',...","[i, hear, the, sound, of, distant, drums, far,...","{'i': 4, 'hear': 2, 'the': 7, 'sound': 2, 'of'..."
9,1,Mr. Where Is Viet Nam,"Senator Wayne Morse speaks (August 2, 1964)",-,1964,,,{},[],{'': 1}
10,1,Mr. Where Is Viet Nam,There's A War,Morty Gunty,-,,Seems to have been released with the compilati...,{},[],{'': 1}


## Step 3: Grouping by CD
Because each CD of the corpus has a specific theme, we will group the lyrics per CD in order to conduct a more meaningfull analysis.

In [139]:
#Removing non-alphabetical characters
pattern = re.compile('([^\s\w]|_)+')
df = df[df['Lyrics'].apply(lambda x: type(x)==str)]
df = df[df['Lyrics'].apply(lambda x: len(x)>5)]


groupedCD_df = df.groupby(["CD_Title", "CD"])['Lyrics'].apply(' '.join).reset_index()
groupedCD_df = groupedCD_df.sort_values('CD')

In [140]:
display(groupedCD_df)

Unnamed: 0,CD_Title,CD,Lyrics
6,Mr. Where Is Viet Nam,1,"Where have all the flowers gone, long time pas..."
8,Proud To Serve,2,"In nineteen hundred and sixty three, I was a s..."
1,Greeting (Uncle Sam Wants You),3,"Vietnam Vietnam, everybody cryin' about Vietna..."
2,Hell No-We Won't Go,4,Oh I marched to the battle of New Orleans\nAt ...
5,"It's America, Love It Or Leave It",5,"My dearest mama, they just gave us time to wri..."
12,War Is Hell,6,Wait until the war is over \nAnd we're both a ...
3,In Country' Voices,7,"Saigon's a strange city, on the river Mekong s..."
7,Peace Now,8,Silent soldiers on a silver screen\nFramed in ...
9,Seaching For Closure,9,Returning home from Vietnam\nI know back home ...
0,Beyond The Wall,10,"Detroit to d.C., night train, capitol, parts e..."


In [142]:
#Making a list of words
groupedCD_df['Word_list'] = [item for item in [re.split('\s+', x) for x in groupedCD_df.Lyrics.str.lower()]]


#Removing punctuation
groupedCD_df['Word_list'] = [[punctuation.sub("", word).strip() for word in x] for x in groupedCD_df.Word_list]

#Removing stopwords 
groupedCD_df['Word_list'] = [[x for x in Stringlist if x not in stopwords.words('english')] for Stringlist in groupedCD_df.Word_list]

#Counting frequencies
groupedCD_df['Frequencies'] = [Counter(x) for x in groupedCD_df.Word_list]

In [145]:
for i in groupedCD_df.index: 
    print('Most common words for CD {}: {}' .format(groupedCD_df.CD[i], groupedCD_df.CD_Title[i]))
    display(pd.DataFrame(groupedCD_df.Frequencies[i].most_common(20)))

Most common words for CD 1: Mr. Where Is Viet Nam


Unnamed: 0,0,1
0,gone,22
1,wanna,22
2,war,22
3,learn,21
4,boy,20
5,never,18
6,we're,18
7,i'm,17
8,school,17
9,learned,17


Most common words for CD 2: Proud To Serve


Unnamed: 0,0,1
0,men,30
1,sky,14
2,never,14
3,one,11
4,proud,11
5,us,10
6,died,10
7,gallant,10
8,private,10
9,white,10


Most common words for CD 3: Greeting (Uncle Sam Wants You)


Unnamed: 0,0,1
0,vietnam,24
1,i'm,22
2,oh,20
3,need,16
4,go,16
5,right,15
6,baby,14
7,got,12
8,man,11
9,wanna,9


Most common words for CD 4: Hell No-We Won't Go


Unnamed: 0,0,1
0,kill,63
1,war,57
2,ain't,43
3,people,34
4,bring,30
5,big,26
6,let,26
7,home,26
8,there's,24
9,peace,23


Most common words for CD 5: It's America, Love It Or Leave It


Unnamed: 0,0,1
0,wish,20
1,old,16
2,we're,15
3,man,15
4,way,15
5,america,15
6,men,13
7,i'm,13
8,said,12
9,little,12


Most common words for CD 6: War Is Hell


Unnamed: 0,0,1
0,vietnam,48
1,war,12
2,susan,12
3,west,11
4,waiting,11
5,better,10
6,coast,10
7,andy,10
8,fighting,10
9,run,10


Most common words for CD 7: In Country' Voices


Unnamed: 0,0,1
0,bamiba,39
1,green,21
2,jolly,18
3,back,15
4,i'll,14
5,wah,14
6,whoo,14
7,go,12
8,got,11
9,know,9


Most common words for CD 8: Peace Now


Unnamed: 0,0,1
0,peace,28
1,chance,28
2,one,20
3,saying,20
4,give,20
5,war,17
6,xmas,12
7,oh,11
8,there's,10
9,come,10


Most common words for CD 9: Seaching For Closure


Unnamed: 0,0,1
0,kids,23
1,napalm,22
2,sticks,19
3,one,17
4,home,15
5,i'm,14
6,last,14
7,long,12
8,americans,10
9,i'd,9


Most common words for CD 10: Beyond The Wall


Unnamed: 0,0,1
0,i've,16
1,he's,15
2,got,15
3,line,14
4,vietnam,14
5,,12
6,wall,10
7,see,9
8,time,9
9,walking,9


Most common words for CD 11: In The Rear View Mirror


Unnamed: 0,0,1
0,home,50
1,bring,49
2,'em,48
3,love,19
4,give,16
5,wings,15
6,us,14
7,reason,11
8,,10
9,left,10


Most common words for CD 12: Vets Look Back


Unnamed: 0,0,1
0,hero,12
1,dead,9
2,mourn,8
3,land,8
4,free,8
5,want,8
6,follow,8
7,,7
8,lawdy,6
9,i'm,5


Most common words for CD 13: Vietnam's After Effects


Unnamed: 0,0,1
0,got,7
1,vietnam,6
2,didn’t,6
3,even,6
4,oh,5
5,killed,5
6,know,5
7,soon,4
8,told,4
9,doctors,4
