# Bag of words = counting words

In [11]:
import numpy as np
import pandas as pd

'''
# sparse matrix -> a table where most values are zero
# here the long format is more efficient
data = [[1.0, np.NaN], [np.NaN, 4.0]]

df = pd.DataFrame(data)
df.stack()

# lets count words
s1 = "She loves you yeah, yeah, yeah!"
s2 = "I just call to say: 'I love you!'"

#
# tokenization
#

# remove special characters
t1 = s1.replace("!", "").replace(",","")

SPECIAL_CHARS = """!'",?:"""
t2 = s2
for spc in SPECIAL_CHARS:
    t2 = t2.replace(spc, "")

# remove case
t1 = t1.lower()
t2 = t2.lower()

# stemming -> for proper stemming, use NLTK or Spacy
t1 = t1.replace("loves", "love")

token1 = t1.split()
token2 = t2.split()

# remove stop words
STOPWORDS = ["to", "just", "is"]

# slow
for stop in STOPWORDS:
    if stop in token2:
        token2.remove(stop)

# Python set + comprehension -> much faster
STOPWORDS = {"to", "just", "is"}
token2 = [word for word in token2 if word not in STOPWORDS]

#
# count words (bag of words)
#

# with pandas
data = {'beatles': token1, 'stevie': token2}
df = pd.DataFrame(data)

c1 = df['beatles'].value_counts()
c2 = df['stevie'].value_counts()

df = pd.DataFrame({'beatles': c1, 'stevie': c2})
df.stack()

# with a Python dictionary
d = {}
for word in token1:
    if word not in d:
        d[word] = 1
    else:
        d[word] += 1

# with a Counter dict it is much easier
from collections import Counter
d = Counter(token1)
d.most_common(3)


#
# parsing lyrics
#
html = open('money40.html').read()
len(html)

# with string functions
begin = "Sorry about that. -->"
end = "MxM banner"
start = html.find(begin)
stop = html.find(end)

lyrics1 = html[start:stop]

# Problem: .find might return -1 but slicing still works:
html[-1:-1]

# better: with RegEx
import re

lyrics = re.findall("Sorry about that\. -->(.+)MxM banner", html, re.DOTALL)
# lyrics is a list of strings
if len(lyrics) == 1:
    lyrics = lyrics[0]
else:
    print("something went wrong")

# remove special chars and tags
l2 = re.sub("\.|\,|\?|\n|<br>", " ", lyrics)
l2 = re.sub("<i>|</i>|\–|<div>|</div>", " ", l2)
l2 = re.sub("\s", " ", l2) # replace tabs and newlines

l2 = l2.lower()
tokens = l2.split()
c = Counter(tokens)
c.most_common(10)

'''

'\n# sparse matrix -> a table where most values are zero\n# here the long format is more efficient\ndata = [[1.0, np.NaN], [np.NaN, 4.0]]\n\ndf = pd.DataFrame(data)\ndf.stack()\n\n# lets count words\ns1 = "She loves you yeah, yeah, yeah!"\ns2 = "I just call to say: \'I love you!\'"\n\n#\n# tokenization\n#\n\n# remove special characters\nt1 = s1.replace("!", "").replace(",","")\n\nSPECIAL_CHARS = """!\'",?:"""\nt2 = s2\nfor spc in SPECIAL_CHARS:\n    t2 = t2.replace(spc, "")\n\n# remove case\nt1 = t1.lower()\nt2 = t2.lower()\n\n# stemming -> for proper stemming, use NLTK or Spacy\nt1 = t1.replace("loves", "love")\n\ntoken1 = t1.split()\ntoken2 = t2.split()\n\n# remove stop words\nSTOPWORDS = ["to", "just", "is"]\n\n# slow\nfor stop in STOPWORDS:\n    if stop in token2:\n        token2.remove(stop)\n\n# Python set + comprehension -> much faster\nSTOPWORDS = {"to", "just", "is"}\ntoken2 = [word for word in token2 if word not in STOPWORDS]\n\n#\n# count words (bag of words)\n#\n\n# with pa

In [13]:
# sparse matrix -> a table where most values are zero
# here the long format is more efficient

data = [[1.0, np.NaN], [np.NaN, 4.0]]

df = pd.DataFrame(data)
df

Unnamed: 0,0,1
0,1.0,
1,,4.0


In [14]:
df.stack()

0  0    1.0
1  1    4.0
dtype: float64

In [15]:
# lets count words
s1 = "She loves you yeah, yeah, yeah!"
s2 = "I just call to say: 'I love you!'"


In [None]:
#
# tokenization
#

In [17]:
# first possibility

# remove special characters
t1 = s1.replace("!", "").replace(",","")
t1

'She loves you yeah yeah yeah'

In [19]:
# Second possibility

SPECIAL_CHARS = """!'",?:"""
t2 = s2
for spc in SPECIAL_CHARS:
    t2 = t2.replace(spc, "")

t2    

'I just call to say I love you'

In [22]:
# remove case
t1 = t1.lower()
t1

'she loves you yeah yeah yeah'

In [24]:
# remove case
t2 = t2.lower()
t2

'i just call to say i love you'

In [25]:
# stemming -> for proper stemming, use NLTK or Spacy

t1 = t1.replace("loves", "love")
t1

'she love you yeah yeah yeah'

In [27]:
token1 = t1.split()
token1

['she', 'love', 'you', 'yeah', 'yeah', 'yeah']

In [37]:
token2 = t2.split()
token2

['i', 'just', 'call', 'to', 'say', 'i', 'love', 'you']

In [38]:
# remove stop words
STOPWORDS = ["to", "just", "is"]

In [39]:
# slow
for stop in STOPWORDS:
    if stop in token2:
        token2.remove(stop)

In [41]:
# Python set + comprehension -> much faster
STOPWORDS = {"to", "just", "is"}
token2 = [word for word in token2 if word not in STOPWORDS]

In [None]:
#
# count words (bag of words)
#

In [42]:
# with pandas
data = {'beatles': token1, 'stevie': token2}
df = pd.DataFrame(data)

In [43]:
c1 = df['beatles'].value_counts()
c1

yeah    3
love    1
you     1
she     1
Name: beatles, dtype: int64

In [44]:
c2 = df['stevie'].value_counts()
c2

i       2
call    1
love    1
say     1
you     1
Name: stevie, dtype: int64

In [45]:
df = pd.DataFrame({'beatles': c1, 'stevie': c2})
df

Unnamed: 0,beatles,stevie
call,,1.0
i,,2.0
love,1.0,1.0
say,,1.0
she,1.0,
yeah,3.0,
you,1.0,1.0


In [46]:
df.stack()

call  stevie     1.0
i     stevie     2.0
love  beatles    1.0
      stevie     1.0
say   stevie     1.0
she   beatles    1.0
yeah  beatles    3.0
you   beatles    1.0
      stevie     1.0
dtype: float64

In [47]:
# with a Python dictionary
d = {}
for word in token1:
    if word not in d:
        d[word] = 1
    else:
        d[word] += 1
        
d

{'she': 1, 'love': 1, 'you': 1, 'yeah': 3}

In [48]:
d = {}
for word in token2:
    if word not in d:
        d[word] = 1
    else:
        d[word] += 1
        
d

{'i': 2, 'call': 1, 'say': 1, 'love': 1, 'you': 1}

In [51]:
# with a Counter dict it is much easier

from collections import Counter

d = Counter(token1)
d.most_common(3)

[('yeah', 3), ('she', 1), ('love', 1)]

In [52]:
d = Counter(token2)
d.most_common(3)

[('i', 2), ('call', 1), ('say', 1)]

In [None]:
#
# parsing lyrics
#

In [53]:
html = open('C://Users/Pippo/Desktop/python_examples/Week_04/lyrics_Def_Leppard-20th_Century_Boy.txt', encoding="utf-8").read()
len(html)

158278

In [82]:
# with string functions

begin = 'data-lang="en">'
end = "</pre>"
start = html.find(begin)
stop = html.find(end)

In [83]:
lyrics1 = html[start:stop]

In [84]:
import re

lyrics = re.findall('data-lang="en">(.+)</pre>', html, re.DOTALL)
# lyrics is a list of strings

In [85]:
lyrics

['Friends say it\'s fine\n\nFriends say it\'s good\n\nEverybody says it\'s just like <a style="color:#333;" href="https://www.definitions.net/definition/Robin">Robin</a> Hood\n\nI walk like a rat\n\nCrawl like a cat\n\nSting like a bee\n\nBabe I\'m <a style="color:#333;" href="https://www.definitions.net/definition/gonna">gonna</a> be your man\n\n\n\nAnd it\'s <a style="color:#333;" href="https://www.definitions.net/definition/plain">plain</a> to see\n\nYou were <a style="color:#333;" href="https://www.definitions.net/definition/meant">meant</a> for me\n\nYeah, I\'m your toy\n\nYour 20th <a style="color:#333;" href="https://www.definitions.net/definition/Century">Century</a> boy\n\n\n\nFriends say it\'s fine\n\nFriends say it\'s good\n\nEverybody says it\'s just like <a style="color:#333;" href="https://www.definitions.net/definition/Robin">Robin</a> Hood\n\nFly like a plane\n\nDrive like a car\n\nBall like a hound\n\nBabe I\'m <a style="color:#333;" href="https://www.definitions.net/d

In [90]:
if len(lyrics) == 1:
    lyrics = lyrics[0]
else:
    print("something went wrong")

something went wrong


In [99]:
#remove special chars and tags
l2 = re.sub("\.|\,|\?|\n|<br>", " ", lyrics)
l2 = re.sub("<i>|</i>|\–|<div>|</div>", " ", l2)
l2 = re.sub("\s", " ", l2) # replace tabs and newlines
l2

'Friends say it\'s fine  Friends say it\'s good  Everybody says it\'s just like <a style="color:#333;" href="https://www definitions net/definition/Robin">Robin</a> Hood  I walk like a rat  Crawl like a cat  Sting like a bee  Babe I\'m <a style="color:#333;" href="https://www definitions net/definition/gonna">gonna</a> be your man    And it\'s <a style="color:#333;" href="https://www definitions net/definition/plain">plain</a> to see  You were <a style="color:#333;" href="https://www definitions net/definition/meant">meant</a> for me  Yeah  I\'m your toy  Your 20th <a style="color:#333;" href="https://www definitions net/definition/Century">Century</a> boy    Friends say it\'s fine  Friends say it\'s good  Everybody says it\'s just like <a style="color:#333;" href="https://www definitions net/definition/Robin">Robin</a> Hood  Fly like a plane  Drive like a car  Ball like a hound  Babe I\'m <a style="color:#333;" href="https://www definitions net/definition/gonna">gonna</a> be your man 

In [116]:
# l2 = re.sub("<a[^>]*>(.*?)</a>", "", l2)
l2 = re.sub("<[^>]+>", "", l2)
l2

"friends say it's fine  friends say it's good  everybody says it's just like   hood  i walk like a rat  crawl like a cat  sting like a bee  babe i'm   be your man    and it's   to see  you were   for me  yeah  i'm your toy  your 20th   boy    friends say it's fine  friends say it's good  everybody says it's just like   hood  fly like a plane  drive like a car  ball like a hound  babe i'm   be your man    and it's   to see  you were   for me  yeah  i'm your toy  your 20th   boy  20th   boy  i   be your toy [4x]    friends say it's fine  friends say it's good  everybody says it's just like   hood  i walk like a rat  crawl like a cat  sting like a bee  babe i'm   be your man    and it's   to see  you were   for me  yeah i'm your toy  your 20th   boy  20th   boy  i   be your toy [repeat x4]"

In [110]:
l2 = l2.lower()
l2

"friends say it's fine  friends say it's good  everybody says it's just like   hood  i walk like a rat  crawl like a cat  sting like a bee  babe i'm   be your man    and it's   to see  you were   for me  yeah  i'm your toy  your 20th   boy    friends say it's fine  friends say it's good  everybody says it's just like   hood  fly like a plane  drive like a car  ball like a hound  babe i'm   be your man    and it's   to see  you were   for me  yeah  i'm your toy  your 20th   boy  20th   boy  i   be your toy [4x]    friends say it's fine  friends say it's good  everybody says it's just like   hood  i walk like a rat  crawl like a cat  sting like a bee  babe i'm   be your man    and it's   to see  you were   for me  yeah i'm your toy  your 20th   boy  20th   boy  i   be your toy [repeat x4]"

In [112]:
tokens = l2.split()
tokens

['friends',
 'say',
 "it's",
 'fine',
 'friends',
 'say',
 "it's",
 'good',
 'everybody',
 'says',
 "it's",
 'just',
 'like',
 'hood',
 'i',
 'walk',
 'like',
 'a',
 'rat',
 'crawl',
 'like',
 'a',
 'cat',
 'sting',
 'like',
 'a',
 'bee',
 'babe',
 "i'm",
 'be',
 'your',
 'man',
 'and',
 "it's",
 'to',
 'see',
 'you',
 'were',
 'for',
 'me',
 'yeah',
 "i'm",
 'your',
 'toy',
 'your',
 '20th',
 'boy',
 'friends',
 'say',
 "it's",
 'fine',
 'friends',
 'say',
 "it's",
 'good',
 'everybody',
 'says',
 "it's",
 'just',
 'like',
 'hood',
 'fly',
 'like',
 'a',
 'plane',
 'drive',
 'like',
 'a',
 'car',
 'ball',
 'like',
 'a',
 'hound',
 'babe',
 "i'm",
 'be',
 'your',
 'man',
 'and',
 "it's",
 'to',
 'see',
 'you',
 'were',
 'for',
 'me',
 'yeah',
 "i'm",
 'your',
 'toy',
 'your',
 '20th',
 'boy',
 '20th',
 'boy',
 'i',
 'be',
 'your',
 'toy',
 '[4x]',
 'friends',
 'say',
 "it's",
 'fine',
 'friends',
 'say',
 "it's",
 'good',
 'everybody',
 'says',
 "it's",
 'just',
 'like',
 'hood',
 'i',

In [114]:
c = Counter(tokens)
c

Counter({'friends': 6,
         'say': 6,
         "it's": 12,
         'fine': 3,
         'good': 3,
         'everybody': 3,
         'says': 3,
         'just': 3,
         'like': 12,
         'hood': 3,
         'i': 4,
         'walk': 2,
         'a': 9,
         'rat': 2,
         'crawl': 2,
         'cat': 2,
         'sting': 2,
         'bee': 2,
         'babe': 3,
         "i'm": 6,
         'be': 5,
         'your': 11,
         'man': 3,
         'and': 3,
         'to': 3,
         'see': 3,
         'you': 3,
         'were': 3,
         'for': 3,
         'me': 3,
         'yeah': 3,
         'toy': 5,
         '20th': 5,
         'boy': 5,
         'fly': 1,
         'plane': 1,
         'drive': 1,
         'car': 1,
         'ball': 1,
         'hound': 1,
         '[4x]': 1,
         '[repeat': 1,
         'x4]': 1})

In [115]:
c.most_common(10)

[("it's", 12),
 ('like', 12),
 ('your', 11),
 ('a', 9),
 ('friends', 6),
 ('say', 6),
 ("i'm", 6),
 ('be', 5),
 ('toy', 5),
 ('20th', 5)]