# Chapter 18
## Sentiment analysis

In [1]:
import nltk
import keras
import numpy as np
import pandas as pd
import urllib.request as urllib2
from bs4 import BeautifulSoup
from sklearn.feature_extraction import text
from sklearn.decomposition import NMF

### Feature extraction

In [2]:
text_1 = "The quick brown fox jumps over the lazy dog"
text_2 = "My dog is quick and can jump over fences"
text_3 = "Your dog is so lazy that it sleeps all day"
text_4 = "A black dog just passed by but my dog is brown"
corpus = [text_1, text_2, text_3]

In [3]:
vectorizer = text.CountVectorizer(binary=True)
vectorized_texts = vectorizer.fit_transform(corpus)
print(vectorized_texts.todense())
print()
print(vectorizer.vocabulary_)

[[0 0 1 0 0 1 0 1 0 0 0 1 1 0 1 1 0 0 0 1 0]
 [0 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 0 0 0 0 0]
 [1 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 1 1 1 0 1]]

{'the': 19, 'quick': 15, 'brown': 2, 'fox': 7, 'jumps': 11, 'over': 14, 'lazy': 12, 'dog': 5, 'my': 13, 'is': 8, 'and': 1, 'can': 3, 'jump': 10, 'fences': 6, 'your': 20, 'so': 17, 'that': 18, 'it': 9, 'sleeps': 16, 'all': 0, 'day': 4}


In [4]:
corpus.append(text_4)
vectorizer = text.CountVectorizer()
vectorized_texts = vectorizer.fit_transform(corpus)
print(vectorized_texts.todense())

[[0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 1 0 1 0 1 0 0 0 2 0]
 [0 1 0 0 0 0 1 0 1 1 0 1 0 1 0 0 0 1 1 0 1 0 0 0 0 0]
 [1 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 0 1 1 1 0 1]
 [0 0 1 1 1 1 0 0 2 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0]]


In [5]:
tfidf = text.TfidfTransformer(norm='l1')
tfidf_mtx = tfidf.fit_transform(vectorized_texts)

total = 0

for i, phrase in enumerate(tfidf_mtx.toarray()):
    print(f"Text {i}")
    for word in vectorizer.vocabulary_:
        pos = vectorizer.vocabulary_[word]
        value = phrase[pos]
        if value != 0: print(f"{word:>10}: {value:.3}")
    print()

Text 0
       the: 0.261
     quick: 0.103
     brown: 0.103
       fox: 0.13
     jumps: 0.13
      over: 0.103
      lazy: 0.103
       dog: 0.068

Text 1
     quick: 0.105
      over: 0.105
       dog: 0.0693
        my: 0.105
        is: 0.0848
       and: 0.133
       can: 0.133
      jump: 0.133
    fences: 0.133

Text 2
      lazy: 0.0881
       dog: 0.0583
        is: 0.0713
      your: 0.112
        so: 0.112
      that: 0.112
        it: 0.112
    sleeps: 0.112
       all: 0.112
       day: 0.112

Text 3
     brown: 0.0955
       dog: 0.126
        my: 0.0955
        is: 0.0773
     black: 0.121
      just: 0.121
    passed: 0.121
        by: 0.121
       but: 0.121



In [6]:
bigrams = text.CountVectorizer(ngram_range=(1, 2))
print(bigrams.fit(corpus).vocabulary_)

{'the': 52, 'quick': 43, 'brown': 6, 'fox': 19, 'jumps': 29, 'over': 38, 'lazy': 33, 'dog': 15, 'the quick': 54, 'quick brown': 45, 'brown fox': 7, 'fox jumps': 20, 'jumps over': 30, 'over the': 40, 'the lazy': 53, 'lazy dog': 34, 'my': 36, 'is': 21, 'and': 2, 'can': 12, 'jump': 27, 'fences': 18, 'my dog': 37, 'dog is': 16, 'is quick': 23, 'quick and': 44, 'and can': 3, 'can jump': 13, 'jump over': 28, 'over fences': 39, 'your': 55, 'so': 48, 'that': 50, 'it': 25, 'sleeps': 46, 'all': 0, 'day': 14, 'your dog': 56, 'is so': 24, 'so lazy': 49, 'lazy that': 35, 'that it': 51, 'it sleeps': 26, 'sleeps all': 47, 'all day': 1, 'black': 4, 'just': 31, 'passed': 41, 'by': 10, 'but': 8, 'black dog': 5, 'dog just': 17, 'just passed': 32, 'passed by': 42, 'by but': 11, 'but my': 9, 'is brown': 22}


In [7]:
stemmer = nltk.stem.porter.PorterStemmer()
stop_words = nltk.corpus.stopwords.words('english')

def stem_tokens(tokens, stemmer):
    return [stemmer.stem(item) for item in tokens]

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    return stem_tokens(tokens, stemmer)

vocab = ['Sam loves swimming so he swims all the time']
vect = text.CountVectorizer(tokenizer=tokenize)
vect.fit(vocab)

sentence1 = vect.transform(['George loves swimming too!'])

print(vect.get_feature_names_out())
print(sentence1.toarray())

['love' 'sam' 'swim' 'time']
[[1 0 1 0]]




### Scraping text online

In [8]:
wiki = "https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population"
header = {'User-Agent': 'Mozilla/5.0'}
query = urllib2.Request(wiki, headers=header)
page = urllib2.urlopen(query)
soup = BeautifulSoup(page, 'lxml')

In [9]:
table = soup.find("table", {"class": "wikitable sortable"})
final_table = []

def extract_txt(cell):
    cells = [c.strip() for c in cell.findAll(text=True) if '[' not in c]
    return ' '.join(cells).strip()

def filter_sq(txt):
    return txt.split('sq')[0].strip()

cols = [extract_txt(cell) for cell in table.findAll('th')]
columns = [cols[i] for i in range(1, 7)]

for row in table.findAll('tr'):
    cells = row.findAll('td')
    if len(cells) > 0:
        final_table.append([extract_txt(cells[i]) for i in range(1, 6)] + [filter_sq(extract_txt(cells[6]))])
df = pd.DataFrame(final_table, columns=columns)

In [10]:
print(df.head(5))

         City      State 2021 estimate 2020 census       Change 2020 land area
0    New York  8,467,513     8,804,190      −3.82%  300.5 sq mi     778.3 km 2
1  California  3,849,297     3,898,747      −1.27%  469.5 sq mi   1,216.0 km 2
2    Illinois  2,696,555     2,746,388      −1.81%  227.7 sq mi     589.7 km 2
3       Texas  2,288,250     2,304,580      −0.71%  640.4 sq mi   1,658.6 km 2
4     Arizona  1,624,569     1,608,139      +1.02%  518.0 sq mi   1,341.6 km 2


### Classification

In [11]:
n = 10

shakespeare = pd.read_feather('../datasets/shakespeare_lines_in_plays.feather')
vectorizer = text.TfidfVectorizer(max_df=1., min_df=3, stop_words='english')
tfidf = vectorizer.fit_transform(shakespeare.lines)

nmf = NMF(n_components=n, max_iter=999, random_state=101)
nmf = nmf.fit(tfidf)



In [12]:
def print_topic_words(features, topics, top=5):
    for i, topic in enumerate(topics):
        words = ' '.join([features[j] for j in topic.argsort()[:-top-1:-1]])
        print(f"Topic #{i:2.0f}: {words}")
    
print_topic_words(vectorizer.get_feature_names_out(), nmf.components_, top=10)

Topic # 0: thou thy thee love ll shall come art let sir
Topic # 1: enter exeunt exit scene ii iii iv palace attendants act
Topic # 2: king thy lord warwick henry edward france york shall richard
Topic # 3: caesar antony brutus cassius shall rome cleopatra egypt casca thou
Topic # 4: antipholus dromio chain sir syracuse ephesus thou dinner home master
Topic # 5: page ford sir master mistress shall good anne come kate
Topic # 6: rome marcius titus lavinia andronicus marcus lucius coriolanus thy tribunes
Topic # 7: lord good shall sir know ll man come let love
Topic # 8: cassio iago desdemona othello moor roderigo emilia handkerchief lieutenant cyprus
Topic # 9: hector troilus achilles ajax troy agamemnon diomed cressid patroclus pandarus


In [13]:
index = shakespeare.play + ' act: ' + shakespeare.act

def print_top_match(model, data, index, topic_no):
    topic_scores = model.transform(data)[:, topic_no]
    print(index[np.argmax(topic_scores)])

print_top_match(nmf, tfidf, index, 7)

Hamlet act: 2


### Sentiment analysis

In [14]:
reviews = pd.read_feather('../datasets/imdb_50k.feather')
train = reviews.sample(30000, random_state=42)
valid = reviews[~reviews.index.isin(train.index)].sample(10000, random_state=42)
sampled_idx = train.index
sampled_idx.append(valid.index)
test = reviews[~reviews.index.isin(sampled_idx)]

reviews.review.sample(1).values[0]

'The screenplay is the worst part of this film, as it lurches from one premise to the next, missing all the important bits that would have made a number of different stories possible. (This film is confusing, because the audience doesn\'t know what the story is.) I had no problem with the low-production values and the acting wasn\'t great, but this is telly, so it was fine. I don\'t mind if some scenes looked like they were done in one take. But having such a non-sensical screenplay is completely unnecessary. Did any executive actually read it before forking out the cash? Avoid this at all costs.<br /><br />The prologue in particular was so poorly written, it needed a voice-over to fill in all the details that had been left out. The prologue was rushed, it wasn\'t clear what was happening, ie. The Russian Revolution was reduced to "Some riots are happening in Petersburg", with the next scene being soldiers arresting them. I know the basic history of the Revolution, so I could fill in t

In [15]:
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train.review)

def tokenize_and_pad(df, tokenizer=tokenizer, maxlen=256):
    sequences = tokenizer.texts_to_sequences(df.review)
    padded_seqs = keras.preprocessing.sequence.pad_sequences(sequences, maxlen)
    return padded_seqs, df.sentiment.values

X, y = tokenize_and_pad(train)
Xv, yv = tokenize_and_pad(valid)
Xt, yt = tokenize_and_pad(test)

In [16]:
voc = len(tokenizer.index_word) + 1
feats = 8
seq_len = 256

model = keras.models.Sequential(
    [
        keras.layers.Embedding(voc, feats, input_length=seq_len),
        keras.layers.Flatten(),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1, activation='sigmoid')
    ]
)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 256, 8)            790040    
                                                                 
 flatten (Flatten)           (None, 2048)              0         
                                                                 
 dropout (Dropout)           (None, 2048)              0         
                                                                 
 dense (Dense)               (None, 1)                 2049      
                                                                 
Total params: 792,089
Trainable params: 792,089
Non-trainable params: 0
_________________________________________________________________


In [17]:
model.fit(X, y, epochs=2, batch_size=16, validation_data=(Xv, yv))
print(f"\nAccuracy on test set: {model.evaluate(Xt, yt, verbose=0)[1]:.4}")

Epoch 1/2
Epoch 2/2

Accuracy on test set: 0.8953
