## 作業目標：搭建一個bag of words模型

In [1]:
import pandas as pd
import nltk
#nltk.download()
import numpy as np

In [2]:
dataset = pd.read_csv('Restaurant_Reviews.tsv',delimiter='\t',quoting=3)
corpus = dataset['Review'].values
corpus[0:4]

array(['Wow... Loved this place.', 'Crust is not good.',
       'Not tasty and the texture was just nasty.',
       'Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.'],
      dtype=object)

### 從文本中取出所有單字

In [3]:
whole_words = []
for sentence in corpus:
    for word in nltk.word_tokenize(sentence):
        whole_words.append(word)
whole_words[0:10]

['Wow', '...', 'Loved', 'this', 'place', '.', 'Crust', 'is', 'not', 'good']

### 移除重複單字

In [4]:
whole_words = set(whole_words)

In [5]:
print('共有{}個單字'.format(len(whole_words)))

共有2351個單字


### 建立字典使每一個單字有對應數值

In [6]:
word_index = {}
index_word = {}
n = 0
for word in whole_words:
    word_index[word] = n
    index_word[n] = word
    n += 1  

In [7]:
word_index

{'mistake': 0,
 'all': 1,
 'waste': 2,
 'trap': 3,
 'hurry': 4,
 'why': 5,
 'cakes': 6,
 '...': 7,
 'soup': 8,
 'lacked': 9,
 'over-whelm': 10,
 'drive': 11,
 'burgers': 12,
 'particular': 13,
 'typical': 14,
 'steak': 15,
 'shocked': 16,
 'Owner': 17,
 'Google': 18,
 'FANTASTIC': 19,
 'pork': 20,
 'cocktails': 21,
 'meal': 22,
 'below': 23,
 'Ramsey': 24,
 'avoided': 25,
 'relocated': 26,
 'oven': 27,
 'Japanese': 28,
 'honestly': 29,
 'ohhh': 30,
 'handling': 31,
 'Cod': 32,
 'Coffee': 33,
 'PERFECT': 34,
 'highlighted': 35,
 'whatsoever': 36,
 'send': 37,
 'buffet': 38,
 'you': 39,
 'focused': 40,
 'disaster': 41,
 'Ample': 42,
 'authentic': 43,
 'dine': 44,
 'thrilled': 45,
 'Restaurant': 46,
 'tacos': 47,
 'disappointing': 48,
 'visit': 49,
 'drawing': 50,
 'loving': 51,
 'special': 52,
 'count': 53,
 'Favorite': 54,
 'IT': 55,
 'nice': 56,
 'experiencing': 57,
 'reason': 58,
 'yay': 59,
 'bar': 60,
 'cute': 61,
 'my': 62,
 'attitudes': 63,
 'pretty': 64,
 'dark': 65,
 'third': 66

In [8]:
index_word

{0: 'mistake',
 1: 'all',
 2: 'waste',
 3: 'trap',
 4: 'hurry',
 5: 'why',
 6: 'cakes',
 7: '...',
 8: 'soup',
 9: 'lacked',
 10: 'over-whelm',
 11: 'drive',
 12: 'burgers',
 13: 'particular',
 14: 'typical',
 15: 'steak',
 16: 'shocked',
 17: 'Owner',
 18: 'Google',
 19: 'FANTASTIC',
 20: 'pork',
 21: 'cocktails',
 22: 'meal',
 23: 'below',
 24: 'Ramsey',
 25: 'avoided',
 26: 'relocated',
 27: 'oven',
 28: 'Japanese',
 29: 'honestly',
 30: 'ohhh',
 31: 'handling',
 32: 'Cod',
 33: 'Coffee',
 34: 'PERFECT',
 35: 'highlighted',
 36: 'whatsoever',
 37: 'send',
 38: 'buffet',
 39: 'you',
 40: 'focused',
 41: 'disaster',
 42: 'Ample',
 43: 'authentic',
 44: 'dine',
 45: 'thrilled',
 46: 'Restaurant',
 47: 'tacos',
 48: 'disappointing',
 49: 'visit',
 50: 'drawing',
 51: 'loving',
 52: 'special',
 53: 'count',
 54: 'Favorite',
 55: 'IT',
 56: 'nice',
 57: 'experiencing',
 58: 'reason',
 59: 'yay',
 60: 'bar',
 61: 'cute',
 62: 'my',
 63: 'attitudes',
 64: 'pretty',
 65: 'dark',
 66: 'third'

## 轉換句子為bag of words型式

In [9]:
def _get_bag_of_words_vector(sentence, word_index, whole_words):
    sentence = sentence
    # 先建立等同字典大小的向量，且數值皆為0
    vector = np.zeros(len(whole_words))
    # 有出現的單詞取出，找到對應的 index，將向量中這個位置的值+1
    for word in nltk.word_tokenize(sentence):
        if word in whole_words:
            vector[word_index[word]] += 1
    return vector

In [10]:
v1 = _get_bag_of_words_vector('Wow... Loved this place.', word_index, whole_words)
v1[:100]

array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [13]:
print(v1[word_index['Wow']])

1.0


In [17]:
corpus[1]

'Crust is not good.'

In [18]:
v2 = _get_bag_of_words_vector(corpus[1], word_index, whole_words)
v2

array([0., 0., 0., ..., 0., 0., 0.])

In [16]:
print(v2[word_index['good']])

1.0


In [19]:
print(v2[word_index['Wow']])

0.0
