In [5]:
#題目: 將某篇文章以上下文相同，比方三連詞(trigram)方式修改內容
#說明：某篇文章中我們可以找出所有的三連詞(trigram)，以及在前字與後字出現時，
#按照出現度隨機選出一個字去換掉中間字，這是利用三連詞修改文章內容的最基本作法。
#一旦字典的資料結構建立，我們就以某種機率(比方20%)去置換原文，並將置換文與原文印出來

#延伸: 可用五連詞或七連詞去取代中間字，可利用三連詞之前兩字去更換第三字，
#可增加加詞性的相同性(Parts Of Sentence)提高可讀性，甚至使用 Word2Vec, Glove，或者RNN的

#範例程式檔名: article_modifier_自動文件修改器.py。
#模組: sklearn, random, numpy, nltk, bs4
#輸入檔：./electronics/positive.review
#成績：被置換文的合理性與可讀性


# 使用三連詞 trigrams 練習簡易文件產生器
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range



import nltk
import random
import numpy as np

from bs4 import BeautifulSoup


# load the reviews
positive_reviews = BeautifulSoup(open('./datasets/electronics/positive.review', encoding='ISO-8859-1').read(), "lxml")
positive_reviews = positive_reviews.findAll('review_text')


# 提出 三連詞 並置入字典
# (w1, w3) 當作 key, [ w2 ] 當作值
trigrams = {}
for review in positive_reviews:
    s = review.text.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    for i in range(len(tokens) - 2):
        k = (tokens[i], tokens[i+2])
        if k not in trigrams:
            trigrams[k] = []
        trigrams[k].append(tokens[i+1])

# 將中間字矩陣變成或然率向量
trigram_probabilities = {}
for k, words in iteritems(trigrams):
    # 產生一個  word -> count 字典
    if len(set(words)) > 1:
        # 如果中間字middle word不只有一個機率 
        d = {}
        n = 0
        for w in words:
            if w not in d:
                d[w] = 0
            d[w] += 1
            n += 1
        for w, c in iteritems(d):
            d[w] = float(c) / n
        trigram_probabilities[k] = d


def random_sample(d):
    # 從字典隨機選出一個帶機率值的樣本，回傳累積機率值最大的字
    r = random.random()
    cumulative = 0
    for w, p in iteritems(d):
        cumulative += p
        if r < cumulative:
            return w


def test_spinner():
    review = random.choice(positive_reviews)
    s = review.text.lower()
    print("Original:", s)
    tokens = nltk.tokenize.word_tokenize(s)
    for i in range(len(tokens) - 2):
        if random.random() < 0.2: # 20% chance of replacement
            k = (tokens[i], tokens[i+2])
            if k in trigram_probabilities:
                w = random_sample(trigram_probabilities[k])
                tokens[i+1] = w
    print("Spun:")
    print(" ".join(tokens).replace(" .", ".").replace(" '", "'").replace(" ,", ",").replace("$ ", "$").replace(" !", "!"))


if __name__ == '__main__':
    test_spinner()


Original: 
i bought usb rudder pedals and yoke for use with x-plane on a mac dual g5.
plug and play (no drivers needed), easily configured in x-plane, works beautifully.
great price! thanks amazon! shipping was 2 days early! kudos to fedex!
i am a private pilot, and recommend this product

Spun:
i bought usb rudder pedals and yoke for use with x-plane on a mac dual g5. plug and toslink ( no drivers needed ), easily configured in x-plane, works beautifully. great price! thanks amazon! shipping was 2 days early! and to fedex! i am a private pilot, and recommend this product


In [23]:
# 讀取資料

# positive_reviews = BeautifulSoup(open('./datasets/electronics/positive.review', encoding='ISO-8859-1').read(), "lxml")
with open('./datasets/electronics/positive.review', encoding='ISO-8859-1') as f:
    # print(f.read())    # 發現是html檔案，透過Beautiful解析。
    html_doc = f.read()
    data = BeautifulSoup(html_doc, 'html.parser')
    positive_review = data.findAll('review_text')
    print(positive_review[0])
    print('去除tag', '-'*20)
    print(positive_review[0].text)

<review_text>
I purchased this unit due to frequent blackouts in my area and 2 power supplies going bad.  It will run my cable modem, router, PC, and LCD monitor for 5 minutes.  This is more than enough time to save work and shut down.   Equally important, I know that my electronics are receiving clean power.

I feel that this investment is minor compared to the loss of valuable data or the failure of equipment due to a power spike or an irregular power supply.

As always, Amazon had it to me in &lt;2 business days
</review_text>
去除tag --------------------

I purchased this unit due to frequent blackouts in my area and 2 power supplies going bad.  It will run my cable modem, router, PC, and LCD monitor for 5 minutes.  This is more than enough time to save work and shut down.   Equally important, I know that my electronics are receiving clean power.

I feel that this investment is minor compared to the loss of valuable data or the failure of equipment due to a power spike or an irregula

In [24]:
# 透過字典去儲存三連詞關係。
# (前, 後): {中: 次數}
import nltk

trigrams = {}
for review in positive_review:
    review = review.text
    # 斷詞
    tokenized_word = nltk.word_tokenize(review)
    for i in range(len(tokenized_word)-2):
        prev_word = tokenized_word[i]
        mid_word = tokenized_word[i+1]
        post_word = tokenized_word[i+2]
        if (prev_word, post_word) not in trigrams:
            trigrams[(prev_word, post_word)] = {mid_word: 1}
        else:
            trigrams[(prev_word, post_word)][mid_word] = trigrams[(prev_word, post_word)].get(mid_word, 0) + 1

In [25]:
# 將機率也新增進去

for key in trigrams:
    count = 0
    for word in trigrams[key]:
        count += trigrams[key][word]
    for word in trigrams[key]:
        prob = trigrams[key][word] / count
        trigrams[key][word] = (trigrams[key][word], prob)


In [26]:
trigrams

{('I', 'this'): {'purchased': (19, 0.12418300653594772),
  'bought': (46, 0.3006535947712418),
  'recomend': (1, 0.006535947712418301),
  'made': (2, 0.013071895424836602),
  'picked': (3, 0.0196078431372549),
  'say': (1, 0.006535947712418301),
  'use': (9, 0.058823529411764705),
  'had': (1, 0.006535947712418301),
  'got': (9, 0.058823529411764705),
  'think': (6, 0.0392156862745098),
  'ordered': (2, 0.013071895424836602),
  'matched': (1, 0.006535947712418301),
  'noticed': (1, 0.006535947712418301),
  'thought': (1, 0.006535947712418301),
  'recommend': (4, 0.026143790849673203),
  'choose': (1, 0.006535947712418301),
  'like': (2, 0.013071895424836602),
  'found': (9, 0.058823529411764705),
  'did': (2, 0.013071895424836602),
  'find': (4, 0.026143790849673203),
  'set': (1, 0.006535947712418301),
  'love': (4, 0.026143790849673203),
  'hold': (1, 0.006535947712418301),
  'received': (2, 0.013071895424836602),
  'have': (3, 0.0196078431372549),
  'buy': (1, 0.006535947712418301),

In [27]:
# 透過random.choices去透過機率選擇替換詞。

import random

for i in range(10):
    print(random.choices([True, False], [0.2, 0.8]))

[False]
[True]
[False]
[False]
[False]
[False]
[False]
[False]
[False]
[False]


In [51]:
def change_text(review, trigrams, p=0.2):
    print('原先: ', review)
    
    tokenized_word = nltk.word_tokenize(review)
    new_review = [tokenized_word[0]]
    for i in range(len(tokenized_word)-2):
        prev_word = tokenized_word[i]
        mid_word = tokenized_word[i+1]
        post_word = tokenized_word[i+2]
        
        is_sub = random.choices([True, False], [p, 1-p])
        if not is_sub:
            new_review.append(mid_word)
        else:
            choices = list(trigrams[(prev_word, post_word)].keys())
            probs = [trigrams[(prev_word, post_word)][key][1] for key in trigrams[(prev_word, post_word)]]
            word = random.choices(choices, probs)[0]
            new_review.append(word)
    new_review.append(tokenized_word[-1])
    return ' '.join(new_review)


change_text(positive_review[0].text, trigrams)

原先:  
I purchased this unit due to frequent blackouts in my area and 2 power supplies going bad.  It will run my cable modem, router, PC, and LCD monitor for 5 minutes.  This is more than enough time to save work and shut down.   Equally important, I know that my electronics are receiving clean power.

I feel that this investment is minor compared to the loss of valuable data or the failure of equipment due to a power spike or an irregular power supply.

As always, Amazon had it to me in <2 business days



'I ordered this unit due to frequent blackouts in this MP500 and crank power supplies going strong . It will run my wireless included , etc. , DSL , and LCD monitor for several minutes . It gets greater than expected time to do work and shut down . Equally important , i found if my ears are receiving clean music . I feel that this mouse is minor compared to the balance of valuable data when the end of equipment due to a power spike or an irregular power interruptions . I always , I had more took use in < 2 business days'