In [1]:
#題目: 將某篇文章以上下文相同，比方三連詞(trigram)方式修改內容
#說明：某篇文章中我們可以找出所有的三連詞(trigram)，以及在前字與後字出現時，
#按照出現度隨機選出一個字去換掉中間字，這是利用三連詞修改文章內容的最基本作法。
#一旦字典的資料結構建立，我們就以某種機率(比方20%)去置換原文，並將置換文與原文印出來

#延伸: 可用五連詞或七連詞去取代中間字，可利用三連詞之前兩字去更換第三字，
#可增加詞性的相同性(Part Of Speech)提高可讀性，甚至使用 Word2Vec, GloVe，或者RNN等模型

#範例程式檔名: article_modifier_自動文件修改器.py。
#模組: sklearn, random, numpy, nltk, bs4
#輸入檔：./electronics/positive.review
#成績：被置換文的合理性與可讀性

# 使用三連詞 trigrams 練習簡易文件產生器
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range

import nltk
import random
import numpy as np

from bs4 import BeautifulSoup

In [2]:
# Load the positive reviews: parse the markup file and keep every
# <review_text> element found in it.
# The context manager closes the file handle as soon as its contents are read
# (a bare open(...).read() leaves the handle open until garbage collection).
with open('positive.review', encoding='ISO-8859-1') as review_file:
    review_soup = BeautifulSoup(review_file.read(), "lxml")
# find_all is the documented method name; findAll is only a legacy alias of it.
positive_reviews = review_soup.find_all('review_text')

In [3]:
# Collect every five-word window (5-gram) of the reviews into a dictionary.
#   key:   (first word, fifth word) of the window
#   value: list of every middle (third) word seen between that pair;
#          duplicates are kept so relative frequencies can be derived later
fivegrams = {}
for review in positive_reviews:
    tokens = nltk.tokenize.word_tokenize(review.text.lower())
    # Zipping the token list with itself shifted by 2 and by 4 walks all
    # windows; reviews with fewer than five tokens contribute nothing.
    for first, middle, fifth in zip(tokens, tokens[2:], tokens[4:]):
        fivegrams.setdefault((first, fifth), []).append(middle)

In [4]:
# Inspect the collected (first word, fifth word) -> [middle words] mapping.
# NOTE(review): this displays the entire dictionary, so the output is very long.
fivegrams

{('i', 'due'): ['this', 'corded'],
 ('purchased', 'to'): ['unit', 'was', 'unfortunately', 'cable'],
 ('this', 'frequent'): ['due'],
 ('unit', 'blackouts'): ['to'],
 ('due', 'in'): ['frequent', 'v7', 'the'],
 ('to', 'my'): ['blackouts',
  'one',
  'one',
  'back',
  'well',
  'good',
  ',',
  'stuff',
  'even',
  'it',
  '.',
  'unit',
  'now-a-days',
  'parts',
  'some',
  'interference',
  'it',
  'sure',
  'and',
  'all',
  'photos',
  '.',
  'message',
  'one',
  'them',
  'audio',
  'them',
  'players',
  'around',
  'better',
  'them',
  '8x10s',
  'meeting',
  'audio',
  'data',
  'room',
  'connected'],
 ('frequent', 'area'): ['in'],
 ('blackouts', 'and'): ['my'],
 ('in', '2'): ['area', 'for', '=', 'department'],
 ('my', 'power'): ['and'],
 ('area', 'supplies'): ['2'],
 ('and', 'going'): ['power'],
 ('2', 'bad'): ['supplies'],
 ('power', '.'): ['going', 'is'],
 ('supplies', 'it'): ['bad'],
 ('going', 'will'): ['.'],
 ('bad', 'run'): ['it'],
 ('.', 'my'): ['will',
  'case',
  'pr

In [5]:
# Turn each list of observed middle words into a probability distribution
# (middle word -> relative frequency within its context).
fivegram_probabilities = {}
for context, middles in fivegrams.items():
    if len(set(middles)) <= 1:
        # Only one distinct middle word was seen for this context, so there
        # is nothing to choose between; such contexts are left out.
        continue
    # Tally how often each middle word occurs, in first-seen order.
    counts = {}
    for word in middles:
        counts[word] = counts.get(word, 0) + 1
    total = len(middles)
    fivegram_probabilities[context] = {
        word: float(count) / total for word, count in counts.items()
    }

In [6]:
def random_sample(d):
    """Draw one word from ``d`` at random, weighted by its probability.

    Args:
        d: dict mapping word -> probability. The values are expected to sum
            to (approximately) 1.

    Returns:
        The first word, in the dict's iteration order, at which the running
        sum of probabilities exceeds a uniform draw from [0, 1). If the
        running sum never exceeds the draw, the last word is returned rather
        than falling through. Returns None only when ``d`` is empty.
    """
    r = random.random()
    cumulative = 0
    w = None
    for w, p in d.items():
        cumulative += p
        if r < cumulative:
            return w
    # Floating-point accumulation can leave the total a hair below 1.0
    # (ten 0.1s add up to 0.9999999999999999), so a draw very close to 1 may
    # never satisfy the test above. Fall back to the last word instead of
    # implicitly returning None.
    return w

In [7]:
def test_spinner(replace_prob=0.2):
    """Spin one randomly chosen review and print it before and after.

    The review text is lower-cased and tokenized. For each five-token window,
    with probability ``replace_prob``, the middle token is replaced by a word
    sampled from ``fivegram_probabilities`` for the window's
    (first token, fifth token) pair, when that pair is present there.

    Args:
        replace_prob: chance, per window, of attempting a replacement.
            Defaults to 0.2, the rate that used to be hard-coded.

    Returns:
        None. The original and the spun text are printed.
    """
    review = random.choice(positive_reviews)
    s = review.text.lower()
    print("Original:", s)
    tokens = nltk.tokenize.word_tokenize(s)
    # tokens is modified in place, so a window examined later may already
    # contain a word substituted by an earlier window.
    for i in range(len(tokens) - 4):
        if random.random() < replace_prob:
            k = (tokens[i], tokens[i+4])
            if k in fivegram_probabilities:
                w = random_sample(fivegram_probabilities[k])
                # Never store a non-word: a None in tokens would make the
                # " ".join below raise TypeError.
                if w is not None:
                    tokens[i+2] = w
    print("Spun:")
    # Re-attach punctuation that the tokenizer split into separate tokens.
    print(" ".join(tokens).replace(" .", ".").replace(" '", "'").replace(" ,", ",").replace("$ ", "$").replace(" !", "!"))

In [8]:
# Run the spinner demo once when this code is executed as the main module.
if __name__ == '__main__':
    test_spinner()

Original: 
this is my first hotas, but i did a good bit of research.  it is very easy to set up, and the sst programming is very helpful setting the shift key.  the joystick is easy to move, which is nice in a close dogfight.  i am very glad that i bought this hotas, because i have had no problems with it.  i highly recommend this system to anyone

Spun:
this is my first hotas, but's did a good bit of research. it is very first to set it, and the sst programming is very helpful setting the a key. the joystick is easy bit move, which one me in a close dogfight. i am very glad that i bought it hotas, because i have had these problems with it. i summary recommend this system to anyone
