In [18]:
import os
import numpy as np
from keras.preprocessing.text import Tokenizer
import pandas as pd
from bs4 import BeautifulSoup
import nltk
import string
from collections import defaultdict

In [11]:
soup = BeautifulSoup(open(os.path.join('data', 'positive.review')))
review_texts = soup.findAll('review_text')
review_texts_contents = [review_text.text for review_text in review_texts]
del review_texts
print(len(review_texts_contents), "reviews found.")

1000 reviews found.


In [60]:
def tokenize(s) :
    s = s.lower()
    s = nltk.tokenize.word_tokenize(s)
    s = [token.translate(str.maketrans('', '', string.punctuation)) for token in s if token.isalpha()]
    return s

tokenizer = Tokenizer(oov_token='<OOV>', analyzer=tokenize)
tokenizer.fit_on_texts(review_texts_contents)
sequences = tokenizer.texts_to_sequences(review_texts_contents)
print(len(tokenizer.word_index), 'unique tokens found')

7386 unique tokens found


## Frequency Count

In [207]:
# Create defaultdicts for p_w0, p_wt, and p_wT
p_w0 = defaultdict(int)
p_wt = defaultdict(int)
p_wT = defaultdict(int)

for sequence in sequences:
    if len(sequence) > 1:
        first_word, second_word = sequence[:2]
        p_w0[(first_word, second_word)] += 1
        last_word, second_last_word = sequence[-1], sequence[-2]
        p_wT[(second_last_word, last_word)] += 1
    
    for t in range(1, len(sequence) - 1):
        w_prev, w, w_next = sequence[t - 1], sequence[t], sequence[t + 1]
        p_wt[(w_prev, w, w_next)] += 1

# Convert defaultdicts to pandas Series
p_w0 = pd.Series(p_w0)
p_wt = pd.Series(p_wt)
p_wT = pd.Series(p_wT)
p_w0.index.names = (["Word@(0)", "Word@(1)"])
p_wt.index.names = (["Word@(t-1)", "Word@(t)" ,"Word@(t+1)"])
p_wT.index.names = (["Word@(T-1)", "Word@(T)"])

## Smoothing

In [208]:
M = len(tokenizer.word_index.values())
p_w0 = p_w0.sort_index()
p_wt = p_wt.sort_index()
p_wT = p_wT.sort_index()

p_w0 = (p_w0)/ (p_w0.groupby(level="Word@(0)").sum())
p_wT = (p_wT)/ (p_wT.groupby(level="Word@(T)").sum())
p_wt = (p_wt) / (p_wt.groupby(level=["Word@(t-1)", "Word@(t+1)"]).sum())

display(p_w0.groupby(level=0).sum(), p_wt.groupby(level=["Word@(t-1)", "Word@(t+1)"]).sum(), p_wT.groupby(level=1).sum())

Word@(0)
2       1.0
3       1.0
5       1.0
6       1.0
7       1.0
       ... 
5322    1.0
5446    1.0
6203    1.0
6761    1.0
7314    1.0
Length: 164, dtype: float64

Word@(t-1)  Word@(t+1)
2           2             1.0
            3             1.0
            4             1.0
            5             1.0
            6             1.0
                         ... 
7381        2             1.0
7382        13            1.0
7383        16            1.0
7384        7             1.0
7385        6             1.0
Length: 61451, dtype: float64

Word@(T)
2       1.0
3       1.0
5       1.0
6       1.0
7       1.0
       ... 
7216    1.0
7230    1.0
7300    1.0
7305    1.0
7386    1.0
Length: 558, dtype: float64

## Taking log

In [209]:
log_p_w0 = np.log(p_w0)
log_p_wT = np.log(p_wT)
log_p_wt = np.log(p_wt)

In [265]:
p_wt

Word@(t-1)  Word@(t+1)  Word@(t)
2           2           4           0.016
                        8           0.008
                        9           0.008
                        10          0.008
                        14          0.016
                                    ...  
7381        2           5           1.000
7382        13          532         1.000
7383        16          634         1.000
7384        7           86          1.000
7385        6           15          1.000
Length: 87610, dtype: float64

In [298]:
def sample_word(series) :
    if series is None :
        return 1
    return np.random.choice(a = series.index, p=series.values)

def translate_2_text(sequence) :
    return tokenizer.sequences_to_texts(sequence)

def span_text(s, max_replacements = 1):
    sequence = tokenizer.texts_to_sequences([s])[0]
    sequence_spanned = sequence.copy()
    n = len(sequence) // max_replacements
    for i in range(0, len(sequence), n) :
        random_idx = np.random.choice(range(i, np.min((i + n, len(sequence)))))
        if random_idx == 0 :
            random_idx += 1
        elif random_idx == len(sequence) - 1 :
            random_idx -= 1
        ser = p_wt.loc[sequence[random_idx - 1], sequence[random_idx + 1]]
        if len(ser) > 1 :
            tmp = ser[sequence_spanned[random_idx]]
            tmp = tmp / (len(ser) - 1)
            ser += tmp
            ser[sequence_spanned[random_idx]] = 0
            
        sampled = sample_word(ser)
        print(f'Replaced {tokenizer.index_word[sequence_spanned[random_idx]]} with {tokenizer.index_word[sampled]}')
        sequence_spanned[random_idx] = sampled
        
        
    return translate_2_text([sequence_spanned])

test = sequences[np.random.choice(len(sequences))]
print(translate_2_text([test])[0])
print(span_text(translate_2_text([test])[0], 10)[0])

this item does exactly what it suppose to do it also makes the signal stronger therefore you get better reception worth the money only drawback would be there is a lot of cord hanging around
Replaced does with does
Replaced it with it
Replaced to with to
Replaced also with really
Replaced signal with signal
Replaced get with have
Replaced better with am
Replaced only with only
Replaced drawback with they
Replaced is with are
Replaced cord with cables
Replaced hanging with hanging
this item does exactly what it suppose to do it really makes the signal stronger therefore you have am reception worth the money only they would be there are a lot of cables hanging around
