In [348]:
import re, nltk
from nltk.lm import WittenBellInterpolated
import numpy as np
import plotly.graph_objects as go
from scipy.ndimage import gaussian_filter

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [349]:
# Write the two demo corpora to disk. Context managers guarantee that each
# handle is flushed and closed even if a write fails part-way through.
# 'w+' (truncate, read/write) is kept from the original; only writing is done.
with open('train.txt', 'w+') as f1:
    # Reference document the language model is trained on.
    f1.write("""
Advancements in technology have always had major impacts in medicine. The smartphone is one of the fastest growing sectors in the technology industry, and its impact in medicine has already been significant. Faster processors, improved memory, and smaller batteries in concert with highly efficient operating systems capable of advanced functions have paved the way for applications (commonly referred to as apps) that are affecting our personal and work environments. Like other industries, the field of medicine experienced the resounding effects of the smartphone. In fact, it may be among those industries where the impact has been most profound.
""")

with open('test.txt', 'w+') as f2:
    # Document to be scored against the model; its first two sentences
    # overlap almost word for word with the training text.
    f2.write("""
Advancements in technology have always had major impacts in medicine. The smart phone is one of the fastest growing sectors in the technology industry, and its impact in medicine has already been significant. Telemedicine is literally medicine from a distance and is synonymous with a telecommunications network for the transmission of medical information. Mobile technology has become a part of our everyday life. Mobile services are used in a wide variety of scientific area including healthcare.
""")

In [350]:
class TreebankWordTokenizer:
  """
  Regex tokenizer derived from the Penn Treebank tokenizer, modified so that
  the tokens it returns contain no punctuation at all.

  ``tokenize()`` performs these steps, in this order:

  - applies the opening-quote rules; after each rule any ``[...]`` or
    ``{...}`` span (greedy, within a single line) is deleted
  - applies the first punctuation rule, which puts spaces around a ``:`` or
    ``,`` that is not followed by a digit, and then deletes every character
    that is neither a word character nor whitespace
  - runs the remaining punctuation, bracket, double-dash and closing-quote
    rules (with the rule tables defined here they can no longer match,
    because the punctuation they look for has already been deleted)
  - splits the result on whitespace

  Consequences worth knowing: contractions are NOT split (the apostrophe is
  simply dropped), ``a,b`` yields two tokens while ``a;b`` yields one, and
  the ``CONTRACTIONS`` table is defined but never used by ``tokenize()``.

      >>> s = "They'll save and invest more."
      >>> TreebankWordTokenizer().tokenize(s)
      ['Theyll', 'save', 'and', 'invest', 'more']
      >>> s = "hi, my name can't hello,"
      >>> TreebankWordTokenizer().tokenize(s)
      ['hi', 'my', 'name', 'cant', 'hello']
  """

  def __init__(self):
    pass

  # Rules for opening double quotes: (compiled pattern, replacement).
  STARTING_QUOTES = [
      (re.compile(r'^\"'), r'``'),
      (re.compile(r'(``)'), r' \1 '),
      (re.compile(r'([ (\[{<])"'), r'\1 `` '),
  ]

  # Punctuation rules. Only the first one ever sees punctuation, because
  # tokenize() strips all non-word characters right after applying it.
  PUNCTUATION = [
      (re.compile(r'([:,])([^\d])'), r' \1 \2'),
      (re.compile(r'([:,])$'), r' \1 '),
      (re.compile(r'\.\.\.'), r' ... '),
      (re.compile(r'[;@#$%&]'), r' \g<0> '),
      (re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), r'\1 \2\3 '),  # Handles the final period.
      (re.compile(r'[?!]'), r' \g<0> '),
      (re.compile(r"([^'])' "), r"\1 ' "),
  ]

  PARENS_BRACKETS = (re.compile(r'[\]\[\(\)\{\}\<\>]'), r' \g<0> ')

  DOUBLE_DASHES = (re.compile(r'--'), r' -- ')

  ENDING_QUOTES = [
      (re.compile(r'"'), " '' "),
      (re.compile(r'(\S)(\'\')'), r'\1 \2 '),
      (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
      (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
  ]

  # Adapted from Robert MacIntyre's tokenizer.
  # NOTE: compiled here but not referenced by tokenize().
  _contractions = [r"(?i)\b(can)(?#X)(not)\b",
                    r"(?i)\b(d)(?#X)('ye)\b",
                    r"(?i)\b(gim)(?#X)(me)\b",
                    r"(?i)\b(gon)(?#X)(na)\b",
                    r"(?i)\b(got)(?#X)(ta)\b",
                    r"(?i)\b(lem)(?#X)(me)\b",
                    r"(?i)\b(mor)(?#X)('n)\b",
                    r"(?i)\b(wan)(?#X)(na)\s",
                    r"(?i) ('t)(?#X)(is)\b",
                    r"(?i) ('t)(?#X)(was)\b"]
  CONTRACTIONS = list(map(re.compile, _contractions))

  # Clean-up patterns, compiled once instead of being handed to re.sub as
  # raw strings on every loop iteration.
  _BRACKETED_SPAN = re.compile(r"\[.*\]|\{.*\}")  # text inside [...] or {...}
  _NON_WORD_CHARS = re.compile(r'[^\w\s]')        # anything but word chars / whitespace

  def tokenize(self, text):
    """
    Split ``text`` into a list of punctuation-free tokens.

    :param text: the string to tokenize
    :return: list of str; empty list for empty or whitespace-only input
    """
    for regexp, substitution in self.STARTING_QUOTES:
      text = regexp.sub(substitution, text)
      # Drop bracketed / braced spans together with their contents.
      text = self._BRACKETED_SPAN.sub("", text)

    for regexp, substitution in self.PUNCTUATION:
      text = regexp.sub(substitution, text)
      # The strip deliberately stays inside the loop: running it right after
      # the first rule (rather than once after all rules) is what decides
      # where tokens are split, so moving it would change the output.
      text = self._NON_WORD_CHARS.sub("", text)

    # The stages below are kept so that a subclass overriding the rule tables
    # still gets the full pipeline; with the tables above they are no-ops
    # because no punctuation is left at this point.

    # Handles parentheses.
    regexp, substitution = self.PARENS_BRACKETS
    text = regexp.sub(substitution, text)

    # Handles double dash.
    regexp, substitution = self.DOUBLE_DASHES
    text = regexp.sub(substitution, text)

    for regexp, substitution in self.ENDING_QUOTES:
      text = regexp.sub(substitution, text)

    return text.split()

In [351]:
# Single tokenizer instance, reused for both the training and the test text.
tokenizer = TreebankWordTokenizer()

In [352]:
# Path of the reference (training) document
train_data_file = "train.txt"

# Load the document, then normalise it to lower case
with open(train_data_file) as train_file:
    train_text = train_file.read()
train_text = train_text.lower()

# No regex clean-up is done here: removal of bracketed spans and punctuation
# happens inside TreebankWordTokenizer.tokenize().

In [353]:
def pad_sequence(sequence, n, pad_left=False, pad_right=False):
  """
  Pad a token sequence with ``n`` empty-string symbols on either side.

  :param sequence: tokens to pad; any iterable is accepted when padding is
      requested (previously a tuple or iterator raised ``TypeError``)
  :param n: number of ``''`` pad symbols added per padded side
  :param pad_left: prepend the pad symbols
  :param pad_right: append the pad symbols
  :return: a new list when at least one side is padded; otherwise the
      ``sequence`` object itself, untouched (callers may pass an iterator
      and expect to get the same iterator back)
  """
  if not (pad_left or pad_right):
    # Pass-through: do not copy or consume the input.
    return sequence

  padding = [''] * n
  padded = list(sequence)  # copy, so the caller's sequence is never mutated
  if pad_left:
    padded = padding + padded
  if pad_right:
    padded = padded + padding
  return padded

In [354]:
from itertools import islice

def generate_ngrams(sequence, min_length=1, max_length=-1):
  """
  Lazily yield every n-gram of ``sequence`` whose length lies between
  ``min_length`` and ``max_length``.

  A window of at most ``max_length`` tokens slides over the sequence one
  token at a time; at each position the prefixes of the window with lengths
  ``min_length .. len(window)`` are yielded as tuples, shortest first. Near
  the end of the sequence the window shrinks, so shorter n-grams are still
  produced for the trailing tokens.

  :param sequence: iterable of tokens; no padding is applied here, pad the
      sequence beforehand if pad symbols are wanted
  :param min_length: smallest n-gram length to yield
  :param max_length: largest n-gram length; ``-1`` (default) means "the
      length of the whole sequence", which requires materialising it
  :return: generator of tuples
  """
  if max_length == -1:
    # The length is needed up front, so the input has to be a real list.
    # (The original called len() on an iterator, fell back to a list and
    # then crashed on next(list) after the first window.)
    sequence = list(sequence)
    max_length = len(sequence)

  tokens = iter(sequence)

  # Sliding window holding the current n-gram history.
  history = list(islice(tokens, max_length))

  while history:
    for ngram_len in range(min_length, len(history) + 1):
      yield tuple(history[:ngram_len])

    # Slide the window: pull in the next token if there is one...
    try:
      history.append(next(tokens))
    except StopIteration:
      pass  # input exhausted; the window just shrinks from the left

    # ...and always drop the oldest one, so the loop terminates.
    del history[0]

In [355]:
from nltk.corpus import wordnet

nltk.download('wordnet')

def load_synsets(context, word):
  """
  Return the WordNet lemma names of the sense of ``word`` that best fits
  ``context`` (simplified Lesk: the sense whose definition shares the most
  whitespace-separated words with the context wins).

  The original version initialised ``synsets`` to ``[]`` and then tested
  ``synsets is None``, so WordNet was never queried and every non-empty word
  produced ``None``. This version performs the lookup.

  :param context: iterable of tokens surrounding ``word``
  :param word: the token to look up
  :return: ``[]`` when ``word`` is empty/None, ``None`` when WordNet has no
      synset for it, otherwise a list of lemma-name strings
  """
  if not word:
    return []

  senses = wordnet.synsets(word)
  if not senses:
    return None

  context_words = set(context)

  # key= avoids comparing Synset objects with each other; on a tie the
  # first sense returned by WordNet is kept.
  best_sense = max(
      senses,
      key=lambda sense: len(context_words.intersection(sense.definition().split())),
  )

  # Return plain strings only, so the result can be used as vocabulary text.
  return [lemma.name() for lemma in best_sense.lemmas()]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [356]:
# Order of the n-gram language model
n = 4

# Tokenize the training text, then left-pad it with n empty-string symbols
train_tokens = tokenizer.tokenize(train_text)
training_data = pad_sequence(train_tokens, n, pad_left=True)

print(training_data)

# All n-grams of length 1..n over the padded token stream
ngrams = list(generate_ngrams(training_data, max_length=n))
print("Number of ngrams:", len(ngrams))

# Vocabulary text: every token from index n-1 onwards, each followed by the
# related words load_synsets() finds for it given the n-1 preceding tokens.
synset_data = []
for position in range(n - 1, len(training_data)):
  word = training_data[position]
  related = load_synsets(training_data[position - n + 1:position], word)
  synset_data.append(word)
  if related:
    synset_data.extend(related)

# Fit a Witten-Bell interpolated model on the n-grams (one "sentence")
model = WittenBellInterpolated(n)
model.fit([ngrams], vocabulary_text=synset_data)
print(model.vocab)

['', '', '', '', 'advancements', 'in', 'technology', 'have', 'always', 'had', 'major', 'impacts', 'in', 'medicine', 'the', 'smartphone', 'is', 'one', 'of', 'the', 'fastest', 'growing', 'sectors', 'in', 'the', 'technology', 'industry', 'and', 'its', 'impact', 'in', 'medicine', 'has', 'already', 'been', 'significant', 'faster', 'processors', 'improved', 'memory', 'and', 'smaller', 'batteries', 'in', 'concert', 'with', 'highly', 'efficient', 'operating', 'systems', 'capable', 'of', 'advanced', 'functions', 'have', 'paved', 'the', 'way', 'for', 'applications', 'commonly', 'referred', 'to', 'as', 'apps', 'that', 'are', 'affecting', 'our', 'personal', 'and', 'work', 'environments', 'like', 'other', 'industries', 'the', 'field', 'of', 'medicine', 'experienced', 'the', 'resounding', 'effects', 'of', 'the', 'smartphone', 'in', 'fact', 'it', 'may', 'be', 'among', 'those', 'industries', 'where', 'the', 'impact', 'has', 'been', 'most', 'profound']
Number of ngrams: 402
<Vocabulary with cutoff=1 un

In [357]:
# Path of the document to be checked against the model
test_data_file = "test.txt"

# Load it and normalise to lower case, exactly as for the training text
with open(test_data_file) as test_file:
    test_text = test_file.read()
test_text = test_text.lower()

# Tokenize, then left-pad with n empty-string symbols
test_tokens = tokenizer.tokenize(test_text)
testing_data = pad_sequence(test_tokens, n, pad_left=True)

print("Length of test data:", len(testing_data))


Length of test data: 80


In [358]:
# Score every token from index n-1 onwards given the n-1 tokens before it.
# A high score means the model has seen this continuation in the training text.
scores = []
for i, item in enumerate(testing_data[n-1:]):
    s = model.score(item, testing_data[i:i+n-1])
    scores.append(s)

scores_np = np.array(scores)

print(scores)

# Grid dimensions. The row count is derived from the number of SCORES (there
# are n-1 fewer scores than padded tokens); using len(testing_data) could add
# a row that has no matching label.
width = 8
height = int(np.ceil(len(scores_np) / width))
print("Width, Height:", width, ",", height)

# Copy the scores into a zero-filled buffer that fills the rectangle exactly
a = np.zeros(width*height)
a[:len(scores_np)] = scores_np
diff = len(a) - len(scores_np)  # number of unused cells in the last row

# apply gaussian smoothing for aesthetics (done on the flat sequence, so the
# smoothing follows reading order across row boundaries)
a = gaussian_filter(a, sigma=1.0)

# reshape to fit rectangle
a = a.reshape(height, width)

# One slice of tokens per grid row, aligned with the scores above
row_starts = range(n-1, len(testing_data), width)
row_tokens = [list(testing_data[i:i+width]) for i in row_starts]

# Row labels for the y axis: the row's tokens joined, fixed to 60 characters
labels = [" ".join(tokens) for tokens in row_tokens]
labels = [f"{x:60.60}" for x in labels]

# Per-cell hover text. Built from the token slices rather than by splitting
# the joined label: splitting drops the empty pad token and would shift every
# word in the first row one cell to the left.
labels_individual = row_tokens
if labels_individual:
    labels_individual[-1] += [""]*diff

[0.5526143790849674, 0.38398692810457513, 0.8823529411764706, 0.7749554367201426, 0.8149509803921569, 0.8137254901960784, 0.8762254901960784, 0.8762254901960784, 0.8762254901960784, 0.8823529411764706, 0.7987967914438503, 0.6764705882352942, 0.0, 0.0, 0.00980392156862745, 0.5049019607843137, 0.7598039215686274, 0.8298319327731092, 0.6428104575163398, 0.8762254901960784, 0.8762254901960784, 0.8823529411764706, 0.7816399286987522, 0.768954248366013, 0.8137254901960784, 0.8786764705882353, 0.7928921568627452, 0.8774509803921569, 0.8198529411764706, 0.7987967914438503, 0.6691176470588236, 0.8137254901960784, 0.8774509803921569, 0.8137254901960784, 0.0, 0.00980392156862745, 0.0, 0.029411764705882353, 0.0, 0.0, 0.0, 0.029411764705882353, 0.004901960784313725, 0.0, 0.00980392156862745, 0.0, 0.0, 0.0, 0.00980392156862745, 0.0392156862745098, 0.0, 0.0392156862745098, 0.0, 0.0, 0.0, 0.0196078431372549, 0.00980392156862745, 0.0, 0.0, 0.0, 0.0392156862745098, 0.004201680672268907, 0.0, 0.0, 0.0, 0

In [359]:
# Heatmap trace: one cell per scored token, one row per label
heatmap = go.Heatmap(
    z=a,
    x0=0,
    dx=1,
    y=labels,
    zmin=0,
    zmax=1,
    customdata=labels_individual,
    hovertemplate='%{customdata} <br><b>Score:%{z:.3f}<extra></extra>',
    colorscale="burg",
)
fig = go.Figure(data=heatmap)

# Size the figure to the row count, use a monospaced font so the row labels
# line up, and flip the y axis so the text reads top to bottom.
fig.update_layout(
    height=height * 28,
    width=1000,
    font={"family": "Courier New"},
    yaxis={"autorange": "reversed"},
)
fig.show()