In [8]:
import numpy as np
import pandas as pd
import textwrap
import nltk
from nltk import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer #detokanize the list of tokens back into a single string

In [9]:
nltk.download('punkt') #downloads the punkt module used for tokenization

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
df = pd.read_csv('bbc_text.csv') #load the data

In [11]:
df.head() #displays first five rows

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [12]:
labels = set(df['labels'])
labels

{'business', 'entertainment', 'politics', 'sport', 'tech'}

In [13]:
label = 'business' #we choose only of the labels we want to train from

In [14]:
texts = df[df['labels'] == label]['text']
texts.head() #choose text from rows that match the label of 'business'

0    Ad sales boost Time Warner profit\n\nQuarterly...
1    Dollar gains on Greenspan speech\n\nThe dollar...
2    Yukos unit buyer faces loan claim\n\nThe owner...
3    High fuel prices hit BA's profits\n\nBritish A...
4    Pernod takeover talk lifts Domecq\n\nShares in...
Name: text, dtype: object

In [15]:
probs = {} #key:(w(t-1)=prev word, w(t+1))=follow word; value: {w(t): count(w(t)) all possible middle words}
                                                       #value is a nested dictionary containing all possible middle words
for doc in texts: #used for collecting counts
  lines = doc.split("\n")
  for line in lines:
    tokens = word_tokenize(line)#tokenize the lines, we get a list of tokens
    for i in range(len(tokens) - 2):#iterating trigrams, so three tokens taken together
      t_0 = tokens[i]
      t_1 = tokens[i + 1] #middle word
      t_2 = tokens[i + 2]
      key = (t_0, t_2)
      if key not in probs:
        probs[key] = {}# create a new entry
      
      # add count for middle token
      if t_1 not in probs[key]:
        probs[key][t_1] = 1
      else:
        probs[key][t_1] += 1

In [16]:
# normalize probabilities
for key, d in probs.items(): #key=context words,d=prob. for the middle word
  # d should represent a distribution
  total = sum(d.values())
  for k, v in d.items():
    d[k] = v / total

In [37]:
#probs
#model is complete

In [18]:
texts.iloc[0].split("\n") # iloc() function in python is defined in the Pandas module that helps us to select a specific row or column from the data set.

['Ad sales boost Time Warner profit',
 '',
 'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.',
 '',
 'The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.',
 '',
 "Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers a

In [19]:
def spin_document(doc):
  #split the document into lines (paragraphs)
  #spin each paragraph one by one
  lines = doc.split("\n")
  output = []
  for line in lines:
    if line: #to check if line is empty or not
      new_line = spin_line(line)
    else:
      new_line = line
    output.append(new_line)
  return "\n".join(output)


In [20]:
detokenizer = TreebankWordDetokenizer() #covert list of tokens back into string
                                        #creating an instance of class treebank..
                                        #TreebankWordDetikenizer- detokenizes in such a way by putting spaces bw two token s only when req ex-bw two words 
                                        #and doesnt use space when not required ex- bw a word and punctuation                                      

In [21]:
texts.iloc[0].split("\n")[2] #pick a random sentence from one of our documents

'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.'

In [22]:
detokenizer.detokenize(word_tokenize(texts.iloc[0].split("\n")[2])) #detokenized string same as the original string

'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.'

In [23]:
def sample_word(d):
  p0 = np.random.random()
  cumulative = 0
  for t, p in d.items():
    cumulative += p
    if p0 < cumulative:
      return t
  assert(False) # should never get here

In [24]:
def spin_line(line):
  tokens = word_tokenize(line)
  i = 0
  output = [tokens[0]]
  while i < (len(tokens) - 2):
    t_0 = tokens[i]
    t_1 = tokens[i + 1]
    t_2 = tokens[i + 2]
    key = (t_0, t_2)
    p_dist = probs[key]
    if len(p_dist) > 1 and np.random.random() < 0.3:
      # let's replace the middle word
      middle = sample_word(p_dist)
      #output.append(t_1)
      #output.append("<" + middle + ">")
      output.append( middle )
      output.append(t_2)

      # we won't replace the 3rd token since the middle
      # token was dependent on it
      # instead, skip ahead 2 steps
      i += 2
    else:
      # we won't replace this middle word
      output.append(t_1)
      i += 1
  # append the final token - only if there was no replacement
  if i == len(tokens) - 2:
    output.append(tokens[-1])
  return detokenizer.detokenize(output)

In [25]:
np.random.seed(1)

In [26]:
i = np.random.choice(texts.shape[0])
doc = texts.iloc[i]
new_doc = spin_document(doc)

In [27]:
doc

'Deutsche Telekom sees mobile gain\n\nGerman telecoms firm Deutsche Telekom saw strong fourth quarter profits on the back of upbeat US mobile earnings and better-than-expected asset sales.\n\nNet profit came in at 1.4bn euros (£960m; $1.85bn), a dramatic change from the loss of 364m euros in 2003. Sales rose 2.8% to 14.96bn euros. Sales of stakes in firms including Russia\'s OAO Mobile Telesystems raised 1.17bn euros. This was more than expected and helped to bring debt down to 35.8bn euros.\n\nA year ago, debt was more than 11bn euros higher. T-Mobile USA, the company\'s American mobile business, made a strong contribution to profits. "It\'s a seminal achievement that they cut debt so low. That gives them some head room to invest in growth now," said Hannes Wittig, telecoms analyst at Dresdner Kleinwort Wasserstein. The company also said it would resume paying a dividend, after two years in which it focused on cutting debt.'

In [28]:
print(textwrap.fill(
    new_doc, replace_whitespace=False, fix_sentence_endings=True))

Deutsche Telekom sees mobile gain

German telecoms firm Deutsche
Telekom saw strong fourth quarter profits saved the order of upbeat US
mobile earnings and better-than-expected asset sales.

Net profit came
in at 1.4bn euros (£960m; $1.85bn), a dramatic change, the loss of
364m euros in 2003 . Sales rose 2.8% to 14.96bn euros . Sales of falls
in firms including Russia's OAO Mobile Telesystems raised 1.17bn euros
. This was more than expected and helped to bring debt down to 43.9bn
euros.

A year 2050, debt was larger than 11bn euros). T-Mobile USA,
the company's American mobile business has made a profitable
contribution to build . "That's a seminal achievement that they cut
debt so low . That gives them some head room to invest in growth now,"
said Hannes Wittig, telecoms analyst at Dresdner Kleinwort Wasserstein
. The company has said it would be paying a dividend, after two years
in which it focused on cutting debt.


In [29]:
##EVALUATION CRITERIA

#BLUE SCORE- The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric for evaluating a 
#generated sentence to a reference sentence. A perfect match results in a score of 1.0, whereas a perfect mismatch results in a score of 0.0.
#The primary programming task for a BLEU implementor is to compare n-grams of the candidate with the n-grams of the reference translation 
#and count the number of matches.
lines = doc.split("\n")
for line in lines:
  tokens=word_tokenize(line)

lines2 = new_doc.split("\n")
for line in lines2:
  tokens2=word_tokenize(line)

print(tokens)
print(tokens2)

# n-gram individual BLEU
from nltk.translate.bleu_score import sentence_bleu
print(' 1-gram BLUE score: %f' % sentence_bleu(tokens, tokens2, weights=(1, 0, 0, 0)))
print(' 2-gram BLUE score: %f' % sentence_bleu(tokens, tokens2, weights=(0, 1, 0, 0)))
print(' 3-gram BLUE score: %f' % sentence_bleu(tokens, tokens2, weights=(0, 0, 1, 0)))
print(' 4-gram BLUE score: %f' % sentence_bleu(tokens, tokens2, weights=(0, 0, 0, 1)))

['A', 'year', 'ago', ',', 'debt', 'was', 'more', 'than', '11bn', 'euros', 'higher', '.', 'T-Mobile', 'USA', ',', 'the', 'company', "'s", 'American', 'mobile', 'business', ',', 'made', 'a', 'strong', 'contribution', 'to', 'profits', '.', '``', 'It', "'s", 'a', 'seminal', 'achievement', 'that', 'they', 'cut', 'debt', 'so', 'low', '.', 'That', 'gives', 'them', 'some', 'head', 'room', 'to', 'invest', 'in', 'growth', 'now', ',', "''", 'said', 'Hannes', 'Wittig', ',', 'telecoms', 'analyst', 'at', 'Dresdner', 'Kleinwort', 'Wasserstein', '.', 'The', 'company', 'also', 'said', 'it', 'would', 'resume', 'paying', 'a', 'dividend', ',', 'after', 'two', 'years', 'in', 'which', 'it', 'focused', 'on', 'cutting', 'debt', '.']
['A', 'year', '2050', ',', 'debt', 'was', 'larger', 'than', '11bn', 'euros', ')', '.', 'T-Mobile', 'USA', ',', 'the', 'company', "'s", 'American', 'mobile', 'business', 'has', 'made', 'a', 'profitable', 'contribution', 'to', 'build', '.', '``', 'That', "'s", 'a', 'seminal', 'achie

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [30]:
## Comparing with QUILL BOT

#text generated by quill bot for the smae doc
new_doc2='''Deutsche Telekom predicts an increase in mobile''

Deutsche Telekom, a German telecommunications company, reported good fourth quarter profits because to encouraging US mobile earnings and better-than-expected asset sales.


Net profit was 1.4 billion euros ($1.85 billion), a significant improvement above the deficit of 364 million euros in 2003. 14.96 billion euros in sales up 2.8%. 1.17 billion euros were collected through the sale of stock in companies including Russia's OAO Mobile Telesystems. This exceeded expectations and reduced debt to 35.8 billion euros.

Debt was over 11 billion euros higher a year ago. The company's American mobile division, T-Mobile USA, significantly increased profits. "That they reduced debt to such a low level is a landmark achievement. They now have some breathing room to invest in growth thanks to it'''


In [31]:
print(new_doc2)

Deutsche Telekom predicts an increase in mobile''

Deutsche Telekom, a German telecommunications company, reported good fourth quarter profits because to encouraging US mobile earnings and better-than-expected asset sales.


Net profit was 1.4 billion euros ($1.85 billion), a significant improvement above the deficit of 364 million euros in 2003. 14.96 billion euros in sales up 2.8%. 1.17 billion euros were collected through the sale of stock in companies including Russia's OAO Mobile Telesystems. This exceeded expectations and reduced debt to 35.8 billion euros.

Debt was over 11 billion euros higher a year ago. The company's American mobile division, T-Mobile USA, significantly increased profits. "That they reduced debt to such a low level is a landmark achievement. They now have some breathing room to invest in growth thanks to it


In [32]:
#comparing quillbot and article spinner
lines3 = new_doc2.split("\n")
for line in lines3:
  tokens3=word_tokenize(line)

#print(tokens)
print(tokens3)

# n-gram individual BLEU
from nltk.translate.bleu_score import sentence_bleu
print(' 1-gram BLUE score: %f' % sentence_bleu(tokens, tokens3, weights=(1, 0, 0, 0)))
print(' 2-gram BLUE score: %f' % sentence_bleu(tokens, tokens3, weights=(0, 1, 0, 0)))
print(' 3-gram BLUE score: %f' % sentence_bleu(tokens, tokens3, weights=(0, 0, 1, 0)))
print(' 4-gram BLUE score: %f' % sentence_bleu(tokens, tokens3, weights=(0, 0, 0, 1)))


['Debt', 'was', 'over', '11', 'billion', 'euros', 'higher', 'a', 'year', 'ago', '.', 'The', 'company', "'s", 'American', 'mobile', 'division', ',', 'T-Mobile', 'USA', ',', 'significantly', 'increased', 'profits', '.', '``', 'That', 'they', 'reduced', 'debt', 'to', 'such', 'a', 'low', 'level', 'is', 'a', 'landmark', 'achievement', '.', 'They', 'now', 'have', 'some', 'breathing', 'room', 'to', 'invest', 'in', 'growth', 'thanks', 'to', 'it']
 1-gram BLUE score: 0.075472
 2-gram BLUE score: 0.000000
 3-gram BLUE score: 0.000000
 4-gram BLUE score: 0.000000
