In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en import English
import numpy as np



In [5]:
nlp = English()
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7f838fb96f00>

In [6]:
text_corpus = """
Google celebrated British illustrator and artist Sir John Tenniel's 
200th birth anniversary with a doodle on February 28. An acclaimed 
Victorian painter, Tenniel is celebrated for his illustrations for 
Lewis Carroll's Alice's Adventures in Wonderland and Through the Looking-Glass.
Tenniel was born in Bayswater, West London in 1820. At the age of 20, Tenniel 
received a major eye injury and eventually, lost sight in his right eye. 
From a very early age, Tenniel was appreciated as a humorist and soon after, 
also cultured his talent for scholarly caricature.
His first illustration was for Samuel Carter Hall's The Book of British 
Ballads in 1842. Eight years later, he joined the historic weekly magazine 
Punch as a political cartoonist. Lewis Carroll noticed Tenniel's distinct style 
of work and in 1864, approached the artist to illustrate his book, Alice's 
Adventures in Wonderland. This association marked Carroll and Tenniel's creative 
partnership and continued with Through the Looking Glass in 1872. "The result: 
a series of classic characters, such as Alice and the Cheshire Cat, as depicted 
in the Doodle artwork's rendition of their iconic meeting-characters who, along 
with many others, remain beloved by readers of all ages to this day," the Google 
Doodle page says. After working with Lewis Carroll, Tenniel resumed his work with 
Punch. For his work, Tenniel also received a knighthood in 1893.
Sir John Tenniel died on February 25, 1914. He was 93.
"""

In [8]:
doc = nlp(text_corpus.replace("\n", ""))

In [9]:
doc[0]

Google

In [10]:
doc

Google celebrated British illustrator and artist Sir John Tenniel's 200th birth anniversary with a doodle on February 28. An acclaimed Victorian painter, Tenniel is celebrated for his illustrations for Lewis Carroll's Alice's Adventures in Wonderland and Through the Looking-Glass.Tenniel was born in Bayswater, West London in 1820. At the age of 20, Tenniel received a major eye injury and eventually, lost sight in his right eye. From a very early age, Tenniel was appreciated as a humorist and soon after, also cultured his talent for scholarly caricature.His first illustration was for Samuel Carter Hall's The Book of British Ballads in 1842. Eight years later, he joined the historic weekly magazine Punch as a political cartoonist. Lewis Carroll noticed Tenniel's distinct style of work and in 1864, approached the artist to illustrate his book, Alice's Adventures in Wonderland. This association marked Carroll and Tenniel's creative partnership and continued with Through the Looking Glass i

In [12]:
sentences = [sent.text.strip() for sent in doc.sents]

In [13]:
sentences[0]

"Google celebrated British illustrator and artist Sir John Tenniel's 200th birth anniversary with a doodle on February 28."

# Sentence Organizer

In [14]:
sentence_organizer = {k:v for v,k in enumerate(sentences)}

In [15]:
sentence_organizer

{"Google celebrated British illustrator and artist Sir John Tenniel's 200th birth anniversary with a doodle on February 28.": 0,
 "An acclaimed Victorian painter, Tenniel is celebrated for his illustrations for Lewis Carroll's Alice's Adventures in Wonderland and Through the Looking-Glass.": 1,
 'Tenniel was born in Bayswater, West London in 1820.': 2,
 'At the age of 20, Tenniel received a major eye injury and eventually, lost sight in his right eye.': 3,
 'From a very early age, Tenniel was appreciated as a humorist and soon after, also cultured his talent for scholarly caricature.': 4,
 "His first illustration was for Samuel Carter Hall's The Book of British Ballads in 1842.": 5,
 'Eight years later, he joined the historic weekly magazine Punch as a political cartoonist.': 6,
 "Lewis Carroll noticed Tenniel's distinct style of work and in 1864, approached the artist to illustrate his book, Alice's Adventures in Wonderland.": 7,
 'This association marked Carroll and Tenniel\'s creati

# TF-IDF Model

In [16]:
tf_idf_vectorizer = TfidfVectorizer(min_df=2,  max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                                    ngram_range=(1, 3), use_idf=1,smooth_idf=1, sublinear_tf=1, stop_words = 'english')

In [17]:
# Passing our sentences treating each as one document to TF-IDF vectorizer
tf_idf_vectorizer.fit(sentences)

TfidfVectorizer(min_df=2, ngram_range=(1, 3), smooth_idf=1,
                stop_words='english', strip_accents='unicode', sublinear_tf=1,
                token_pattern='\\w{1,}', use_idf=1)

In [18]:
# get sent-vec from tf-idf
sentence_vectors = tf_idf_vectorizer.transform(sentences)

In [21]:
print(sentence_vectors)

  (0, 32)	0.2507402576660486
  (0, 30)	0.14875395133014957
  (0, 29)	0.2827602606496029
  (0, 28)	0.2827602606496029
  (0, 27)	0.2827602606496029
  (0, 24)	0.18845309793231244
  (0, 17)	0.2827602606496029
  (0, 16)	0.2827602606496029
  (0, 15)	0.2827602606496029
  (0, 13)	0.2827602606496029
  (0, 12)	0.2827602606496029
  (0, 11)	0.2827602606496029
  (0, 8)	0.2827602606496029
  (0, 6)	0.2827602606496029
  (1, 33)	0.25361240272598884
  (1, 30)	0.1334199046399026
  (1, 26)	0.25361240272598884
  (1, 25)	0.25361240272598884
  (1, 24)	0.2861871357471958
  (1, 21)	0.25361240272598884
  (1, 20)	0.25361240272598884
  (1, 19)	0.22489312699291228
  (1, 18)	0.22489312699291228
  (1, 14)	0.25361240272598884
  (1, 11)	0.25361240272598884
  :	:
  (8, 20)	0.40338427600850774
  (8, 14)	0.40338427600850774
  (8, 10)	0.40338427600850774
  (8, 9)	0.32227288761467976
  (9, 24)	0.29519952227167706
  (9, 15)	0.44292555960612673
  (9, 12)	0.7499381624450494
  (9, 3)	0.392768307283971
  (10, 34)	0.386129614502

# Sentence Scoring

In [22]:
# Getting sentence scores for each sentences
sentence_scores = np.array(sentence_vectors.sum(axis=1)).ravel()

In [24]:
sentence_scores

array([3.69831017, 4.08663958, 1.        , 1.94791376, 1.35058657,
       1.70557256, 1.        , 4.08573981, 2.77457248, 1.88083155,
       2.60622417, 3.15900587, 0.        ])

In [25]:
sentence_scores.shape

(13,)

# Getting Top-N Sentences

In [26]:
# Getting top-n sentences
N = 3
top_n_sentences = [sentences[ind] for ind in np.argsort(sentence_scores, axis=0)[::-1][:N]]

In [27]:
top_n_sentences

["An acclaimed Victorian painter, Tenniel is celebrated for his illustrations for Lewis Carroll's Alice's Adventures in Wonderland and Through the Looking-Glass.",
 "Lewis Carroll noticed Tenniel's distinct style of work and in 1864, approached the artist to illustrate his book, Alice's Adventures in Wonderland.",
 "Google celebrated British illustrator and artist Sir John Tenniel's 200th birth anniversary with a doodle on February 28."]

# Performing Sentence Ordering

In [131]:
mapped_top_n_sentences = [(sentence,sentence_organizer[sentence]) for sentence in top_n_sentences]
print("Our top_n_sentence with their index: \n")
for element in mapped_top_n_sentences:
    print(element)

Our top_n_sentence with their index: 

("An acclaimed Victorian painter, Tenniel is celebrated for his illustrations for Lewis Carroll's Alice's Adventures in Wonderland and Through the Looking-Glass.", 1)
("Lewis Carroll noticed Tenniel's distinct style of work and in 1864, approached the artist to illustrate his book, Alice's Adventures in Wonderland.", 7)
("Google celebrated British illustrator and artist Sir John Tenniel's 200th birth anniversary with a doodle on February 28.", 0)


In [132]:
# Ordering our top-n sentences in their original ordering
mapped_top_n_sentences = sorted(mapped_top_n_sentences, key = lambda x: x[1])
ordered_scored_sentences = [element[0] for element in mapped_top_n_sentences]

In [133]:
# Our final summary
summary = " ".join(ordered_scored_sentences)
summary

"Google celebrated British illustrator and artist Sir John Tenniel's 200th birth anniversary with a doodle on February 28. An acclaimed Victorian painter, Tenniel is celebrated for his illustrations for Lewis Carroll's Alice's Adventures in Wonderland and Through the Looking-Glass. Lewis Carroll noticed Tenniel's distinct style of work and in 1864, approached the artist to illustrate his book, Alice's Adventures in Wonderland."

# TF - IDF

In [136]:
sentences

["Google celebrated British illustrator and artist Sir John Tenniel's 200th birth anniversary with a doodle on February 28.",
 "An acclaimed Victorian painter, Tenniel is celebrated for his illustrations for Lewis Carroll's Alice's Adventures in Wonderland and Through the Looking-Glass.",
 'Tenniel was born in Bayswater, West London in 1820.',
 'At the age of 20, Tenniel received a major eye injury and eventually, lost sight in his right eye.',
 'From a very early age, Tenniel was appreciated as a humorist and soon after, also cultured his talent for scholarly caricature.',
 "His first illustration was for Samuel Carter Hall's The Book of British Ballads in 1842.",
 'Eight years later, he joined the historic weekly magazine Punch as a political cartoonist.',
 "Lewis Carroll noticed Tenniel's distinct style of work and in 1864, approached the artist to illustrate his book, Alice's Adventures in Wonderland.",
 'This association marked Carroll and Tenniel\'s creative partnership and conti

**`splitting sentence in to words`**

In [137]:
words_in_sentences = []
for j in range(0, len(sentences)):
    words_in_sentences.append(sentences[j].split(" "))

In [138]:
sentences[0]

"Google celebrated British illustrator and artist Sir John Tenniel's 200th birth anniversary with a doodle on February 28."

In [139]:
print(words_in_sentences[0])

['Google', 'celebrated', 'British', 'illustrator', 'and', 'artist', 'Sir', 'John', "Tenniel's", '200th', 'birth', 'anniversary', 'with', 'a', 'doodle', 'on', 'February', '28.']


In [140]:
flat_list = [item for sublist in words_in_sentences for item in sublist]
unq = list(set(flat_list))

In [145]:
len(unq)

154

**counting words in sentence**

In [141]:
from collections import Counter
import pandas as pd

count_words_in_sentences = []
for j in range(0, len(sentences)):
    count_words_in_sentences.append(Counter(words_in_sentences[j]))

In [142]:
print(count_words_in_sentences[0])

Counter({'Google': 1, 'celebrated': 1, 'British': 1, 'illustrator': 1, 'and': 1, 'artist': 1, 'Sir': 1, 'John': 1, "Tenniel's": 1, '200th': 1, 'birth': 1, 'anniversary': 1, 'with': 1, 'a': 1, 'doodle': 1, 'on': 1, 'February': 1, '28.': 1})


**Normalizing word counts**

In [143]:
normalized_word_counts = []
for j in range(0, len(sentences)):
    normalized_word_counts.append([count_words_in_sentences[j].get(i, 0)/len(count_words_in_sentences[j]) for i in unq])

In [144]:
rows = []

for j in range(len(sentences)):
    row = pd.DataFrame([normalized_word_counts[j]], columns=unq)
    rows.append(row)

tf = pd.concat(rows)
tf.head()

Unnamed: 0,such,humorist,Doodle,says.,born,200th,At,magazine,artwork's,talent,...,early,distinct,association,Glass,28.,Ballads,right,for,After,This
0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0
0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0
0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,...,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0


In [146]:
# calculate nos of documents where the particular word was present

x = tf[tf > 0].count()
x

such        1
humorist    1
Doodle      1
says.       1
born        1
           ..
Ballads     1
right       1
for         3
After       1
This        1
Length: 154, dtype: int64

In [147]:
N = len(sentences)

N/x

such        13.000000
humorist    13.000000
Doodle      13.000000
says.       13.000000
born        13.000000
              ...    
Ballads     13.000000
right       13.000000
for          4.333333
After       13.000000
This        13.000000
Length: 154, dtype: float64

In [148]:
np.log(N/x)

such        2.564949
humorist    2.564949
Doodle      2.564949
says.       2.564949
born        2.564949
              ...   
Ballads     2.564949
right       2.564949
for         1.466337
After       2.564949
This        2.564949
Length: 154, dtype: float64

In [149]:
xx = pd.DataFrame(np.log(N/x)).T
xx

Unnamed: 0,such,humorist,Doodle,says.,born,200th,At,magazine,artwork's,talent,...,early,distinct,association,Glass,28.,Ballads,right,for,After,This
0,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,...,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,1.466337,2.564949,2.564949


In [150]:
idf = xx.loc[xx.index.repeat(N)]
idf.head()

Unnamed: 0,such,humorist,Doodle,says.,born,200th,At,magazine,artwork's,talent,...,early,distinct,association,Glass,28.,Ballads,right,for,After,This
0,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,...,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,1.466337,2.564949,2.564949
0,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,...,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,1.466337,2.564949,2.564949
0,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,...,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,1.466337,2.564949,2.564949
0,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,...,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,1.466337,2.564949,2.564949
0,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,...,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,1.466337,2.564949,2.564949


In [151]:
tf_idf = tf*idf.values
tf_idf.head()

Unnamed: 0,such,humorist,Doodle,says.,born,200th,At,magazine,artwork's,talent,...,early,distinct,association,Glass,28.,Ballads,right,for,After,This
0,0.0,0.0,0.0,0.0,0.0,0.142497,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.142497,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146634,0.0,0.0
0,0.0,0.0,0.0,0.0,0.320619,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.134997,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.134997,0.0,0.0,0.0
0,0.0,0.128247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.128247,...,0.128247,0.0,0.0,0.0,0.0,0.0,0.0,0.073317,0.0,0.0


In [152]:
tf_idf.loc[:, "score"] = tf_idf.values.sum(axis=1)
tf_idf.head()

Unnamed: 0,such,humorist,Doodle,says.,born,200th,At,magazine,artwork's,talent,...,distinct,association,Glass,28.,Ballads,right,for,After,This,score
0,0.0,0.0,0.0,0.0,0.0,0.142497,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.142497,0.0,0.0,0.0,0.0,0.0,1.949693
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.146634,0.0,0.0,1.919744
0,0.0,0.0,0.0,0.0,0.320619,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.968451
0,0.0,0.0,0.0,0.0,0.0,0.0,0.134997,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.134997,0.0,0.0,0.0,1.866432
0,0.0,0.128247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.128247,...,0.0,0.0,0.0,0.0,0.0,0.0,0.073317,0.0,0.0,2.023716


In [153]:
tf_idf['index'] = list(range(0, len(sentences)))

In [154]:
tf_idf.sort_values(['score'], ascending= False).head()

Unnamed: 0,such,humorist,Doodle,says.,born,200th,At,magazine,artwork's,talent,...,association,Glass,28.,Ballads,right,for,After,This,score,index
0,0.065768,0.0,0.131536,0.065768,0.0,0.0,0.0,0.0,0.065768,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.42073,9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183211,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.230511,6
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.102851,12
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.170997,0.0,0.097756,0.0,0.0,2.07582,5
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.150879,0.150879,0.0,0.0,0.0,0.0,0.0,0.150879,2.031463,8
