In [1]:
!pip install transformers



## Importing the libraries

In [104]:
import requests
from transformers import AutoTokenizer

import nltk
nltk.download('stopwords')
nltk.download('punkt')

import re
import string

from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize as nltk_sent_tokenize
from nltk.tokenize import word_tokenize as nltk_word_tokenize

from sklearn.feature_extraction.text import CountVectorizer , TfidfTransformer
import numpy as np
from nltk.corpus import stopwords

from scipy.spatial.distance import cosine

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akaur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akaur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Defining Function to  Fetch the information from Wikipedia

In [136]:
# https://en.wikipedia.org/w/api.php

def fetch_and_save_wiki(title):
  response = requests.get(
      'https://en.wikipedia.org/w/api.php',

      params = {
          "action": "query",
          "format" : "json",
          "titles":title,
          "prop":"extracts",
          "explaintext": True
      }
  ).json()

  page = next(iter(response["query"]["pages"].values()))
  wiki_text = page["extract"]

  return wiki_text

## Importing pre Trained tokenised model for text  tokenisation

In [106]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

## Functions for Cleaning the information and counting the tokens.

In [107]:
def clean_text(text):
  text = re.sub(r'[^A-Za-z0-9\s.\(\)[\]{\}]+' , '' , text)
  # print(text)
  text = text.lower()
  # print(text)
  text = " ".join(text.split())
  return text


def count_tokens(text):
  tokens = tokenizer.encode(text , add_special_tokens = True)
  return(len(tokens))

## Defining the Players whome information is to be summarised  and storing them.

In [137]:
soccar_player = [
    "Virender Sehwag",
    "Sachin Tendulkar",
    "Virat Kohli",
    "Suresh Raina",
    "MS Dhoni",
    "Piyush Chawla",
    "Zaheer Khan",
    "Ashish Nehra",
    "Yuvraj Singh",
    "Harbhajan Singh",
    "Praveen Kumar",
    "Gautam Gambhir",
    "Ravichandra Ashwin",
    "Munaf Patel",
    "Yusuf Pathan"
]

data = []
for player in soccar_player:
  info = fetch_and_save_wiki(player)
  tokens = tokenizer.encode(info , add_special_tokens= True , truncation = True , max_length = 30000)
  num_tokens = len(tokens)
  data.append([player , info , num_tokens])


KeyError: 'extract'

In [138]:
import pandas as pd




## Creating the dataframe 

In [139]:
df = pd.DataFrame(data , columns = ['cricket_player' , "player_information" , "num_tokens"])
df.head()

Unnamed: 0,cricket_player,player_information,num_tokens
0,Virender Sehwag,"Virender Sehwag (, born 20 October 1978) is a ...",7835
1,Sachin Tendulkar,Sachin Ramesh Tendulkar ( ; pronounced [sətɕin...,15656
2,Virat Kohli,Virat Kohli (Hindi pronunciation: [ʋɪˈɾɑːʈ ˈko...,6372
3,Suresh Raina,Suresh Raina (; born 27 November 1986) is an I...,2597
4,MS Dhoni,Mahendra Singh Dhoni ( ; born 7 July 1981) is ...,6065


## Cleaning the player Information

In [140]:
df['player_information_cleaned'] = df['player_information'].apply(clean_text)

## Counting the  number of words in Tokenised text

In [141]:
df["cleaned_token_player"] = df['player_information_cleaned'].apply(count_tokens)

Token indices sequence length is longer than the specified maximum sequence length for this model (7303 > 512). Running this sequence through the model will result in indexing errors


In [142]:
df.head()

Unnamed: 0,cricket_player,player_information,num_tokens,player_information_cleaned,cleaned_token_player
0,Virender Sehwag,"Virender Sehwag (, born 20 October 1978) is a ...",7835,virender sehwag ( born 20 october 1978) is a f...,7303
1,Sachin Tendulkar,Sachin Ramesh Tendulkar ( ; pronounced [sətɕin...,15656,sachin ramesh tendulkar ( pronounced [stin teu...,14066
2,Virat Kohli,Virat Kohli (Hindi pronunciation: [ʋɪˈɾɑːʈ ˈko...,6372,virat kohli (hindi pronunciation [ koli] born ...,5712
3,Suresh Raina,Suresh Raina (; born 27 November 1986) is an I...,2597,suresh raina ( born 27 november 1986) is an in...,2399
4,MS Dhoni,Mahendra Singh Dhoni ( ; born 7 July 1981) is ...,6065,mahendra singh dhoni ( born 7 july 1981) is an...,5653


In [143]:
df['player_information_cleaned'][0]



## Removing Stop Words

In [144]:
## Sentence tokenization

def sent_tokenize(text):
  sents = nltk_sent_tokenize(text)

  sent_filtered = []

  for s in sents:
    sent_filtered.append(s)
  return sent_filtered


def cleanup_sentences(text):
  stop_words = set(stopwords.words("english"))
  sentences = sent_tokenize(text)

  sentences_cleaned = []

  for sent in sentences:
    words = nltk_word_tokenize(sent)
    words = [w for w in words if w not in string.punctuation]
    words = [w for w in words if not w.lower() in stop_words]
    words = [w.lower() for w in words]
    words = " ".join(words)
    sentences_cleaned.append(words)

  return sentences_cleaned



In [148]:
df["cleaned_sentences"] = df['player_information_cleaned'].apply(cleanup_sentences)

In [149]:
df.head()

Unnamed: 0,cricket_player,player_information,num_tokens,player_information_cleaned,cleaned_token_player,cleaned_sentences
0,Virender Sehwag,"Virender Sehwag (, born 20 October 1978) is a ...",7835,virender sehwag ( born 20 october 1978) is a f...,7303,[virender sehwag born 20 october 1978 former i...
1,Sachin Tendulkar,Sachin Ramesh Tendulkar ( ; pronounced [sətɕin...,15656,sachin ramesh tendulkar ( pronounced [stin teu...,14066,[sachin ramesh tendulkar pronounced stin teulk...
2,Virat Kohli,Virat Kohli (Hindi pronunciation: [ʋɪˈɾɑːʈ ˈko...,6372,virat kohli (hindi pronunciation [ koli] born ...,5712,[virat kohli hindi pronunciation koli born 5 n...
3,Suresh Raina,Suresh Raina (; born 27 November 1986) is an I...,2597,suresh raina ( born 27 november 1986) is an in...,2399,[suresh raina born 27 november 1986 indian for...
4,MS Dhoni,Mahendra Singh Dhoni ( ; born 7 July 1981) is ...,6065,mahendra singh dhoni ( born 7 july 1981) is an...,5653,[mahendra singh dhoni born 7 july 1981 indian ...


In [150]:
## TFIDF

def get_tf_idf(sentences):
  vectorizer = CountVectorizer()
  sent_word_matrix = vectorizer.fit_transform(sentences)

  transformer = TfidfTransformer(norm = None , smooth_idf=False, sublinear_tf=False)
  tfidf = transformer.fit_transform(sent_word_matrix)
  tfidf = tfidf.toarray()

  #Calculate the Centroid
  centroid_vector = tfidf.sum(axis=0)
  centroid_vector = np.divide(centroid_vector , centroid_vector.max())

  feature_names = vectorizer.get_feature_names_out()

  relevant_vector_indices = np.where(centroid_vector > 0.3)[0]
  word_list = [feature_names[idx] for idx in relevant_vector_indices]

  return word_list

Centroicd for "early proponent thenmanager pep guardiola early august 2009 declared messi best player ever seen". = 5.78

Centroid for entire document": 3.78

90
70

In [151]:
df["most_important_words"]= df["cleaned_sentences"].apply(get_tf_idf)

In [152]:
df['cleaned_sentences'][0]

['virender sehwag born 20 october 1978 former indian cricketer represented india 1999 2013. widely regarded one destructive openers one greatest batsman era played delhi capitals ipl delhi haryana indian domestic cricket',
 'played first one day international 1999 joined indian test side 2001. april 2009 sehwag became first indian honoured wisden leading cricketer world performance 2008 subsequently becoming first player nationality retain award 2009. worked standin captain occasionally absence main captain india also worked vicecaptain indian squad',
 'former captain delhi daredevils delhi ranji team',
 'time india sehwag member team one joint winners 2002 icc champions trophy winners 2007 t20 world cup winners 2011 cricket world cup',
 '2002 icc champions trophy sehwag highest run scorer 271 runs',
 '2023 inducted icc cricket hall fame',
 'sehwag holds multiple records including highest score made indian test cricket 319 south africa m. a. chidambaram stadium chennai also fastest tri

In [153]:
df.head()

Unnamed: 0,cricket_player,player_information,num_tokens,player_information_cleaned,cleaned_token_player,cleaned_sentences,most_important_words
0,Virender Sehwag,"Virender Sehwag (, born 20 October 1978) is a ...",7835,virender sehwag ( born 20 october 1978) is a f...,7303,[virender sehwag born 20 october 1978 former i...,"[also, balls, batting, centuries, century, cri..."
1,Sachin Tendulkar,Sachin Ramesh Tendulkar ( ; pronounced [sətɕin...,15656,sachin ramesh tendulkar ( pronounced [stin teu...,14066,[sachin ramesh tendulkar pronounced stin teulk...,"[cricket, cup, first, india, indian, innings, ..."
2,Virat Kohli,Virat Kohli (Hindi pronunciation: [ʋɪˈɾɑːʈ ˈko...,6372,virat kohli (hindi pronunciation [ koli] born ...,5712,[virat kohli hindi pronunciation koli born 5 n...,"[2017, 2018, also, brand, captain, centuries, ..."
3,Suresh Raina,Suresh Raina (; born 27 November 1986) is an I...,2597,suresh raina ( born 27 november 1986) is an in...,2399,[suresh raina born 27 november 1986 indian for...,"[2020, also, balls, captain, cricket, cup, due..."
4,MS Dhoni,Mahendra Singh Dhoni ( ; born 7 July 1981) is ...,6065,mahendra singh dhoni ( born 7 july 1981) is an...,5653,[mahendra singh dhoni born 7 july 1981 indian ...,"[2011, captain, cricket, cup, dhoni, first, ic..."


In [154]:
df["most_important_words"][1]

['cricket',
 'cup',
 'first',
 'india',
 'indian',
 'innings',
 'match',
 'mumbai',
 'odi',
 'runs',
 'scored',
 'series',
 'team',
 'tendulkar',
 'tendulkars',
 'test',
 'world']

In [155]:
def word_vector_cache(sentences , embedding_model):
  word_vectors = dict()
  for sent in sentences:
    words = nltk_word_tokenize(sent)
    for w in words:
      word_vectors.update({w:embedding_model.wv[w]})
    return word_vectors

In [25]:
## embedding representation

def build_embedding_representation(words , word_vectors , embedding_model):
  embedding_representation = np.zeros(embedding_model.vector_size , dtype = "float32")
  word_vector_key = set(word_vectors.keys())

  count = 0
  for w in words:
    if w in word_vector_key:
      embedding_representation = embedding_representation + word_vectors[w]
      count += 1

  if count != 0:
    embedding_representation = np.divide(embedding_representation , count)

  return embedding_representation


In [157]:
def similarity(v1 , v2):
  score = 0.0
  if np.count_nonzero(v1) !=0 and np.count_nonzero(v2) !=0:
    score = ((1 - cosine(v1,v2)) + 1)/2

  return score

In [158]:
def summarize(text , embedding_model):
  raw_sentences = sent_tokenize(text)
  clean_sentences = cleanup_sentences(text)

  for i , s in enumerate(raw_sentences):
    print(i , s)

  for i , s in enumerate(clean_sentences):
    print(i , s)


  centroid_words = get_tf_idf(clean_sentences)
  print(len(centroid_words) , centroid_words)

  word_vectors = word_vector_cache(clean_sentences , embedding_model)

  # Centroid embedding representation

  centroid_vectors = build_embedding_representation(centroid_words ,word_vectors ,embedding_model )
  sentences_score = []

  for i in range(len(clean_sentences)):
    score = []
    words = clean_sentences[i].split()

    #sentence embedding representation
    sentence_vector = build_embedding_representation(words , word_vectors , embedding_model)

    # Cosine similarity
    score = similarity(sentence_vector ,centroid_vectors)
    sentences_score.append((i , raw_sentences[i] , score , sentence_vector))

    sentences_score_sort = sorted(sentences_score , key = lambda el:el[2] , reverse = True)

    for s in sentences_score_sort:
      print(s[0] , s[1] , s[2])

    count = 0
    sentence_summary = []

    for s in sentences_score_sort:
      if count > 100:
        break
      include_flag = True

      for ps in sentence_summary:
        sim = similarity(s[3] , ps[3])
        if sim > 0.95:
          include_flag = False

      if include_flag:
        sentence_summary.append(s)
        count += len(s[1].split())

      sentence_summary = sorted(sentence_summary , key = lambda el:el[0] , reverse = False)

    summary = "\n".join(s[1] for s in sentence_summary)
    print(summary)

    return summary

In [159]:
df['cleaned_sentences'] = df['cleaned_sentences'].astype(str)
sentences = [nltk.word_tokenize(sent) for sent in df["cleaned_sentences"].values]

In [160]:
model = Word2Vec(sentences , min_count = 1 , sg = 1)
df["summary"] = df["cleaned_sentences"].apply(lambda x:summarize(x , model))

10 ['century', 'cricket', 'first', 'indian', 'match', 'odi', 'runs', 'scored', 'sehwag', 'test']
0 ['sachin ramesh tendulkar pronounced stin teulk born 24 april 1973 indian former international cricketer captained indian national team', 'widely regarded one greatest batsmen history cricket', 'hailed worlds prolific batsman time alltime highest runscorer odi test cricket 18000 runs 15000 runs respectively', 'also holds record receiving player match awards international cricket', 'tendulkar member parliament rajya sabha presidential nomination 2012 2018. tendulkar took cricket age eleven made test match debut 15 november 1989 pakistan karachi age sixteen went represent mumbai domestically india internationally 24 years', '2002 halfway career wisden ranked secondgreatest test batsman time behind bradman secondgreatest odi batsman time behind viv richards', 'year tendulkar part team one jointwinners 2002 icc champions trophy', 'later career tendulkar part indian team 2011 cricket world cup

In [161]:
df.head()

Unnamed: 0,cricket_player,player_information,num_tokens,player_information_cleaned,cleaned_token_player,cleaned_sentences,most_important_words,summary
0,Virender Sehwag,"Virender Sehwag (, born 20 October 1978) is a ...",7835,virender sehwag ( born 20 october 1978) is a f...,7303,['virender sehwag born 20 october 1978 former ...,"[also, balls, batting, centuries, century, cri...",['virender sehwag born 20 october 1978 former ...
1,Sachin Tendulkar,Sachin Ramesh Tendulkar ( ; pronounced [sətɕin...,15656,sachin ramesh tendulkar ( pronounced [stin teu...,14066,['sachin ramesh tendulkar pronounced stin teul...,"[cricket, cup, first, india, indian, innings, ...",['sachin ramesh tendulkar pronounced stin teul...
2,Virat Kohli,Virat Kohli (Hindi pronunciation: [ʋɪˈɾɑːʈ ˈko...,6372,virat kohli (hindi pronunciation [ koli] born ...,5712,['virat kohli hindi pronunciation koli born 5 ...,"[2017, 2018, also, brand, captain, centuries, ...",['virat kohli hindi pronunciation koli born 5 ...
3,Suresh Raina,Suresh Raina (; born 27 November 1986) is an I...,2597,suresh raina ( born 27 november 1986) is an in...,2399,['suresh raina born 27 november 1986 indian fo...,"[2020, also, balls, captain, cricket, cup, due...",['suresh raina born 27 november 1986 indian fo...
4,MS Dhoni,Mahendra Singh Dhoni ( ; born 7 July 1981) is ...,6065,mahendra singh dhoni ( born 7 july 1981) is an...,5653,['mahendra singh dhoni born 7 july 1981 indian...,"[2011, captain, cricket, cup, dhoni, first, ic...",['mahendra singh dhoni born 7 july 1981 indian...


In [162]:
df["summary"][0]



In [41]:
df["summary_tokens"] = df["summary"].apply(count_tokens)

In [42]:
df.head()

Unnamed: 0,cricket_player,player_information,num_tokens,player_information_cleaned,cleaned_token_player,cleaned_sentences,most_important_words,summary,summary_tokens
0,Virender Sehwag,"Virender Sehwag (, born 20 October 1978) is a ...",7835,virender sehwag ( born 20 october 1978) is a f...,7303,['virender sehwag born 20 october 1978 former ...,"[also, balls, batting, centuries, century, cri...",['virender sehwag born 20 october 1978 former ...,5203
1,Sachin Tendulkar,Sachin Ramesh Tendulkar ( ; pronounced [sətɕin...,15656,sachin ramesh tendulkar ( pronounced [stin teu...,14066,['sachin ramesh tendulkar pronounced stin teul...,"[cricket, cup, first, india, indian, innings, ...",['sachin ramesh tendulkar pronounced stin teul...,10417
2,Virat Kohli,Virat Kohli (Hindi pronunciation: [ʋɪˈɾɑːʈ ˈko...,6372,virat kohli (hindi pronunciation [ koli] born ...,5712,['virat kohli hindi pronunciation koli born 5 ...,"[2017, 2018, also, brand, captain, centuries, ...",['virat kohli hindi pronunciation koli born 5 ...,133
3,Suresh Raina,Suresh Raina (; born 27 November 1986) is an I...,2597,suresh raina ( born 27 november 1986) is an in...,2399,['suresh raina born 27 november 1986 indian fo...,"[2020, also, balls, captain, cricket, cup, due...",['suresh raina born 27 november 1986 indian fo...,1766
4,MS Dhoni,Mahendra Singh Dhoni ( ; born 7 July 1981) is ...,6065,mahendra singh dhoni ( born 7 july 1981) is an...,5653,['mahendra singh dhoni born 7 july 1981 indian...,"[2011, captain, cricket, cup, dhoni, first, ic...",['mahendra singh dhoni born 7 july 1981 indian...,2021


##  We can observe number of tokens have been reduced from 7835 to 7303 and ultimately to 5203 for Virendra Sehwag ,thus helping in Text Summarisation.