In [37]:
import spacy
from nltk.corpus import stopwords
from string import punctuation
from spacy.lang.en.stop_words import STOP_WORDS
from heapq import nlargest

In [38]:
def text_gen(text_data):
  stop_words = list(STOP_WORDS)
  nlp = spacy.load('en_core_web_sm')
  doc = nlp(text)
  return doc

In [39]:
def frequency_table(text_data):
  
  stop_words = list(STOP_WORDS)
  doc = text_gen(text_data)
  tokens = [token.text for token in doc]      # building list of the token
  punct  = punctuation + '\n'           #  adding /n also in punctuation list as it is not presnt in that pre-defined library
  # creating word frequency table  i.e the count of each word
  word_frequencies = {}
  for word in doc:
    if word.text.lower() not in stop_words:
      if word.text.lower() not in punct:
        if word.text not in word_frequencies.keys():  # even it should not be present in the keys of dictionary
          word_frequencies[word.text] = 1
        else:
          word_frequencies[word.text] += 1
  

  # Taking the maximum frequency 
  max_frequency = max(word_frequencies.values())

  # Creating frequency table
  for word in word_frequencies.keys():
    word_frequencies[word] = word_frequencies[word]/max_frequency
  return word_frequencies


In [43]:
def text_summarization(text_data):

  doc = text_gen(text_data)

  word_frequencies = frequency_table(text_data)

  sentence_tokens = [sent for sent in doc.sents]
  
  sentence_scores = {}
  for sent in sentence_tokens:
    for word in sent:
      if word.text.lower() in word_frequencies.keys():
        # here we are going to add the word of maximum value sentences
        if sent not in sentence_scores.keys():
          sentence_scores[sent] = word_frequencies[word.text.lower()]
        else:
          sentence_scores[sent] += word_frequencies[word.text.lower()]
  # Selecting the total length of 
  select_length = int(len(sentence_tokens) * 0.3)    # here we are selecting only 30% of the total sentences
  # We need to select 6 sentences having  maximum frequency count
  summary = nlargest(select_length, sentence_scores, key = sentence_scores.get)

  # Now we need to combine these sentences
  final_summary = [word.text for word in summary]


  summarization = " ".join(final_summary)
  return  summarization


In [44]:
text_data = ''' 
Anushka Sharma confesses she's not having the easiest time learning cricketAnushka Sharma has shared a new selfie from the prep for her upcoming cricket-based film Chakda Express. The actor is evidently having a tough time getting the hang of cricket moves.Anushka Sharma is not having the easiest time polishing. her cricket skills before she begins shoot for Chakda 'Xpress. On Wednesday, she shared a selfie as she took a short rest after a practice session. The selfie showed her in a white T-shirt with her hair tied in the back. She had her fist against her lips and looked in the camera. (Also read: Anushka Sharma reveals she turns to husband Virat Kohli for batting tips as she preps for Chakda Xpress)“Kaash bachpan mein kuch toh cricket khela hota toh aaj haalat aise na hoti (I wish I had played some cricket in my childhood, perhaps then I would not have suffered like this today),” she wrote with her photo. Anushka plays cricket star Jhulan Goswami in the movie.Chakda 'Xpress was announced in January with a teaser. Talking about the film, Anushka wrote in her post: “Chakda Xpress is inspired by the life and times of former Indian captain Jhulan Goswami and it will be an eye-opener into the world of women’s cricket. At a time when Jhulan decided to become a cricketer and make her country proud on the global stage, it was very tough for women to even think of playing the sport. This film is a dramatic retelling of several instances that shaped her life and also women’s cricket.” Directed by Prosit Roy, Chakda Xpress is based on the life of Jhulan, who became the second Indian woman cricketer to receive the Padma Shri, in 2012.The film marks Anushka's comeback to acting after four years. She was last seen with Shah Rukh Khan in Zero, which released in 2018. She took a long break after that and even welcomed her first child, daughter Vamika.Anushka recently told Harper’s Bazaar in an interview that she and cricketer-husband Virat Kohli discuss her progress for her new film, as she shows him her cricket videos. She said, "We definitely discuss my progress. Whenever I’ve had a good day learning, I like to share my videos with Virat, to get his feedback. Luckily, he’s not a bowler so I listen to my coach more. But I do turn to Virat for batting tips. "
'''

In [45]:
text_summarization(text_data)

'In this course you’ll learn how to use spaCy to build advanced natural language understanding systems, using both rule-based and machine learning approaches. While some of spaCy’s features work independently, others require trained pipelines to be loaded, which enable spaCy to predict linguistic annotations – for example, whether a word is a verb or a noun. A trained pipeline can consist of multiple components that use a statistical model trained on labeled data. It can be used to build information extraction or natural language understanding systems, or to pre-process text for deep learning.\n spaCy is designed specifically for production use and helps you build applications that process and For a general-purpose use case, the small, default packages are always a good start\n'

In this course you’ll learn how to use spaCy to build advanced natural language understanding systems, using both rule-based and machine learning approaches. While some of spaCy’s features work independently, others require trained pipelines to be loaded, which enable spaCy to predict linguistic annotations – for example, whether a word is a verb or a noun. A trained pipeline can consist of multiple components that use a statistical model trained on labeled data. It can be used to build information extraction or natural language understanding systems, or to pre-process text for deep learning.
 spaCy is designed specifically for production use and helps you build applications that process and For a general-purpose use case, the small, default packages are always a good start


# For cross checking

In [31]:
print(len(text_data))
print(len(text_summarization(text_data)))




# As we can have a clear vision i.e. summarized text is almost around 30% of actuall text


1909
785
