In [1]:
pip install spacy


Note: you may need to restart the kernel to use updated packages.


In [1]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     ------ --------------------------------- 2.1/12.8 MB 13.0 MB/s eta 0:00:01
     -------------- ------------------------- 4.7/12.8 MB 10.2 MB/s eta 0:00:01
     ---------------- ----------------------- 5.2/12.8 MB 9.8 MB/s eta 0:00:01
     ----------------- ---------------------- 5.5/12.8 MB 6.6 MB/s eta 0:00:02
     ------------------ --------------------- 6.0/12.8 MB 5.5 MB/s eta 0:00:02
     --------------------- ------------------ 6.8/12.8 MB 5.5 MB/s eta 0:00:02
     ----------------------- ---------------- 7.6/12.8 MB 5.3 MB/s eta 0:00:01
     --------------------------- ------------ 8.9/12.8 MB 5.2 MB/s eta 0:00:01
     --------------------------- ------------ 

In [2]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
text = """
In recent years, the field of Natural Language Processing (NLP) has seen tremendous progress.
From machine translation to question answering, NLP is powering the latest AI applications.
Thanks to the availability of large datasets and powerful computing resources,
models like BERT, GPT, and T5 have achieved state-of-the-art performance.
This project demonstrates how to build a simple extractive text summarizer using spaCy.
"""

In [5]:
doc = nlp(text)

In [6]:
tokens = [token.text.lower() for token in doc 
          if not token.is_stop and 
          not token.is_punct and 
          token.text !='\n']

In [7]:
tokens

['recent',
 'years',
 'field',
 'natural',
 'language',
 'processing',
 'nlp',
 'seen',
 'tremendous',
 'progress',
 'machine',
 'translation',
 'question',
 'answering',
 'nlp',
 'powering',
 'latest',
 'ai',
 'applications',
 'thanks',
 'availability',
 'large',
 'datasets',
 'powerful',
 'computing',
 'resources',
 'models',
 'like',
 'bert',
 'gpt',
 't5',
 'achieved',
 'state',
 'art',
 'performance',
 'project',
 'demonstrates',
 'build',
 'simple',
 'extractive',
 'text',
 'summarizer',
 'spacy']

In [8]:
tokens1=[]
stopwords = list(STOP_WORDS)
allowed_pos = ['ADJ','PROPN','VERB','NOUN']
for token in doc:
    if token.text in stopwords or token.text in punctuation:
        continue
    if token.pos_ in allowed_pos:
        tokens1.append(token.text)

In [9]:
tokens1

['recent',
 'years',
 'field',
 'Natural',
 'Language',
 'Processing',
 'NLP',
 'seen',
 'tremendous',
 'progress',
 'machine',
 'translation',
 'question',
 'answering',
 'NLP',
 'powering',
 'latest',
 'AI',
 'applications',
 'Thanks',
 'availability',
 'large',
 'datasets',
 'powerful',
 'computing',
 'resources',
 'models',
 'BERT',
 'GPT',
 'T5',
 'achieved',
 'state',
 'art',
 'performance',
 'project',
 'demonstrates',
 'build',
 'simple',
 'extractive',
 'text',
 'summarizer']

In [10]:
from collections import Counter

In [11]:
word_freq = Counter(tokens)

In [12]:
word_freq

Counter({'nlp': 2,
         'recent': 1,
         'years': 1,
         'field': 1,
         'natural': 1,
         'language': 1,
         'processing': 1,
         'seen': 1,
         'tremendous': 1,
         'progress': 1,
         'machine': 1,
         'translation': 1,
         'question': 1,
         'answering': 1,
         'powering': 1,
         'latest': 1,
         'ai': 1,
         'applications': 1,
         'thanks': 1,
         'availability': 1,
         'large': 1,
         'datasets': 1,
         'powerful': 1,
         'computing': 1,
         'resources': 1,
         'models': 1,
         'like': 1,
         'bert': 1,
         'gpt': 1,
         't5': 1,
         'achieved': 1,
         'state': 1,
         'art': 1,
         'performance': 1,
         'project': 1,
         'demonstrates': 1,
         'build': 1,
         'simple': 1,
         'extractive': 1,
         'text': 1,
         'summarizer': 1,
         'spacy': 1})

In [13]:
max_freq = max(word_freq.values())


In [14]:
max_freq


2

In [15]:
for word in word_freq.keys():
    word_freq[word] = word_freq[word]/max_freq
    

In [16]:
word_freq

Counter({'nlp': 1.0,
         'recent': 0.5,
         'years': 0.5,
         'field': 0.5,
         'natural': 0.5,
         'language': 0.5,
         'processing': 0.5,
         'seen': 0.5,
         'tremendous': 0.5,
         'progress': 0.5,
         'machine': 0.5,
         'translation': 0.5,
         'question': 0.5,
         'answering': 0.5,
         'powering': 0.5,
         'latest': 0.5,
         'ai': 0.5,
         'applications': 0.5,
         'thanks': 0.5,
         'availability': 0.5,
         'large': 0.5,
         'datasets': 0.5,
         'powerful': 0.5,
         'computing': 0.5,
         'resources': 0.5,
         'models': 0.5,
         'like': 0.5,
         'bert': 0.5,
         'gpt': 0.5,
         't5': 0.5,
         'achieved': 0.5,
         'state': 0.5,
         'art': 0.5,
         'performance': 0.5,
         'project': 0.5,
         'demonstrates': 0.5,
         'build': 0.5,
         'simple': 0.5,
         'extractive': 0.5,
         'text': 0.5,
    

In [17]:
sent_token = [sent.text for sent in doc.sents]

In [18]:
sent_token

['\nIn recent years, the field of Natural Language Processing (NLP) has seen tremendous progress.\n',
 'From machine translation to question answering, NLP is powering the latest AI applications.\n',
 'Thanks to the availability of large datasets and powerful computing resources,\nmodels like BERT, GPT, and T5 have achieved state-of-the-art performance.\n',
 'This project demonstrates how to build a simple extractive text summarizer using spaCy.\n']

In [19]:
sent_score = {}
for sent in sent_token:
    for word in sent.split():
        if word.lower() in word_freq.keys():
            if sent not in sent_score.keys():
                sent_score[sent] = word_freq[word]
            else:
                sent_score[sent] +=word_freq[word]
        print(word)

In
recent
years,
the
field
of
Natural
Language
Processing
(NLP)
has
seen
tremendous
progress.
From
machine
translation
to
question
answering,
NLP
is
powering
the
latest
AI
applications.
Thanks
to
the
availability
of
large
datasets
and
powerful
computing
resources,
models
like
BERT,
GPT,
and
T5
have
achieved
state-of-the-art
performance.
This
project
demonstrates
how
to
build
a
simple
extractive
text
summarizer
using
spaCy.


In [20]:
sent_score

{'\nIn recent years, the field of Natural Language Processing (NLP) has seen tremendous progress.\n': 2.0,
 'From machine translation to question answering, NLP is powering the latest AI applications.\n': 2.5,
 'Thanks to the availability of large datasets and powerful computing resources,\nmodels like BERT, GPT, and T5 have achieved state-of-the-art performance.\n': 4.0,
 'This project demonstrates how to build a simple extractive text summarizer using spaCy.\n': 3.5}

In [21]:
import pandas as pd

In [22]:
pd.DataFrame(list(sent_score.items()),columns=['Sentence','Score'])


Unnamed: 0,Sentence,Score
0,"\nIn recent years, the field of Natural Langua...",2.0
1,From machine translation to question answering...,2.5
2,Thanks to the availability of large datasets a...,4.0
3,This project demonstrates how to build a simpl...,3.5


In [23]:
pd.DataFrame(list(sent_score.items()),columns=['Sentence','Score'])


Unnamed: 0,Sentence,Score
0,"\nIn recent years, the field of Natural Langua...",2.0
1,From machine translation to question answering...,2.5
2,Thanks to the availability of large datasets a...,4.0
3,This project demonstrates how to build a simpl...,3.5


In [24]:
from heapq import nlargest


In [25]:
num_sentences =3
n = nlargest(num_sentences,sent_score,key=sent_score.get)

In [26]:
" ".join(n)

'Thanks to the availability of large datasets and powerful computing resources,\nmodels like BERT, GPT, and T5 have achieved state-of-the-art performance.\n This project demonstrates how to build a simple extractive text summarizer using spaCy.\n From machine translation to question answering, NLP is powering the latest AI applications.\n'

In [33]:
# Install required model if not done already
!python -m spacy download en_core_web_sm

# Import libraries
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Sample text
text = """
In recent years, the field of Natural Language Processing (NLP) has seen tremendous progress.
From machine translation to question answering, NLP is powering the latest AI applications.
Thanks to the availability of large datasets and powerful computing resources,
models like BERT, GPT, and T5 have achieved state-of-the-art performance.
This project demonstrates how to build a simple extractive text summarizer using spaCy.
"""

# Process text
doc = nlp(text)

# Calculate word frequencies
word_frequencies = {}
for word in doc:
    if word.text.lower() not in STOP_WORDS and word.text.lower() not in punctuation:
        word_text = word.text.lower()
        word_frequencies[word_text] = word_frequencies.get(word_text, 0) + 1

# Normalize frequencies
max_freq = max(word_frequencies.values())
for word in word_frequencies:
    word_frequencies[word] /= max_freq

# Score each sentence
sentence_scores = {}
for sent in doc.sents:
    for word in sent:
        if word.text.lower() in word_frequencies:
            sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word.text.lower()]

# Select top N sentences for summary
summary_sentences = nlargest(3, sentence_scores, key=sentence_scores.get)
final_summary = ' '.join([sent.text for sent in summary_sentences])

# Output
print("===== Summary =====")
print(final_summary)


Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
===== Summary =====
Thanks to the availability of large datasets and powerful computing resources,
models like BERT, GPT, and T5 have achieved state-of-the-art performance.
 
In recent years, the field of Natural Language Processing (NLP) has seen tremendous progress.
 From machine translation to question answering, NLP is powering the latest AI applications.

