
Install the required libraries:


In [11]:
!pip install nltk spacy
!pip install gensim


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Import the necessary modules:

In [12]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import spacy
from spacy import displacy

# Fetch the 'punkt' tokenizer data only after every import has succeeded, so
# the import block stays free of side effects.
# NOTE(review): newer NLTK releases may look up a 'punkt_tab' resource instead
# of 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Text Cleaning and Tokenization:

In [13]:
# Download stopwords
# clean_text() below calls stopwords.words('english'), so the 'stopwords'
# package is fetched here before that function is first used.
nltk.download('stopwords')

def clean_text(text):
    """Lowercase ``text`` and reduce it to per-sentence lists of plain words.

    The text is lowercased, split into sentences, and each sentence is split
    into word tokens. Tokens that are English stop words or that are not
    purely alphabetic (punctuation, numbers, mixed tokens) are dropped.

    Args:
        text: The raw input string.

    Returns:
        A list with one entry per sentence; each entry is the list of
        surviving lowercase tokens, in their original order.
    """
    # Sentence-split first, then word-split every sentence.
    sentence_list = sent_tokenize(text.lower())
    tokenized = list(map(word_tokenize, sentence_list))

    # Build the stop-word lookup once per call; a set keeps membership tests cheap.
    stop_words = set(stopwords.words('english'))

    cleaned = []
    for words in tokenized:
        # A token survives only if it is not a stop word and is alphabetic.
        cleaned.append(
            [word for word in words if word not in stop_words and word.isalpha()]
        )

    return cleaned


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Part-of-Speech (POS) Tag Extraction and Named Entity Recognition (NER):

In [14]:
# Download spaCy model
# The `!` prefix runs this as a shell command; it is unconditional, so the
# download is attempted every time this cell runs.
!python -m spacy download en_core_web_sm

# Load the English model
# `nlp` is a module-level name: extract_tags_ner() looks it up at call time,
# so this line must have run before that function is called.
nlp = spacy.load('en_core_web_sm')

def extract_tags_ner(text):
    """Run the loaded spaCy pipeline on ``text`` and collect tags and entities.

    Relies on the module-level ``nlp`` object being defined before the call.

    Args:
        text: The raw input string (not the cleaned token lists).

    Returns:
        A ``(tags, entities)`` tuple where ``tags`` is a list of
        ``(token text, POS tag)`` pairs for every token in the document and
        ``entities`` is a list of ``(entity text, entity label)`` pairs.
    """
    parsed = nlp(text)

    # One (text, part-of-speech) pair per token, in document order.
    pos_pairs = []
    for tok in parsed:
        pos_pairs.append((tok.text, tok.pos_))

    # One (text, label) pair per named entity found in the document.
    entity_pairs = []
    for span in parsed.ents:
        entity_pairs.append((span.text, span.label_))

    return pos_pairs, entity_pairs


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 49.1 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


Summary Generation:

In [None]:
def generate_summary(text):
    """Return a summary of ``text`` produced by gensim's TextRank ``summarize``.

    Args:
        text: The document to summarise, as a single string.

    Returns:
        Whatever ``gensim.summarization.summarize`` returns for ``text``.

    Raises:
        ImportError: If ``gensim.summarization`` cannot be imported. The
            module was removed in gensim 4.0, so an older gensim release is
            required.
    """
    # Imported here because no import cell brings `summarize` into scope;
    # without this the call below failed with NameError.
    try:
        from gensim.summarization import summarize
    except ImportError as exc:
        raise ImportError(
            "generate_summary() needs gensim.summarization.summarize, which is "
            "only available in gensim releases before 4.0 "
            "(for example: pip install \"gensim<4\")."
        ) from exc

    # Generate a summary using TextRank algorithm
    return summarize(text)


Usage:

In [15]:
# Example text
# Three short sentences mentioning a person and an organisation, so both the
# tokenizer and the entity extractor have something to find.
text = "John Smith is the CEO of XYZ Corporation. He has extensive experience in the technology industry. XYZ Corporation is a leading tech company specializing in software development."

# Clean the text and tokenize
# clean_text() returns one list of lowercase tokens per sentence.
tokens = clean_text(text)
print("Tokens:", tokens)

# Extract POS tags and named entities
# The raw text is passed here, not the cleaned tokens.
tags, entities = extract_tags_ner(text)
print("POS Tags:", tags)
print("Named Entities:", entities)

# Generate a summary
# Left disabled: generate_summary() depends on a `summarize` function that this
# cell does not provide — TODO confirm it is available before enabling.
# summary = generate_summary(text)
# print("Summary:", summary)


Tokens: [['john', 'smith', 'ceo', 'xyz', 'corporation'], ['extensive', 'experience', 'technology', 'industry'], ['xyz', 'corporation', 'leading', 'tech', 'company', 'specializing', 'software', 'development']]
POS Tags: [('John', 'PROPN'), ('Smith', 'PROPN'), ('is', 'AUX'), ('the', 'DET'), ('CEO', 'PROPN'), ('of', 'ADP'), ('XYZ', 'PROPN'), ('Corporation', 'PROPN'), ('.', 'PUNCT'), ('He', 'PRON'), ('has', 'VERB'), ('extensive', 'ADJ'), ('experience', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('technology', 'NOUN'), ('industry', 'NOUN'), ('.', 'PUNCT'), ('XYZ', 'PROPN'), ('Corporation', 'PROPN'), ('is', 'AUX'), ('a', 'DET'), ('leading', 'VERB'), ('tech', 'NOUN'), ('company', 'NOUN'), ('specializing', 'VERB'), ('in', 'ADP'), ('software', 'NOUN'), ('development', 'NOUN'), ('.', 'PUNCT')]
Named Entities: [('John Smith', 'PERSON'), ('XYZ Corporation', 'ORG'), ('XYZ Corporation', 'ORG')]
