In [16]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords, treebank
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag, ne_chunk
from nltk.tag import DefaultTagger, RegexpTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.chunk import tree2conlltags
from nltk.classify import MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger
from nltk.util import ngrams

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('treebank')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/

True

In [6]:
# Sample web scraping
url = "https://www.bbc.com/news/business-64303166"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
raw_text = soup.get_text()

# Clean and preprocess text
text = re.sub(r'[^a-zA-Z0-9\s]', '', raw_text)
tokens = word_tokenize(text.lower())
stop_words = set(stopwords.words('english'))
tokens = [word for word in tokens if word not in stop_words]
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

# Statistics
unique_tokens = len(set(tokens))
total_lemmas = len(lemmatized_tokens)
unique_lemmas = len(set(lemmatized_tokens))

from nltk.tokenize.punkt import PunktSentenceTokenizer
tokenizer = PunktSentenceTokenizer()
sentences = tokenizer.tokenize(raw_text)
num_sentences = len(sentences)
avg_words = total_tokens / num_sentences if num_sentences else 0

print("Total tokens:", total_tokens)
print("Unique tokens:", unique_tokens)
print("Lemmatized tokens:", total_lemmas)
print("Unique lemmatized tokens:", unique_lemmas)
print("Number of sentences:", num_sentences)
print("Average words per sentence:", avg_words)

Total tokens: 304
Unique tokens: 262
Lemmatized tokens: 304
Unique lemmatized tokens: 261
Number of sentences: 10
Average words per sentence: 30.4


In [7]:
tagged_sentences = treebank.tagged_sents()
split = int(0.8 * len(tagged_sentences))
train_data = tagged_sentences[:split]
test_data = tagged_sentences[split:]


In [8]:
default_tagger = DefaultTagger('NN')
accuracy_default = default_tagger.evaluate(test_data)
print("Default Tagger Accuracy:", accuracy_default)


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  accuracy_default = default_tagger.evaluate(test_data)


Default Tagger Accuracy: 0.1447677029791906


In [9]:
patterns = [
    (r'.*ing$', 'VBG'),  # gerunds
    (r'.*ed$', 'VBD'),   # past tense verbs
    (r'.*es$', 'VBZ'),   # 3rd person singular present
    (r'.*ould$', 'MD'),  # modals
    (r'.*\'s$', 'POS'),  # possessives
    (r'^-?[0-9]+$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')        # default noun
]
regex_tagger = RegexpTagger(patterns)
print("Regex Tagger Accuracy:", regex_tagger.evaluate(test_data))


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print("Regex Tagger Accuracy:", regex_tagger.evaluate(test_data))


Regex Tagger Accuracy: 0.20190628274864014


In [10]:
unigram_tagger = UnigramTagger(train_data)
print("Unigram Accuracy:", unigram_tagger.evaluate(test_data))

bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print("Bigram Accuracy:", bigram_tagger.evaluate(test_data))

trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
print("Trigram Accuracy:", trigram_tagger.evaluate(test_data))


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print("Unigram Accuracy:", unigram_tagger.evaluate(test_data))


Unigram Accuracy: 0.8608213982733669


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print("Bigram Accuracy:", bigram_tagger.evaluate(test_data))


Bigram Accuracy: 0.8679075802185737


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print("Trigram Accuracy:", trigram_tagger.evaluate(test_data))


Trigram Accuracy: 0.8678077748390638


In [11]:
combined_tagger = TrigramTagger(train_data,
                     backoff=BigramTagger(train_data,
                     backoff=UnigramTagger(train_data,
                     backoff=DefaultTagger('NN'))))
print("Combined Tagger Accuracy:", combined_tagger.evaluate(test_data))


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print("Combined Tagger Accuracy:", combined_tagger.evaluate(test_data))


Combined Tagger Accuracy: 0.8844253705274714


In [12]:
classifier_tagger = ClassifierBasedPOSTagger(train=train_data, classifier_builder=MaxentClassifier.train)
print("Classifier-Based Tagger Accuracy:", classifier_tagger.evaluate(test_data))


  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -3.82864        0.008
             2          -0.76705        0.957


  exp_nf_delta = 2**nf_delta
  sum1 = numpy.sum(exp_nf_delta * A, axis=0)
  sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)


         Final               nan        0.984


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print("Classifier-Based Tagger Accuracy:", classifier_tagger.evaluate(test_data))


Classifier-Based Tagger Accuracy: 0.9277908079245472


In [17]:
# Use a small sample or text from earlier steps
sample = "Apple Inc. was founded by Steve Jobs and Steve Wozniak in California."

# Check if the resource is available before using it
try:
    nltk.data.find('taggers/averaged_perceptron_tagger_eng')
    print("averaged_perceptron_tagger_eng found.")
except LookupError:
    print("averaged_perceptron_tagger_eng not found. Please run the first cell to download resources.")

# POS tagging
pos_tags = pos_tag(word_tokenize(sample))

# Named Entity Recognition
tree = ne_chunk(pos_tags)

# Print named entities
print("Named Entities:")
print(tree)

# Extracting named entities in IOB format
iob = tree2conlltags(tree)
print("\nNamed Entities (IOB format):")
for word, pos, ner in iob:
    if ner != 'O':
        print(f"{word}: {ner}")

averaged_perceptron_tagger_eng found.
Named Entities:
(S
  (PERSON Apple/NNP)
  (ORGANIZATION Inc./NNP)
  was/VBD
  founded/VBN
  by/IN
  (PERSON Steve/NNP Jobs/NNP)
  and/CC
  (PERSON Steve/NNP Wozniak/NNP)
  in/IN
  (GPE California/NNP)
  ./.)

Named Entities (IOB format):
Apple: B-PERSON
Inc.: B-ORGANIZATION
Steve: B-PERSON
Jobs: I-PERSON
Steve: B-PERSON
Wozniak: I-PERSON
California: B-GPE
