**Lab 3 - POS tagging and NER**

Submitted by: Angeline A

Submitted on: 23/08/2024

In [6]:
pip install youtube-transcript-api transformers nltk spacy




In [8]:

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [10]:
import math
from collections import Counter, defaultdict
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import pipeline
import spacy
import nltk
from nltk.tag import hmm
from nltk.corpus import treebank

In [11]:
# Ensure necessary NLTK data is downloaded
nltk.download('treebank')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [14]:
# Step 1: Extract, Clean, and Punctuate Transcript

def get_transcript(video_id):
    """Extracts transcript from a YouTube video given its ID."""
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    return " ".join([entry['text'] for entry in transcript])

def clean_and_punctuate(transcript):
    """Cleans and punctuates the transcript using a pre-trained model."""
    # Use T5 model for punctuation restoration
    punctuator = pipeline("text2text-generation", model="t5-small")

    # Chunking the text to avoid exceeding token limits
    max_length = 512  # Adjust based on the model's capabilities
    chunks = [transcript[i:i+max_length] for i in range(0, len(transcript), max_length)]

    punctuated_transcript = ""
    for chunk in chunks:
        cleaned_chunk = chunk.lower()  # Lowercasing for consistent punctuation
        punctuated_chunk = punctuator(cleaned_chunk, max_length=512)[0]['generated_text']
        punctuated_transcript += punctuated_chunk + " "

    return punctuated_transcript.strip()



In [15]:
video_id = "W6wVU5b5nQk"  # Replace with the actual video ID
raw_transcript = get_transcript(video_id)
print("Raw Transcript:\n", raw_transcript)

cleaned_punctuated_transcript = clean_and_punctuate(raw_transcript)
print("Cleaned and Punctuated Transcript:\n", cleaned_punctuated_transcript)

Raw Transcript:
 foreign [Music] once upon a time in a small village there lived a wise old Monk he was known far and wide for his wisdom and sense of humor one day a young and eager student named Sam approached the master and said master I want to learn the secret to happiness and success please teach me master Sito looked at Sam with a twinkle in his eye and said very well young one But first you must complete a simple task go to the market and buy the biggest juiciest watermelon you can find then carry it on your head and walk through the village without dropping it Sam was puzzled but determined he went to the market and found a massive watermelon balancing it on his head he walked through the village with utmost concentration as he passed by people couldn't help but laugh and cheer him on some even joined in clapping and making funny faces finally after a bumpy Journey Sam reached Master setu's Hut the watermelon was intact and Sam was relieved he looked at Master situ expecting t

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Cleaned and Punctuated Transcript:
 a young and eager student named sam approached the master and said master i want to learn the secret to happiness and success please teach me master sito looked at sam with a twinkle in his eye and said very well young one but first you must complete a simple task go to the market and buy the biggest juiciest watermelon you can find then carry it on your head an ice cream . master setu's hut the watermelon was intact and sam was relieved he looked at master situ expecting to be praised for his accomplishment . sam was puzzled but determined he went to the market and found a massive watermelon balancing it on his head he walked through the village with utmost concentration . sam was determined he went to the market and found a massive watermelon balancing it on his head he walked through the he sat in laffer well then young won he exclaimed wiping tears of mirth from his eyes he exclaimed wiping tears of mirth from his eyes he exclaimed wiping tears o

In [16]:
# Step 2: N-gram Probabilities

def unigram_probabilities(transcript):
    """Calculates unigram probabilities from the transcript."""
    words = transcript.split()
    total_words = len(words)
    word_counts = Counter(words)
    unigram_probs = {word: count / total_words for word, count in word_counts.items()}
    return unigram_probs

def bigram_probabilities(transcript):
    """Calculates bigram probabilities from the transcript."""
    words = transcript.split()
    bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
    unigram_counts = Counter(words)
    bigram_counts = Counter(bigrams)

    bigram_probs = defaultdict(float)
    for (w1, w2), count in bigram_counts.items():
        bigram_probs[(w1, w2)] = count / unigram_counts[w1]
    return bigram_probs

def calculate_bigram_probability(bigram_probs, word1, word2):
    """Calculates the probability of a bigram (word1, word2)."""
    return bigram_probs.get((word1, word2), 0.0)

In [17]:
unigram_probs = unigram_probabilities(cleaned_punctuated_transcript)
print("Unigram Probabilities:\n", unigram_probs)

Unigram Probabilities:
 {'a': 0.022082018927444796, 'young': 0.00946372239747634, 'and': 0.03785488958990536, 'eager': 0.0031545741324921135, 'student': 0.0031545741324921135, 'named': 0.0031545741324921135, 'sam': 0.022082018927444796, 'approached': 0.006309148264984227, 'the': 0.04416403785488959, 'master': 0.022082018927444796, 'said': 0.006309148264984227, 'i': 0.0031545741324921135, 'want': 0.0031545741324921135, 'to': 0.031545741324921134, 'learn': 0.0031545741324921135, 'secret': 0.0031545741324921135, 'happiness': 0.0031545741324921135, 'success': 0.006309148264984227, 'please': 0.0031545741324921135, 'teach': 0.0031545741324921135, 'me': 0.0031545741324921135, 'sito': 0.0031545741324921135, 'looked': 0.006309148264984227, 'at': 0.006309148264984227, 'with': 0.01892744479495268, 'twinkle': 0.0031545741324921135, 'in': 0.01892744479495268, 'his': 0.022082018927444796, 'eye': 0.0031545741324921135, 'very': 0.0031545741324921135, 'well': 0.006309148264984227, 'one': 0.003154574132

In [18]:
bigram_probs = bigram_probabilities(cleaned_punctuated_transcript)
print("Bigram Probabilities:\n", bigram_probs)

Bigram Probabilities:
 defaultdict(<class 'float'>, {('a', 'young'): 0.14285714285714285, ('young', 'and'): 0.3333333333333333, ('and', 'eager'): 0.08333333333333333, ('eager', 'student'): 1.0, ('student', 'named'): 1.0, ('named', 'sam'): 1.0, ('sam', 'approached'): 0.14285714285714285, ('approached', 'the'): 0.5, ('the', 'master'): 0.07142857142857142, ('master', 'and'): 0.14285714285714285, ('and', 'said'): 0.16666666666666666, ('said', 'master'): 0.5, ('master', 'i'): 0.14285714285714285, ('i', 'want'): 1.0, ('want', 'to'): 1.0, ('to', 'learn'): 0.1, ('learn', 'the'): 1.0, ('the', 'secret'): 0.07142857142857142, ('secret', 'to'): 1.0, ('to', 'happiness'): 0.1, ('happiness', 'and'): 1.0, ('and', 'success'): 0.16666666666666666, ('success', 'please'): 0.5, ('please', 'teach'): 1.0, ('teach', 'me'): 1.0, ('me', 'master'): 1.0, ('master', 'sito'): 0.14285714285714285, ('sito', 'looked'): 1.0, ('looked', 'at'): 1.0, ('at', 'sam'): 0.5, ('sam', 'with'): 0.14285714285714285, ('with', 'a'):

In [22]:
p_very_well = calculate_bigram_probability(bigram_probs, "very", "well")
p_the_master = calculate_bigram_probability(bigram_probs, "the", "master")

print(f"P(very/well): {p_very_well}")
print(f"P(the/master): {p_the_master}")

P(very/well): 1.0
P(the/master): 0.07142857142857142


In [23]:
# Step 3: Compute Perplexity

def compute_perplexity(test_text, bigram_probs):
    """Computes the perplexity of the test text given bigram probabilities."""
    words = test_text.split()
    N = len(words)
    log_prob_sum = 0

    for i in range(len(words) - 1):
        prob = bigram_probs.get((words[i], words[i+1]), 1e-6)  # Smoothing with a small value
        log_prob_sum += math.log(prob)

    perplexity = math.exp(-log_prob_sum / N)
    return perplexity

In [25]:
perplexity = compute_perplexity(cleaned_punctuated_transcript, bigram_probs)
print(f"Perplexity of the test set: {perplexity}")

Perplexity of the test set: 2.2132803261449943


In [26]:
# Step 4: POS Tagging

def pos_tagging(transcript):
    """Performs POS tagging on the transcript."""
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(transcript)
    pos_tags = [(token.text, token.pos_) for token in doc]
    return pos_tags

In [27]:
pos_tags = pos_tagging(cleaned_punctuated_transcript)
print("POS Tags:\n", pos_tags)


POS Tags:
 [('a', 'DET'), ('young', 'ADJ'), ('and', 'CCONJ'), ('eager', 'ADJ'), ('student', 'NOUN'), ('named', 'VERB'), ('sam', 'PROPN'), ('approached', 'VERB'), ('the', 'DET'), ('master', 'NOUN'), ('and', 'CCONJ'), ('said', 'VERB'), ('master', 'NOUN'), ('i', 'PRON'), ('want', 'VERB'), ('to', 'PART'), ('learn', 'VERB'), ('the', 'DET'), ('secret', 'NOUN'), ('to', 'ADP'), ('happiness', 'NOUN'), ('and', 'CCONJ'), ('success', 'NOUN'), ('please', 'INTJ'), ('teach', 'VERB'), ('me', 'PRON'), ('master', 'NOUN'), ('sito', 'PROPN'), ('looked', 'VERB'), ('at', 'ADP'), ('sam', 'PROPN'), ('with', 'ADP'), ('a', 'DET'), ('twinkle', 'NOUN'), ('in', 'ADP'), ('his', 'PRON'), ('eye', 'NOUN'), ('and', 'CCONJ'), ('said', 'VERB'), ('very', 'ADV'), ('well', 'ADV'), ('young', 'ADJ'), ('one', 'NUM'), ('but', 'CCONJ'), ('first', 'ADV'), ('you', 'PRON'), ('must', 'AUX'), ('complete', 'VERB'), ('a', 'DET'), ('simple', 'ADJ'), ('task', 'NOUN'), ('go', 'VERB'), ('to', 'ADP'), ('the', 'DET'), ('market', 'NOUN'), ('a

In [28]:
# Step 5: HMM Tagger

def train_hmm_tagger():
    """Trains an HMM tagger using the NLTK treebank corpus."""
    train_data = treebank.tagged_sents()  # Using NLTK's treebank corpus
    trainer = hmm.HiddenMarkovModelTrainer()
    hmm_tagger = trainer.train(train_data)
    return hmm_tagger

def hmm_tagging(tagger, transcript):
    """Tags the transcript using the trained HMM tagger."""
    tokens = transcript.split()
    tagged_sentence = tagger.tag(tokens)
    return tagged_sentence

In [29]:
hmm_tagger = train_hmm_tagger()
tagged_sentence = hmm_tagging(hmm_tagger, cleaned_punctuated_transcript)
print("HMM Tagged Sentence:\n", tagged_sentence)

  X[i, j] = self._transitions[si].logprob(self._states[j])
  O[i, k] = self._output_logprob(si, self._symbols[k])
  P[i] = self._priors.logprob(si)


HMM Tagged Sentence:
 [('a', 'DT'), ('young', 'JJ'), ('and', 'CC'), ('eager', 'JJ'), ('student', 'NN'), ('named', 'VBD'), ('sam', 'NNP'), ('approached', 'NNP'), ('the', 'NNP'), ('master', 'NNP'), ('and', 'NNP'), ('said', 'NNP'), ('master', 'NNP'), ('i', 'NNP'), ('want', 'NNP'), ('to', 'NNP'), ('learn', 'NNP'), ('the', 'NNP'), ('secret', 'NNP'), ('to', 'NNP'), ('happiness', 'NNP'), ('and', 'NNP'), ('success', 'NNP'), ('please', 'NNP'), ('teach', 'NNP'), ('me', 'NNP'), ('master', 'NNP'), ('sito', 'NNP'), ('looked', 'NNP'), ('at', 'NNP'), ('sam', 'NNP'), ('with', 'NNP'), ('a', 'NNP'), ('twinkle', 'NNP'), ('in', 'NNP'), ('his', 'NNP'), ('eye', 'NNP'), ('and', 'NNP'), ('said', 'NNP'), ('very', 'NNP'), ('well', 'NNP'), ('young', 'NNP'), ('one', 'NNP'), ('but', 'NNP'), ('first', 'NNP'), ('you', 'NNP'), ('must', 'NNP'), ('complete', 'NNP'), ('a', 'NNP'), ('simple', 'NNP'), ('task', 'NNP'), ('go', 'NNP'), ('to', 'NNP'), ('the', 'NNP'), ('market', 'NNP'), ('and', 'NNP'), ('buy', 'NNP'), ('the', 

  O[i, k] = self._output_logprob(si, self._symbols[k])


In [30]:
# Step 6: Named Entity Recognition (NER)

def named_entity_recognition(transcript):
    """Performs Named Entity Recognition on the transcript."""
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(transcript)
    entities = [(entity.text, entity.label_) for entity in doc.ents]
    return entities

In [31]:
entities = named_entity_recognition(cleaned_punctuated_transcript)
print("Named Entities:\n", entities)

Named Entities:
 [('sam', 'PERSON'), ('sito', 'PERSON'), ('sam', 'PERSON'), ('first', 'ORDINAL'), ('sam', 'PERSON'), ('sam', 'PERSON'), ('sam', 'PERSON'), ('sam', 'PERSON'), ('that day', 'DATE'), ('sam', 'PERSON'), ('sito', 'PERSON'), ('für', 'ORG')]
