## GloVe Embedding

### Load GloVe

In [9]:
import numpy as np

In [10]:
def load_glove_embeddings(file_path):
    """Load word vectors from a whitespace-separated GloVe text file.

    Every non-blank line is expected to contain a token followed by the
    components of its vector, e.g. ``word 0.1 -0.2 ...``.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be converted to a float.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            # A blank or whitespace-only line splits to an empty list; skip it
            # instead of crashing with IndexError on values[0].
            if not values:
                continue
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Example usage: read the 100-dimensional GloVe file into a {word: vector} dict.
# NOTE(review): this is an absolute, machine-specific path; the cell fails on any
# other machine unless the file is placed at exactly this location.
glove_file_path = r"/Users/deepak/Desktop/glove/glove.6B.100d.txt"
glove_embeddings = load_glove_embeddings(glove_file_path)



### Access the embedding vectors for the words 'peacock', 'love', 'chickpeas'

In [11]:
# Look up one word and report either its vector or the fact that it is missing.
word = "peacock"
embedding_vector = glove_embeddings.get(word)
if embedding_vector is None:
    print(f"No embedding found for '{word}'")
else:
    print(f"Embedding for '{word}': {embedding_vector}")

Embedding for 'peacock': [ 0.029963  -0.16909   -0.63342   -1.1576     0.11979   -0.13036
 -0.28968   -0.38779   -0.49356   -0.5426    -0.40903    0.73576
  0.46243    0.34581    0.12689   -0.33586    0.56195   -0.21438
  0.81105   -0.30931   -0.082504  -0.015896   0.58499   -0.80602
  0.97007    0.80006    0.21865    0.5179    -0.43745   -0.40369
  0.33668    0.77283    0.5078    -0.45089    0.16236    0.50515
 -0.15933    0.57814    0.42695   -0.27231    0.030063   0.4434
 -0.49498    0.31009    0.052545   0.19662   -0.26147   -0.26061
  0.50359    0.13991   -0.1539    -0.17887   -0.0023244  0.30874
  0.067166  -1.2352    -0.054302   0.24626   -0.48042    0.080364
  0.1315    -0.069248  -0.06334    0.90979   -0.44672    0.05422
 -0.17182    0.26997   -0.071352   0.29887   -0.40761   -0.073718
 -0.29886    0.055359   0.2199    -0.33853   -0.3422     0.27173
 -0.052196   0.087092  -0.51495   -0.462      0.57484   -0.10106
  0.20873   -0.069572   0.30728    0.037524   0.243      0.34245

In [12]:
# Print the GloVe vector of each listed word; .get() yields None (printed as
# "None") for a word that has no entry.
words = ['peacock', 'love', 'chickpeas']
for word in words:
    print(glove_embeddings.get(word))

[ 0.029963  -0.16909   -0.63342   -1.1576     0.11979   -0.13036
 -0.28968   -0.38779   -0.49356   -0.5426    -0.40903    0.73576
  0.46243    0.34581    0.12689   -0.33586    0.56195   -0.21438
  0.81105   -0.30931   -0.082504  -0.015896   0.58499   -0.80602
  0.97007    0.80006    0.21865    0.5179    -0.43745   -0.40369
  0.33668    0.77283    0.5078    -0.45089    0.16236    0.50515
 -0.15933    0.57814    0.42695   -0.27231    0.030063   0.4434
 -0.49498    0.31009    0.052545   0.19662   -0.26147   -0.26061
  0.50359    0.13991   -0.1539    -0.17887   -0.0023244  0.30874
  0.067166  -1.2352    -0.054302   0.24626   -0.48042    0.080364
  0.1315    -0.069248  -0.06334    0.90979   -0.44672    0.05422
 -0.17182    0.26997   -0.071352   0.29887   -0.40761   -0.073718
 -0.29886    0.055359   0.2199    -0.33853   -0.3422     0.27173
 -0.052196   0.087092  -0.51495   -0.462      0.57484   -0.10106
  0.20873   -0.069572   0.30728    0.037524   0.243      0.34245
  0.077403  -0.42687    

In [13]:
# Check how many components a single word vector has.
# NOTE(review): .get() returns None for a missing word, in which case the
# .size access below raises AttributeError.
embedding = glove_embeddings.get('parth')
embedding.size

100

### Unknown!

In [14]:
# Same lookup as before, this time with a token that should not be in GloVe.
word = "randomwordthatmightnotfound!"
embedding_vector = glove_embeddings.get(word)
if embedding_vector is None:
    print(f"No embedding found for '{word}'")
else:
    print(f"Embedding for '{word}': {embedding_vector}")

No embedding found for 'randomwordthatmightnotfound!'


## Load Synthetic Data Sample from CNN/DailyMail

In [15]:
import pandas as pd

In [16]:
# Read the synthetic sample from a CSV file in the current working directory.
synthetic_dataset = pd.read_csv("synthetic_dataset.csv")

### Preprocessing

In [17]:
# Remove the first column (by position) and give the two unnamed columns
# descriptive names, as one chained expression.
# Caution: the result is assigned back to the same name, so running this cell
# a second time drops whichever column is first at that point.
synthetic_dataset = (
    synthetic_dataset
    .drop(synthetic_dataset.columns[[0]], axis=1)
    .rename(columns={'Unnamed: 2':'topic', 'Unnamed: 3':'headline'})
)

### Overview

In [18]:
# Display the preprocessed frame (article, topic, headline columns).
synthetic_dataset

Unnamed: 0,article,topic,headline
0,"By . Harriet Arkell . PUBLISHED: . 06:42 EST, ...",Injury,Toddler's Severe Nursery Injury Sparks Legal B...
1,"By . Harriet Arkell . PUBLISHED: . 06:42 EST, ...",Legal,Family Pursues Legal Action After Child's Nurs...
2,"By . Harriet Arkell . PUBLISHED: . 06:42 EST, ...",Emotion,Emotional Turmoil Follows Child's Nursery Acci...
3,(CNN) -- Cold weather has delayed spring in Wa...,Gridlock,Washington Thaws: Ice of Gridlock Begins to Melt
4,(CNN) -- Cold weather has delayed spring in Wa...,Bipartisanship,Bipartisan Progress: Senate Gains Momentum on ...
5,(CNN) -- Cold weather has delayed spring in Wa...,Senate,Senate Shift: Renewed Focus on Legislative Eff...
6,"By . Leon Watson . PUBLISHED: . 10:43 EST, 19 ...",Clashes,Ethnic Clashes Escalate: Kurds and Rebels Clas...
7,"By . Leon Watson . PUBLISHED: . 10:43 EST, 19 ...",Kurds,Kurdish Assertiveness: Kurds Seek Self-Rule Am...
8,"By . Leon Watson . PUBLISHED: . 10:43 EST, 19 ...",Rebels,Rebel Advance Meets Kurdish Resistance in Syri...
9,"LONDON, England (CNN) -- Picture this: you're ...",Holographic Technology,Breakthrough in Holographic Technology Paves W...


In [19]:
# Load the pretrained summarization model.
from transformers import pipeline
# Build a `transformers` summarization pipeline that uses facebook/bart-large-cnn.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")


  from .autonotebook import tqdm as notebook_tqdm
2024-05-12 01:19:23.176811: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [20]:
# Baseline: summarize the article stored under index label 0 as-is, with
# deterministic decoding (do_sample=False) and a 30-130 length window.
print(summarizer(synthetic_dataset['article'][0], max_length=130, min_length=30, do_sample=False))

[{'summary_text': "Harrison Farrell, 5, suffered second-degree burns to his right leg at Kids Academy. His mother Katie Brewster, 30, from Leeds, was sent £634 bill for failing to give one month's notice. Family awarded five-figure sum after Kids Academy's insurers admitted liability."}]


In [21]:
import spacy


# NOTE(review): absolute, machine-specific location of the spaCy model
# directory; this cell fails on machines where that directory does not exist.
model_path = "/Users/deepak/Desktop/en_core_web_sm/en_core_web_sm-3.7.1"


# Load the spaCy language pipeline from the directory above.
nlp = spacy.load(model_path)

In [22]:
def tokenize_text(text):
    """Split `text` into tokens with the loaded spaCy pipeline.

    Returns a list with the text of every token, in document order.
    """
    tokens = []
    for token in nlp(text):
        tokens.append(token.text)
    return tokens

# Tokenize the article stored under index label 0.
# Note: this rebinds `words`, which an earlier cell used for a short word list.
article_text = synthetic_dataset['article'][0]
words = tokenize_text(article_text)


In [29]:
# Number of tokens in the article (repeated tokens are counted each time).
len(words)

955

In [24]:
# Map each distinct token to its GloVe vector, or to None when GloVe has no
# entry for it (repeated tokens collapse into a single dict key).
# NOTE(review): the lookup uses the token exactly as tokenized, including its
# case; confirm whether tokens should be lower-cased first, as is done for the
# target words.
word_embeddings = {word:glove_embeddings.get(word) for word in words}

In [49]:
# Lower-cased topic labels used as similarity targets.
target_word, target_word_2, target_word_3 = (
    topic.lower() for topic in ("Injury", "Legal", "Emotion")
)

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(glove_embeddings, word_embeddings, target_word):
    """Score every word in `word_embeddings` against `target_word`.

    Args:
        glove_embeddings: dict mapping words to embedding vectors.
        word_embeddings: dict mapping the words to score to their vectors
            (a value of None means no vector is available).
        target_word: Word whose vector every other word is compared with.

    Returns:
        dict mapping each word to its cosine similarity with the target.
        A word without a vector, or one that is not a key of
        `glove_embeddings`, is given the fixed score 1. The dict is empty
        when `target_word` itself has no vector.
    """
    target_vector = glove_embeddings.get(target_word)
    if target_vector is None:
        return {}

    scores = {}
    for token, vector in word_embeddings.items():
        if not (vector is not None and token in glove_embeddings):
            # No usable vector: assign the fixed fallback score.
            scores[token] = 1
            continue
        # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
        scores[token] = cosine_similarity([vector], [target_vector])[0][0]
    return scores

# Minimum cosine similarity a word needs in order to be kept.
threshold_similarity = 0.42

similarities = calculate_similarity(glove_embeddings, word_embeddings, target_word)

# Keep the words that reach the threshold, plus every word GloVe has no entry for.
similar_words = {}
for word, similarity in similarities.items():
    if not (similarity >= threshold_similarity or word not in glove_embeddings):
        continue
    similar_words[word] = similarity

# Report the surviving words in their original order.
for word, similarity in similar_words.items():
    print(f"Word: {word}, Similarity: {similarity}")

Word: By, Similarity: 1
Word: ., Similarity: 0.45306161046028137
Word: Harriet, Similarity: 1
Word: Arkell, Similarity: 1
Word: PUBLISHED, Similarity: 1
Word: 06:42, Similarity: 1
Word: EST, Similarity: 1
Word: July, Similarity: 1
Word: UPDATED, Similarity: 1
Word: 08:26, Similarity: 1
Word: A, Similarity: 1
Word: suffered, Similarity: 0.6841134428977966
Word: burns, Similarity: 0.4401872158050537
Word: when, Similarity: 0.5057186484336853
Word: his, Similarity: 0.4944289028644562
Word: leg, Similarity: 0.704346776008606
Word: Staff, Similarity: 1
Word: Kids, Similarity: 1
Word: Academy, Similarity: 1
Word: Leeds, Similarity: 1
Word: Katie, Similarity: 1
Word: Brewster, Similarity: 1
Word: five, Similarity: 0.421413391828537
Word: year, Similarity: 0.4226069450378418
Word: Harrison, Similarity: 1
Word: Farrell, Similarity: 1
Word: had, Similarity: 0.4482886791229248
Word: but, Similarity: 0.5270360112190247
Word: only, Similarity: 0.47278985381126404
Word: later, Similarity: 0.44172444

In [41]:
# Kept words, in insertion order (iterating a dict yields its keys).
article_lst = list(similar_words)

In [46]:
# Number of distinct words that survived the filter.
len(article_lst)

131

In [44]:

# Join the kept words with single spaces to form the reduced input text.
paragraph = ' '.join(article_lst)

print(paragraph)


By . Harriet Arkell PUBLISHED 06:42 EST July UPDATED 08:26 A suffered burns when his leg Staff Kids Academy Leeds Katie Brewster five year Harrison Farrell had but only later did Marketing Miss immediately withdrew was failing month Five serious after him Holt Park The admitted County Court    put But been left thigh following incident last When I got just be side might home went straight doctor out could away he badly even Eventually one hit doctors In so pain injured infection days hospital took wound A&E syringed Doctors sustained second right discomfort Once removed She already needed if this injury no going back Now over He six months getting Thankfully affected being before Chris Baxendale This shock upset severe danger yet failed Ofsted We They further action Ms Everyone


In [47]:
# Summarize the reduced text. The result is stored first and printed after:
# print() returns None, so assigning its return value would leave `summary`
# empty.
summary = summarizer(paragraph, max_length=130, min_length=30, do_sample=False)
print(summary)

[{'summary_text': 'Katie Brewster five year Harrison Farrell had but only later did Marketing Miss immediately withdrew was failing month Five serious after him Holt Park The admitted County Court \xa0  put But been left thigh following incident last.'}]


In [51]:
# Score every article word against the second target word.
similarities_2 = calculate_similarity(glove_embeddings, word_embeddings, target_word_2)

In [52]:
# Keep the words that reach the threshold, plus every word GloVe has no entry for.
# Note: this rebinds `similar_words`, replacing the selection made for the
# previous target word.
similar_words = {}
for word, similarity in similarities_2.items():
    if not (similarity >= threshold_similarity or word not in glove_embeddings):
        continue
    similar_words[word] = similarity

# Report the surviving words in their original order.
for word, similarity in similar_words.items():
    print(f"Word: {word}, Similarity: {similarity}")

Word: By, Similarity: 1
Word: ., Similarity: 0.5372235774993896
Word: Harriet, Similarity: 1
Word: Arkell, Similarity: 1
Word: PUBLISHED, Similarity: 1
Word: 06:42, Similarity: 1
Word: EST, Similarity: 1
Word: ,, Similarity: 0.5112456679344177
Word: July, Similarity: 1
Word: UPDATED, Similarity: 1
Word: 08:26, Similarity: 1
Word: A, Similarity: 1
Word: whose, Similarity: 0.5304467678070068
Word: when, Similarity: 0.4593813121318817
Word: staff, Similarity: 0.4255468547344208
Word: on, Similarity: 0.5490436553955078
Word: his, Similarity: 0.48923468589782715
Word: has, Similarity: 0.5329458117485046
Word: sued, Similarity: 0.4782717525959015
Word: the, Similarity: 0.5819333791732788
Word: Staff, Similarity: 1
Word: Kids, Similarity: 1
Word: Academy, Similarity: 1
Word: in, Similarity: 0.5104970335960388
Word: Leeds, Similarity: 1
Word: initially, Similarity: 0.460269033908844
Word: told, Similarity: 0.4334658086299896
Word: Katie, Similarity: 1
Word: Brewster, Similarity: 1
Word: that, 

In [53]:
# Kept words for the second target, in insertion order.
article_lst_2 = list(similar_words)

In [54]:
# Join the kept words with single spaces to form the reduced input text.
paragraph_2 = ' '.join(article_lst_2)

In [55]:
# Summarize the reduced text. The result is stored first and printed after:
# print() returns None, so assigning its return value would leave `summary_2`
# empty.
summary_2 = summarizer(paragraph_2, max_length=130, min_length=30, do_sample=False)
print(summary_2)

[{'summary_text': "Katie Brewster's five-year-old son Harrison Farrell was taken to A&E. Staff Kids Academy in Leeds initially told her he had his own but only later did an executive tell her. She was then bill for failing to give month 's notice and sued the school."}]


In [56]:
# Score every article word against the third target word.
similarities_3 = calculate_similarity(glove_embeddings, word_embeddings, target_word_3)

In [57]:
# Keep the words that reach the threshold, plus every word GloVe has no entry for.
# Note: this rebinds `similar_words`, replacing the selection made for the
# previous target word.
similar_words = {}
for word, similarity in similarities_3.items():
    if not (similarity >= threshold_similarity or word not in glove_embeddings):
        continue
    similar_words[word] = similarity

# Report the surviving words in their original order.
for word, similarity in similar_words.items():
    print(f"Word: {word}, Similarity: {similarity}")

Word: By, Similarity: 1
Word: Harriet, Similarity: 1
Word: Arkell, Similarity: 1
Word: PUBLISHED, Similarity: 1
Word: 06:42, Similarity: 1
Word: EST, Similarity: 1
Word: July, Similarity: 1
Word: UPDATED, Similarity: 1
Word: 08:26, Similarity: 1
Word: A, Similarity: 1
Word: Staff, Similarity: 1
Word: Kids, Similarity: 1
Word: Academy, Similarity: 1
Word: Leeds, Similarity: 1
Word: Katie, Similarity: 1
Word: Brewster, Similarity: 1
Word: Harrison, Similarity: 1
Word: Farrell, Similarity: 1
Word: Marketing, Similarity: 1
Word: Miss, Similarity: 1
Word: Five, Similarity: 1
Word: Holt, Similarity: 1
Word: Park, Similarity: 1
Word: The, Similarity: 1
Word: County, Similarity: 1
Word: Court, Similarity: 1
Word:   , Similarity: 1
Word: But, Similarity: 1
Word: When, Similarity: 1
Word: I, Similarity: 1
Word: know, Similarity: 0.43034008145332336
Word: what, Similarity: 0.4508129358291626
Word: happened, Similarity: 0.4497954845428467
Word: really, Similarity: 0.4812863767147064
Word: even, Si

In [58]:
# Kept words for the third target, in insertion order.
article_lst_3 = list(similar_words)

In [59]:
# Join the kept words with single spaces to form the reduced input text.
paragraph_3 = ' '.join(article_lst_3)

In [60]:
# Summarize the reduced text. The result is stored first and printed after:
# print() returns None, so assigning its return value would leave `summary_3`
# empty.
summary_3 = summarizer(paragraph_3, max_length=130, min_length=30, do_sample=False)
print(summary_3)

Your max_length is set to 130, but your input_length is only 89. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)


[{'summary_text': 'Chris Baxendale was a pupil at a school in Leeds. He was sent to A&E for syringes and was left in a lot of pain. Now he has learned to cope with the pain of the syringing.'}]
