In [1]:
import scipy.io as sio
import pandas as pd
import numpy as np

In [2]:
# Load the per-species text descriptions from the Excel sheet.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably an
# index that was saved into the sheet — confirm whether index_col=0 is wanted.
description = pd.read_excel('CUB_description.xlsx')

# Preview the first rows.
description.head()

Unnamed: 0,Bird_name,Short_Description
0,Black footed Albatross,The Black-footed Albatross is a large seabird ...
1,Laysan Albatross,The Laysan Albatross is a large seabird with a...
2,Sooty Albatross,The Sooty Albatross is a large seabird known f...
3,Groove billed Ani,The Groove-billed Ani bird is a medium-sized b...
4,Crested Auklet,The Crested Auklet is a small seabird with a d...


In [3]:
import gensim
from gensim.models.doc2vec import Word2Vec
import gensim.downloader as api
import nltk
# nltk.sent_tokenize (used by get_paragraph_embedding) needs the Punkt sentence
# tokenizer data, so fetch it here instead of relying on a machine that already
# has it cached. Recent NLTK releases look for 'punkt_tab' rather than 'punkt';
# requesting both keeps the notebook runnable on either.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Show the names of the pretrained models gensim's downloader can fetch.
print([*api.info()['models']])

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [5]:
# Fetch the 'word2vec-google-news-300' vectors through gensim's downloader.
# `wv` is the word -> vector lookup used by get_sentence_embedding.
wv = api.load('word2vec-google-news-300')

In [6]:
def preprocess_text(text):
    """Tokenize ``text`` and drop stop words.

    The text is lower-cased, split into tokens with
    ``gensim.utils.simple_preprocess``, and every token contained in
    ``gensim.parsing.preprocessing.STOPWORDS`` is removed.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the remaining tokens, in their original order.
    """
    tokens = gensim.utils.simple_preprocess(text.lower())
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return list(filter(lambda token: token not in stop_words, tokens))

In [7]:
def get_sentence_embedding(sentence):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    The sentence is tokenized with ``preprocess_text``; tokens missing from the
    global ``wv`` lookup are skipped and the remaining word vectors are
    averaged component-wise.

    Args:
        sentence: The sentence text.

    Returns:
        A 1-D array of length ``wv.vector_size``. When no token survives
        preprocessing and the vocabulary lookup (empty string, only stop
        words, only out-of-vocabulary words) a zero vector is returned.
        Averaging an empty list would otherwise yield a NaN scalar plus a
        RuntimeWarning, which breaks or poisons any later averaging.
    """
    words = preprocess_text(sentence)

    word_embeddings = [wv[word] for word in words if word in wv]

    if not word_embeddings:
        # No usable word: return a neutral vector of the right shape, not NaN.
        return np.zeros(wv.vector_size, dtype=np.float32)

    return np.mean(word_embeddings, axis=0)

In [8]:
def get_paragraph_embedding(paragraph):
    """Embed a paragraph as the mean of its sentence embeddings.

    The paragraph is split with ``nltk.sent_tokenize`` and each sentence is
    embedded with ``get_sentence_embedding``. Sentences that yield no usable
    vector are left out of the average.

    Args:
        paragraph: The paragraph text. A non-string value (for example
            ``None`` or the NaN pandas uses for an empty cell) or a blank
            string is treated as "nothing to embed".

    Returns:
        A 1-D array with the component-wise mean of the usable sentence
        embeddings, or a zero vector of length ``wv.vector_size`` when there
        is nothing to average.
    """
    if not isinstance(paragraph, str) or not paragraph.strip():
        # Missing or blank text: sent_tokenize would fail or return nothing.
        return np.zeros(wv.vector_size, dtype=np.float32)

    usable = []
    for sentence in nltk.sent_tokenize(paragraph):
        vector = np.asarray(get_sentence_embedding(sentence))
        # A sentence with no in-vocabulary words carries no signal: np.mean of
        # an empty list is a NaN scalar. An all-zero placeholder vector is
        # treated the same way, so neither skews the paragraph mean.
        if vector.ndim == 1 and np.isfinite(vector).all() and vector.any():
            usable.append(vector)

    if not usable:
        return np.zeros(wv.vector_size, dtype=np.float32)

    return np.mean(usable, axis=0)

In [9]:
# Smoke-test the embedding pipeline on a short two-sentence paragraph.
paragraph = "This is a sample paragraph. It contains multiple sentences."
paragraph_embedding = get_paragraph_embedding(paragraph)

In [10]:
# Dimensionality of the sample paragraph embedding.
paragraph_embedding.shape

(300,)

In [11]:
# Display the sample embedding.
# NOTE(review): this prints every component; a slice would keep the output short.
paragraph_embedding

array([-0.03771464, -0.0402832 ,  0.19514975, -0.04826355, -0.02701823,
       -0.05029297,  0.083903  ,  0.04135132,  0.05739339,  0.13986206,
       -0.11649577, -0.05000814,  0.01379395,  0.15749106, -0.07039388,
        0.10304768, -0.0608724 ,  0.12445068, -0.15877278, -0.16764322,
       -0.02596029, -0.00866699, -0.22224934,  0.02697754, -0.02156575,
       -0.09277344, -0.05086263,  0.08154297,  0.02331543,  0.15952554,
       -0.08418783, -0.10685221, -0.04372152,  0.06189982,  0.00406901,
        0.17419434,  0.01103719, -0.01224772, -0.08784994,  0.08756511,
        0.08237712,  0.09376526,  0.01020304,  0.13675945,  0.00199382,
       -0.13242593, -0.03637695,  0.04398092, -0.21626791, -0.06312244,
       -0.07623291, -0.20450847, -0.10894775, -0.0921224 ,  0.01692708,
       -0.03045654,  0.06778971, -0.08264669, -0.08026123, -0.05033366,
        0.14855957,  0.0328776 ,  0.00714111, -0.06003825,  0.06092326,
       -0.04003906, -0.03932698,  0.00223796, -0.02351888, -0.05

In [12]:
# Load the existing attribute/split file; later cells read and replace its
# entries by variable name ('allclasses_names', 'att').
att_mat = sio.loadmat('att_splits.mat')

In [13]:
# Class names as stored in the .mat file; each entry is wrapped in nested
# arrays (see the output below) and is unwrapped in the next cell.
all_cls = att_mat['allclasses_names']

all_cls

array([[array(['002.Laysan_Albatross'], dtype='<U20')],
       [array(['003.Sooty_Albatross'], dtype='<U19')],
       [array(['005.Crested_Auklet'], dtype='<U18')],
       [array(['007.Parakeet_Auklet'], dtype='<U19')],
       [array(['010.Red_winged_Blackbird'], dtype='<U24')],
       [array(['011.Rusty_Blackbird'], dtype='<U19')],
       [array(['012.Yellow_headed_Blackbird'], dtype='<U27')],
       [array(['013.Bobolink'], dtype='<U12')],
       [array(['015.Lazuli_Bunting'], dtype='<U18')],
       [array(['016.Painted_Bunting'], dtype='<U19')],
       [array(['017.Cardinal'], dtype='<U12')],
       [array(['018.Spotted_Catbird'], dtype='<U19')],
       [array(['019.Gray_Catbird'], dtype='<U16')],
       [array(['020.Yellow_breasted_Chat'], dtype='<U24')],
       [array(['021.Eastern_Towhee'], dtype='<U18')],
       [array(['022.Chuck_will_Widow'], dtype='<U20')],
       [array(['024.Red_faced_Cormorant'], dtype='<U23')],
       [array(['025.Pelagic_Cormorant'], dtype='<U21')],
    

In [14]:
# Unwrap the nested arrays: take the single column of the (N, 1) name array and
# pull the string out of each one-element inner array.
all_cls_org = [names[0] for names in all_cls[:, 0]]

all_cls_org

['002.Laysan_Albatross',
 '003.Sooty_Albatross',
 '005.Crested_Auklet',
 '007.Parakeet_Auklet',
 '010.Red_winged_Blackbird',
 '011.Rusty_Blackbird',
 '012.Yellow_headed_Blackbird',
 '013.Bobolink',
 '015.Lazuli_Bunting',
 '016.Painted_Bunting',
 '017.Cardinal',
 '018.Spotted_Catbird',
 '019.Gray_Catbird',
 '020.Yellow_breasted_Chat',
 '021.Eastern_Towhee',
 '022.Chuck_will_Widow',
 '024.Red_faced_Cormorant',
 '025.Pelagic_Cormorant',
 '026.Bronzed_Cowbird',
 '027.Shiny_Cowbird',
 '028.Brown_Creeper',
 '030.Fish_Crow',
 '032.Mangrove_Cuckoo',
 '039.Least_Flycatcher',
 '040.Olive_sided_Flycatcher',
 '041.Scissor_tailed_Flycatcher',
 '042.Vermilion_Flycatcher',
 '044.Frigatebird',
 '045.Northern_Fulmar',
 '046.Gadwall',
 '047.American_Goldfinch',
 '048.European_Goldfinch',
 '050.Eared_Grebe',
 '052.Pied_billed_Grebe',
 '054.Blue_Grosbeak',
 '055.Evening_Grosbeak',
 '056.Pine_Grosbeak',
 '057.Rose_breasted_Grosbeak',
 '058.Pigeon_Guillemot',
 '059.California_Gull',
 '060.Glaucous_winged_Gu

In [15]:
# Drop the numeric "NNN." prefix from every class name, updating the list in
# place. Taking the last piece of a single split (rather than index 1) leaves
# names that are already stripped untouched, so re-running this cell no longer
# raises IndexError.
all_cls_org[:] = [c.split('.', 1)[-1] for c in all_cls_org]

all_cls_org

['Laysan_Albatross',
 'Sooty_Albatross',
 'Crested_Auklet',
 'Parakeet_Auklet',
 'Red_winged_Blackbird',
 'Rusty_Blackbird',
 'Yellow_headed_Blackbird',
 'Bobolink',
 'Lazuli_Bunting',
 'Painted_Bunting',
 'Cardinal',
 'Spotted_Catbird',
 'Gray_Catbird',
 'Yellow_breasted_Chat',
 'Eastern_Towhee',
 'Chuck_will_Widow',
 'Red_faced_Cormorant',
 'Pelagic_Cormorant',
 'Bronzed_Cowbird',
 'Shiny_Cowbird',
 'Brown_Creeper',
 'Fish_Crow',
 'Mangrove_Cuckoo',
 'Least_Flycatcher',
 'Olive_sided_Flycatcher',
 'Scissor_tailed_Flycatcher',
 'Vermilion_Flycatcher',
 'Frigatebird',
 'Northern_Fulmar',
 'Gadwall',
 'American_Goldfinch',
 'European_Goldfinch',
 'Eared_Grebe',
 'Pied_billed_Grebe',
 'Blue_Grosbeak',
 'Evening_Grosbeak',
 'Pine_Grosbeak',
 'Rose_breasted_Grosbeak',
 'Pigeon_Guillemot',
 'California_Gull',
 'Glaucous_winged_Gull',
 'Heermann_Gull',
 'Herring_Gull',
 'Ivory_Gull',
 'Ring_billed_Gull',
 'Slaty_backed_Gull',
 'Anna_Hummingbird',
 'Ruby_throated_Hummingbird',
 'Rufous_Hummin

In [16]:
# Turn the underscore-separated class names into space-separated ones,
# updating the list in place.
all_cls_org[:] = [name.replace('_', ' ') for name in all_cls_org]

all_cls_org

['Laysan Albatross',
 'Sooty Albatross',
 'Crested Auklet',
 'Parakeet Auklet',
 'Red winged Blackbird',
 'Rusty Blackbird',
 'Yellow headed Blackbird',
 'Bobolink',
 'Lazuli Bunting',
 'Painted Bunting',
 'Cardinal',
 'Spotted Catbird',
 'Gray Catbird',
 'Yellow breasted Chat',
 'Eastern Towhee',
 'Chuck will Widow',
 'Red faced Cormorant',
 'Pelagic Cormorant',
 'Bronzed Cowbird',
 'Shiny Cowbird',
 'Brown Creeper',
 'Fish Crow',
 'Mangrove Cuckoo',
 'Least Flycatcher',
 'Olive sided Flycatcher',
 'Scissor tailed Flycatcher',
 'Vermilion Flycatcher',
 'Frigatebird',
 'Northern Fulmar',
 'Gadwall',
 'American Goldfinch',
 'European Goldfinch',
 'Eared Grebe',
 'Pied billed Grebe',
 'Blue Grosbeak',
 'Evening Grosbeak',
 'Pine Grosbeak',
 'Rose breasted Grosbeak',
 'Pigeon Guillemot',
 'California Gull',
 'Glaucous winged Gull',
 'Heermann Gull',
 'Herring Gull',
 'Ivory Gull',
 'Ring billed Gull',
 'Slaty backed Gull',
 'Anna Hummingbird',
 'Ruby throated Hummingbird',
 'Rufous Hummin

In [17]:
# Re-read the description sheet and expose the row position as an 'index' column.
description = pd.read_excel('CUB_description.xlsx').reset_index()
description.head()

Unnamed: 0,index,Bird_name,Short_Description
0,0,Black footed Albatross,The Black-footed Albatross is a large seabird ...
1,1,Laysan Albatross,The Laysan Albatross is a large seabird with a...
2,2,Sooty Albatross,The Sooty Albatross is a large seabird known f...
3,3,Groove billed Ani,The Groove-billed Ani bird is a medium-sized b...
4,4,Crested Auklet,The Crested Auklet is a small seabird with a d...


In [18]:
# All description texts as a plain Python list.
# NOTE(review): `paragraphs` is not referenced by any later cell shown here —
# confirm it is still needed.
paragraphs = description['Short_Description'].tolist()

In [19]:
# Build one embedding per class, in the exact order of `all_cls_org`, so that
# row i of the stacked matrix always belongs to class i.
# Index the descriptions by bird name once instead of scanning the whole frame
# for every class; if a name appears more than once, the first row wins.
descriptions_by_name = (
    description.drop_duplicates(subset='Bird_name')
    .set_index('Bird_name')['Short_Description']
)

# A class without a description would silently shift every following row, so
# fail loudly instead of producing a misaligned matrix.
missing = [c for c in all_cls_org if c not in descriptions_by_name.index]
if missing:
    raise ValueError(f'No description found for classes: {missing}')

embedding = []

for c in all_cls_org:
    print('matched: ', c, c)
    text = descriptions_by_name.loc[c]
    embedding.append(get_paragraph_embedding(text))

matched:  Laysan Albatross Laysan Albatross
matched:  Sooty Albatross Sooty Albatross
matched:  Crested Auklet Crested Auklet
matched:  Parakeet Auklet Parakeet Auklet
matched:  Red winged Blackbird Red winged Blackbird
matched:  Rusty Blackbird Rusty Blackbird
matched:  Yellow headed Blackbird Yellow headed Blackbird
matched:  Bobolink Bobolink
matched:  Lazuli Bunting Lazuli Bunting
matched:  Painted Bunting Painted Bunting
matched:  Cardinal Cardinal
matched:  Spotted Catbird Spotted Catbird
matched:  Gray Catbird Gray Catbird
matched:  Yellow breasted Chat Yellow breasted Chat
matched:  Eastern Towhee Eastern Towhee
matched:  Chuck will Widow Chuck will Widow
matched:  Red faced Cormorant Red faced Cormorant
matched:  Pelagic Cormorant Pelagic Cormorant
matched:  Bronzed Cowbird Bronzed Cowbird
matched:  Shiny Cowbird Shiny Cowbird
matched:  Brown Creeper Brown Creeper
matched:  Fish Crow Fish Crow
matched:  Mangrove Cuckoo Mangrove Cuckoo
matched:  Least Flycatcher Least Flycatche

In [20]:
# Stack the per-class vectors row-wise: one row per entry appended to `embedding`.
embedding_array = np.vstack(embedding)

In [21]:
# Rows are classes, columns are embedding dimensions.
embedding_array.shape

(200, 300)

In [22]:
# Inspect the stacked embedding matrix.
embedding_array

array([[ 0.01282   ,  0.12378289, -0.07555536, ..., -0.00654936,
        -0.01321572,  0.04403346],
       [ 0.05318933,  0.11495602, -0.03587015, ..., -0.02683852,
         0.0127302 ,  0.03931695],
       [ 0.03355895,  0.10614226, -0.05151076, ..., -0.01503086,
         0.01835611,  0.01340824],
       ...,
       [ 0.04193296,  0.10132488, -0.08010884, ..., -0.03031019,
         0.0430311 ,  0.04874269],
       [ 0.01126219,  0.11278132, -0.05304326, ...,  0.00086981,
         0.00733344,  0.03628048],
       [ 0.00263848,  0.13838887, -0.08225323, ..., -0.03458618,
         0.0262956 ,  0.01994618]], dtype=float32)

In [23]:
# Value range of the embedding matrix.
print("Min:", embedding_array.min(), ", Max:", embedding_array.max())

Min: -0.2551851 , Max: 0.23057702


In [24]:
# NOTE(review): repeats the earlier shape check; `embedding_array` has not been
# reassigned in between.
embedding_array.shape

(200, 300)

In [25]:
# NOTE(review): repeats the earlier min/max check; `embedding_array` has not
# been reassigned in between.
print("Min:", embedding_array.min(), ", Max:", embedding_array.max())

Min: -0.2551851 , Max: 0.23057702


In [26]:
# Shape of the attribute matrix currently stored in the split file
# (presumably attributes x classes — confirm).
att_mat['att'].shape

(312, 200)

In [27]:
# The embedding matrix is (classes x dim) and is stored transposed, so its row
# count must equal the column count of the matrix it replaces. Check that and
# the values first: a mismatch or NaN would otherwise be written to disk
# without any error and only surface in downstream code.
if embedding_array.shape[0] != att_mat['att'].shape[1]:
    raise ValueError(
        f"Expected {att_mat['att'].shape[1]} class embeddings, "
        f"got {embedding_array.shape[0]}"
    )
if not np.isfinite(embedding_array).all():
    raise ValueError("embedding_array contains NaN or infinite values")
att_mat['att'] = embedding_array.T

In [28]:
# Shape of the 'att' entry after it was replaced by the transposed embeddings.
att_mat['att'].shape

(300, 200)

In [29]:
# Write the loaded dict, with 'att' replaced by the description embeddings, to
# a new .mat file; the original 'att_splits.mat' is left untouched.
sio.savemat('w2v_description_cub.mat', att_mat)