In [1]:
import pandas as pd

In [2]:
links = pd.read_csv('train.csv').wiki_url.unique().tolist()[2:]
links[:5]

['http://en.wikipedia.org/wiki/Germany',
 'http://en.wikipedia.org/wiki/United_Kingdom',
 'http://en.wikipedia.org/wiki/Brussels',
 'http://en.wikipedia.org/wiki/European_Commission',
 'http://en.wikipedia.org/wiki/European_Union']

In [3]:
titles = [' '.join(link[29:].split('_')) for link in links]
titles[:5]

['Germany',
 'United Kingdom',
 'Brussels',
 'European Commission',
 'European Union']

In [4]:
redirect = pd.read_csv('enwiki_redirects.tsv', sep='\t', header=None)
redirect.head(5)

Unnamed: 0,0,1
0,!,Exclamation mark
1,! (CONFIG.SYS directive),CONFIG.SYS
2,! (Donnie Vie Album),Donnie Vie
3,! (Donnie Vie album),Donnie Vie
4,! (The Song Formerly Known As),Unit (album)


In [5]:
wiki_des= pd.read_csv('wiki_items.csv')
wiki_des.head()

Unnamed: 0,item_id,en_label,en_description,wikipedia_title
0,1,Universe,totality of space and all contents,Universe
1,2,Earth,third planet from the Sun in the Solar System,Earth
2,3,life,matter capable of extracting energy from the e...,Life
3,4,death,permanent cessation of vital functions,Death
4,5,human,"common name of Homo sapiens, unique extant spe...",Human


In [6]:
title_pd = pd.DataFrame(links, titles).reset_index()
title_pd.columns = ['title', 'link']
title_pd

Unnamed: 0,title,link
0,Germany,http://en.wikipedia.org/wiki/Germany
1,United Kingdom,http://en.wikipedia.org/wiki/United_Kingdom
2,Brussels,http://en.wikipedia.org/wiki/Brussels
3,European Commission,http://en.wikipedia.org/wiki/European_Commission
4,European Union,http://en.wikipedia.org/wiki/European_Union
...,...,...
3985,Tadayuki Okada,http://en.wikipedia.org/wiki/Tadayuki_Okada
3986,Carlos Checa,http://en.wikipedia.org/wiki/Carlos_Checa
3987,Shinichi Ito,http://en.wikipedia.org/wiki/Shinichi_Ito
3988,Bob May (golfer),http://en.wikipedia.org/wiki/Bob_May_(golfer)


In [7]:
merged_wiki = wiki_des.merge(title_pd, left_on='wikipedia_title', right_on='title', how='right')
merged_wiki = merged_wiki.drop(columns=[ 'en_label', 'title'])
merged_wiki['en_description'].fillna(merged_wiki['wikipedia_title'], inplace=True)

merged_wiki

Unnamed: 0,item_id,en_description,wikipedia_title,link
0,183,federal parliamentary republic in central-west...,Germany,http://en.wikipedia.org/wiki/Germany
1,145,constitutional monarchy in Western Europe; for...,United Kingdom,http://en.wikipedia.org/wiki/United_Kingdom
2,240,federal region of Belgium comprising 19 munici...,Brussels,http://en.wikipedia.org/wiki/Brussels
3,8880,"executive branch of the European Union, respon...",European Commission,http://en.wikipedia.org/wiki/European_Commission
4,458,economic and political union of states mostly ...,European Union,http://en.wikipedia.org/wiki/European_Union
...,...,...,...,...
3985,934327,Japanese motorcycle racer,Tadayuki Okada,http://en.wikipedia.org/wiki/Tadayuki_Okada
3986,461580,Motorcycle rider,Carlos Checa,http://en.wikipedia.org/wiki/Carlos_Checa
3987,1334813,Japanese motorcycle racer,Shinichi Ito,http://en.wikipedia.org/wiki/Shinichi_Ito
3988,4933296,professional golfer,Bob May (golfer),http://en.wikipedia.org/wiki/Bob_May_(golfer)


In [8]:
merged_wiki.to_csv('train_wiki.csv', index=False)

In [2]:
merged_wiki = pd.read_csv('train_wiki.csv')

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [4]:
from tqdm.auto import tqdm

In [9]:
batch_size = 16
all_embeddings = []

for i in tqdm(range(0, len(merged_wiki), batch_size)):
    batch_df = merged_wiki['en_description'].iloc[i:i+batch_size].tolist()
    try:
        # Tokenize the batch of sentences
        tokens = tokenizer(batch_df, return_tensors="pt", padding=True, truncation=True)

        # Get the embeddings from the model
        outputs = model(**tokens)

        # Extract the [CLS] token embeddings for the entire batch
        cls_embeddings = outputs.last_hidden_state[:, 0, :].detach()
        all_embeddings.append(cls_embeddings)

    except:
        print(batch_df)

    print(f"Batch {i//batch_size + 1} embeddings:", cls_embeddings.shape)

  0%|          | 0/250 [00:00<?, ?it/s]

Batch 1 embeddings: torch.Size([16, 768])
Batch 2 embeddings: torch.Size([16, 768])
Batch 3 embeddings: torch.Size([16, 768])
Batch 4 embeddings: torch.Size([16, 768])
Batch 5 embeddings: torch.Size([16, 768])
Batch 6 embeddings: torch.Size([16, 768])
Batch 7 embeddings: torch.Size([16, 768])
Batch 8 embeddings: torch.Size([16, 768])
Batch 9 embeddings: torch.Size([16, 768])
Batch 10 embeddings: torch.Size([16, 768])
Batch 11 embeddings: torch.Size([16, 768])
Batch 12 embeddings: torch.Size([16, 768])
Batch 13 embeddings: torch.Size([16, 768])
Batch 14 embeddings: torch.Size([16, 768])
Batch 15 embeddings: torch.Size([16, 768])
Batch 16 embeddings: torch.Size([16, 768])
Batch 17 embeddings: torch.Size([16, 768])
Batch 18 embeddings: torch.Size([16, 768])
Batch 19 embeddings: torch.Size([16, 768])
Batch 20 embeddings: torch.Size([16, 768])
Batch 21 embeddings: torch.Size([16, 768])
Batch 22 embeddings: torch.Size([16, 768])
Batch 23 embeddings: torch.Size([16, 768])
Batch 24 embeddings:

In [10]:
all_embeddings_tensor = torch.concatenate(all_embeddings, axis=0)
print(all_embeddings_tensor.shape)
# Save the tensor to a .pt file
torch.save(all_embeddings_tensor, "embeddings.pt")

torch.Size([3990, 768])
