In [225]:
import wikipediaapi
from dotenv import load_dotenv
import os
# Wikipedia article setup
load_dotenv()

def fetchWiki(term, wiki_lang="en"):
    wiki = wikipediaapi.Wikipedia(user_agent=os.getenv('wikipedia_useragent'), language=wiki_lang)
    page = wiki.page(term)
    if page.exists():
        return page.summary, page.fullurl, page.text
    else:
        print(f"{term} could not be found")
        return "", ""

In [246]:
%%time 
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

df = pd.DataFrame(columns=['title', 'summary', 'url', 'embedding'])

#Lajavaness/bilingual-embedding-base, mixedbread-ai/mxbai-embed-large-v1, nomic-ai/nomic-embed-text-v1
embd = SentenceTransformer(model_name_or_path='Lajavaness/bilingual-embedding-base', trust_remote_code=True, device='cpu')
#900 rows takes ~! minute

with open('input.txt', 'r') as f:
    for line in f:
        line = line.strip()
        summary, fullurl, text = fetchWiki(line)
        embedding = embd.encode(line)
        embedding = embd.encode(summary)
        df.loc[len(df)] = [line, summary, fullurl, embedding]
        lines = text.split("\n")
        for l in lines:
            #make sure the passage is more than 5 sentences (arbitrarily chosen, but a good estimate for paragraphs vs non-paragraphs)
            if len(l.split(".")) < 5: continue
            e = embd.encode(l)
            df.loc[len(df)] = [line, l, fullurl, e]
            
df.to_csv("output.csv")
df

CPU times: user 2min 32s, sys: 14.3 s, total: 2min 46s
Wall time: 50.2 s


Unnamed: 0,title,summary,url,embedding
0,United States,"The United States of America (USA), also known...",https://en.wikipedia.org/wiki/United_States,"[-0.055632256, 0.051498998, 0.02838714, -0.024..."
1,United States,"The United States of America (USA), also known...",https://en.wikipedia.org/wiki/United_States,"[-0.04639075, 0.037411842, 0.025962226, -0.043..."
2,United States,Paleo-Indians migrated to North America across...,https://en.wikipedia.org/wiki/United_States,"[-0.054831214, 0.06579868, 0.014245398, -0.018..."
3,United States,The U.S. national government is a presidential...,https://en.wikipedia.org/wiki/United_States,"[-0.03631137, 0.028581496, -0.014794229, 0.002..."
4,United States,"One of the world's most developed countries, t...",https://en.wikipedia.org/wiki/United_States,"[-0.045689866, 0.044503246, 0.042601593, -0.00..."
...,...,...,...,...
661,France,Apart from its strong and innovative film trad...,https://en.wikipedia.org/wiki/France,"[0.00053943426, -0.051579565, -0.033470318, -0..."
662,France,The most influential news magazines are the le...,https://en.wikipedia.org/wiki/France,"[0.10223055, 0.04545464, -0.011219817, 0.06835..."
663,France,Different regions have different styles. In th...,https://en.wikipedia.org/wiki/France,"[0.020078026, 0.012435243, -0.011439593, 0.044..."
664,France,French cuisine is also regarded as a key eleme...,https://en.wikipedia.org/wiki/France,"[0.04084742, 0.022281775, 0.015200411, 0.07634..."


In [247]:
import re
import pandas as pd
import numpy as np
import torch

paragraph_and_embedding_df = pd.read_csv('output.csv')
# clean data because csv file stores it weirdly
paragraph_and_embedding_df['embedding'] = paragraph_and_embedding_df['embedding'].apply(lambda x: np.array(x.strip("[]").split(), dtype=np.float32))
embeddings = torch.tensor(np.stack(paragraph_and_embedding_df['embedding'].tolist(), axis=0))
paragraph_and_embedding_df

Unnamed: 0.1,Unnamed: 0,title,summary,url,embedding
0,0,United States,"The United States of America (USA), also known...",https://en.wikipedia.org/wiki/United_States,"[-0.055632256, 0.051498998, 0.02838714, -0.024..."
1,1,United States,"The United States of America (USA), also known...",https://en.wikipedia.org/wiki/United_States,"[-0.04639075, 0.037411842, 0.025962226, -0.043..."
2,2,United States,Paleo-Indians migrated to North America across...,https://en.wikipedia.org/wiki/United_States,"[-0.054831214, 0.06579868, 0.014245398, -0.018..."
3,3,United States,The U.S. national government is a presidential...,https://en.wikipedia.org/wiki/United_States,"[-0.03631137, 0.028581496, -0.014794229, 0.002..."
4,4,United States,"One of the world's most developed countries, t...",https://en.wikipedia.org/wiki/United_States,"[-0.04568987, 0.04450325, 0.04260159, -0.00954..."
...,...,...,...,...,...
661,661,France,Apart from its strong and innovative film trad...,https://en.wikipedia.org/wiki/France,"[0.00053943426, -0.051579565, -0.033470318, -0..."
662,662,France,The most influential news magazines are the le...,https://en.wikipedia.org/wiki/France,"[0.10223055, 0.04545464, -0.011219817, 0.06835..."
663,663,France,Different regions have different styles. In th...,https://en.wikipedia.org/wiki/France,"[0.020078026, 0.012435243, -0.011439593, 0.044..."
664,664,France,French cuisine is also regarded as a key eleme...,https://en.wikipedia.org/wiki/France,"[0.04084742, 0.022281775, 0.015200411, 0.07634..."


In [265]:
%%time
from sentence_transformers import SentenceTransformer

embd = SentenceTransformer(model_name_or_path='Lajavaness/bilingual-embedding-base', trust_remote_code=True, device='cpu')

query = "Germany"
query_embd = embd.encode(query, convert_to_tensor=True)

'''
def cosine_similarity(v1, v2):
    dot = torch.dot(v1, v2)
    
    norm_v1 = torch.sqrt(torch.sum(v1**2))
    norm_v2 = torch.sqrt(torch.sum(v2**2))
    
    return dot / (norm_v1 * norm_v2)

def cosine_score(a, b):
    return torch.tensor([cosine_similarity(a, x) for x in b])
'''

def cosine_score(a, b):
    norm_a = torch.norm(a) 
    norm_b = torch.norm(b, dim=1)
    dot_product = torch.matmul(b, a) 
    return dot_product / (norm_b * norm_a)

cos_score = cosine_score(a=query_embd, b=embeddings)


top5 = torch.topk(cos_score, k=5)
top5

CPU times: user 641 ms, sys: 137 ms, total: 778 ms
Wall time: 2.81 s


torch.return_types.topk(
values=tensor([0.3509, 0.3357, 0.3061, 0.2971, 0.2970]),
indices=tensor([564, 488, 594, 611,  48]))

In [267]:
paragraph_and_embedding_df.iloc[564]['summary']

"In 2010, 82.5% of the UK population were Internet users, the highest proportion among the 20 countries with the largest total number of users in that year. The British video game industry is the largest in Europe, and, since 2022, the UK has the largest video game market in Europe by sales, overtaking Germany. It is the world's third-largest producer of video games after Japan and the United States."