In [1]:
import pandas as pd
import numpy as np
import torch

import fasttext

from huggingface_hub import hf_hub_download

import time

from tqdm import tqdm

import faiss

from datasets import load_dataset

from pprint import pprint

import logging
logging.basicConfig(level=logging.INFO)

len_dataset = 2326839

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda', index=0)

In [2]:
model_path = hf_hub_download(repo_id="facebook/fasttext-et-vectors", filename="model.bin")

In [3]:
dataset = load_dataset('json', data_files='dataset/arxiv_data.json', split='train', streaming=True)

In [4]:
model = fasttext.load_model(model_path)



In [5]:
def save_to_disk(data, filename):
    np.savez(filename, data)

In [6]:
len(model.words)

2000000

In [7]:
t = next(iter(dataset))

In [8]:
pprint(t['abstract'])

('  A fully differential calculation in perturbative quantum chromodynamics '
 'is\n'
 'presented for the production of massive photon pairs at hadron colliders. '
 'All\n'
 'next-to-leading order perturbative contributions from quark-antiquark,\n'
 'gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\n'
 'all-orders resummation of initial-state gluon radiation valid at\n'
 'next-to-next-to-leading logarithmic accuracy. The region of phase space is\n'
 'specified in which the calculation is most reliable. Good agreement is\n'
 'demonstrated with data from the Fermilab Tevatron, and predictions are made '
 'for\n'
 'more detailed tests with CDF and DO data. Predictions are shown for\n'
 'distributions of diphoton pairs produced at the energy of the Large Hadron\n'
 'Collider (LHC). Distributions of the diphoton pairs from the decay of a '
 'Higgs\n'
 'boson are contrasted with those produced from QCD processes at the LHC, '
 'showing\n'
 'that enhanced sensitivity t

In [9]:
t = t['abstract'].split('.')

In [10]:
len(t)

7

In [11]:
t = [i.strip('\n') for i in t]
t = [i.replace('\n', ' ') for i in t]
t

['  A fully differential calculation in perturbative quantum chromodynamics is presented for the production of massive photon pairs at hadron colliders',
 ' All next-to-leading order perturbative contributions from quark-antiquark, gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as all-orders resummation of initial-state gluon radiation valid at next-to-next-to-leading logarithmic accuracy',
 ' The region of phase space is specified in which the calculation is most reliable',
 ' Good agreement is demonstrated with data from the Fermilab Tevatron, and predictions are made for more detailed tests with CDF and DO data',
 ' Predictions are shown for distributions of diphoton pairs produced at the energy of the Large Hadron Collider (LHC)',
 ' Distributions of the diphoton pairs from the decay of a Higgs boson are contrasted with those produced from QCD processes at the LHC, showing that enhanced sensitivity to the signal can be obtained with judicious selection of eve

In [12]:
def process_abstract(dset):
    for i in dset:
        abstract = i['abstract'].split('.')
        abstract = [i.strip('\n') for i in abstract]
        abstract = [i.replace('\n', ' ') for i in abstract]
        yield abstract

processed_abstracts = process_abstract(dataset)

In [13]:
pprint(next(iter(processed_abstracts)))

['  A fully differential calculation in perturbative quantum chromodynamics is '
 'presented for the production of massive photon pairs at hadron colliders',
 ' All next-to-leading order perturbative contributions from quark-antiquark, '
 'gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as '
 'all-orders resummation of initial-state gluon radiation valid at '
 'next-to-next-to-leading logarithmic accuracy',
 ' The region of phase space is specified in which the calculation is most '
 'reliable',
 ' Good agreement is demonstrated with data from the Fermilab Tevatron, and '
 'predictions are made for more detailed tests with CDF and DO data',
 ' Predictions are shown for distributions of diphoton pairs produced at the '
 'energy of the Large Hadron Collider (LHC)',
 ' Distributions of the diphoton pairs from the decay of a Higgs boson are '
 'contrasted with those produced from QCD processes at the LHC, showing that '
 'enhanced sensitivity to the signal can be obta

In [14]:
model.get_sentence_vector(next(iter(processed_abstracts))[0])

array([-1.49317337e-02,  2.72806939e-02,  3.60777117e-02,  1.95817463e-02,
        3.15512195e-02, -4.58300533e-03, -5.16738929e-03,  6.39291573e-03,
        3.39215510e-02, -2.49712802e-02, -6.32433407e-03,  1.26604282e-03,
       -1.94225423e-02, -1.96013395e-02, -9.65498481e-03, -4.62039281e-03,
        2.14990508e-02, -7.99252018e-02,  2.58581601e-02,  6.83950335e-02,
        1.41811287e-02,  9.81273782e-03, -9.34818294e-03,  2.98955720e-02,
        6.07587062e-02, -2.72658411e-02,  9.87942051e-03,  3.92780639e-02,
       -2.63100788e-02,  3.10591748e-03, -1.67004857e-02,  8.72719660e-03,
        4.78854217e-02,  6.86550327e-03,  3.29383388e-02,  5.99094713e-03,
       -2.10483242e-02, -6.47103554e-03,  1.47754075e-02,  4.51184660e-02,
        2.03631520e-02, -4.08341410e-03, -2.74400935e-02, -3.17543633e-02,
        3.84735540e-02,  5.99512719e-02, -4.81584482e-02, -3.46491821e-02,
       -7.74970045e-03, -1.24374246e-02,  1.69636384e-02, -1.84025541e-02,
        1.17025487e-01,  

In [15]:
%%time
embeddings = []
local_chunk = []

it = 0
for i in tqdm(processed_abstracts): # abstract
    for j in i: # sentence
        local_chunk.append(model.get_sentence_vector(j))
    embeddings.append(np.array(local_chunk).mean(axis=0))
    local_chunk = []
    it += 1
        
    if it % 128000 == 0:
        embeddings = np.array(embeddings)
        save_to_disk(embeddings, f"fasttext_embeddings/embeddings_{it}.npz")
        print(f"Saved embeddings_{it}.npz")
        embeddings = embeddings.tolist()
        embeddings = []

if len(embeddings) > 0:
    embeddings = np.array(embeddings)
    save_to_disk(embeddings, f"fasttext_embeddings/embeddings_{it}.npz")
    embeddings = embeddings.tolist()
    embeddings = []
        
    
    

127832it [00:39, 3280.14it/s]

Saved embeddings_128000.npz


255728it [01:22, 3169.09it/s]

Saved embeddings_256000.npz


383738it [02:05, 3018.99it/s]

Saved embeddings_384000.npz


511977it [02:51, 2879.26it/s]

Saved embeddings_512000.npz


639893it [03:37, 2899.12it/s]

Saved embeddings_640000.npz


767895it [04:24, 2770.01it/s]

Saved embeddings_768000.npz


895965it [05:12, 2730.31it/s]

Saved embeddings_896000.npz


1023862it [06:02, 2752.55it/s]

Saved embeddings_1024000.npz


1151979it [06:52, 2721.02it/s]

Saved embeddings_1152000.npz


1279900it [07:44, 2618.56it/s]

Saved embeddings_1280000.npz


1407769it [08:36, 2570.25it/s]

Saved embeddings_1408000.npz


1535824it [09:29, 2451.81it/s]

Saved embeddings_1536000.npz


1663874it [10:21, 2435.98it/s]

Saved embeddings_1664000.npz


1791813it [11:15, 2463.71it/s]

Saved embeddings_1792000.npz


1919936it [12:09, 2598.88it/s]

Saved embeddings_1920000.npz


2047949it [12:55, 3264.34it/s]

Saved embeddings_2048000.npz


2175773it [13:30, 3981.10it/s]

Saved embeddings_2176000.npz


2303799it [14:02, 3505.47it/s]

Saved embeddings_2304000.npz


2326837it [14:09, 2737.79it/s]


CPU times: total: 2min 57s
Wall time: 14min 10s


In [22]:
def load_all_embeddings():
    e_list = []
    for i in range(1, 19):
        embeddings = np.load(f'fasttext_embeddings/embeddings_{i}000.npz')['arr_0'].reshape(-1, 300)
        e_list.append(embeddings)
    
    e = np.concatenate(e_list, axis=0)
    return e

embeddings_array = load_all_embeddings()

In [23]:
embeddings_array.shape

(2304000, 300)

In [24]:
index = faiss.IndexFlatL2(300)
index.add(embeddings_array)
print(index.ntotal)

2304000


In [25]:
faiss.write_index(index, "Indexes/fasttext_embeddings.index")

In [26]:
loaded_index = faiss.read_index("Indexes/fasttext_embeddings.index")

In [28]:
assert loaded_index.ntotal == 2304000