In [11]:
import numpy as np
import pandas as pd

In [52]:
# Load the CSV, parsing `update_date` as datetimes, then preview the first rows.
df = pd.read_csv('data/recent_20k.csv', parse_dates=['update_date'])
df.head()

Unnamed: 0,id,title,comments,categories,abstract,update_date,authors_parsed,new_cat
0,2305.05127,Accelerated gradient descent method for functi...,31 pages,math.OC cs.NA math.NA,We consider problems of minimizing functiona...,2023-05-19,"[['Tanaka', ""Ken'ichiro"", '']]","['math.OC', 'cs.NA', 'math.NA']"
1,2305.10888,"Universal Proof Theory, TACL 2022 Lecture Notes",,math.LO cs.LO,The subject of these short lecture notes is ...,2023-05-19,"[['Iemhoff', 'Rosalie', ''], ['Jalali', 'Rahel...","['math.LO', 'cs.LO']"
2,2305.10849,Extreme ATM skew in a local volatility model w...,,q-fin.MF math.PR,This paper concerns a local volatility model...,2023-05-19,"[['Gairat', 'Alexander', ''], ['Shcherbakov', ...","['q-fin.MF', 'math.PR']"
3,2305.10851,Variational Bihamiltonian Cohomologies and Int...,,nlin.SI math-ph math.MP,For an integrable hierarchy which possesses ...,2023-05-19,"[['Liu', 'Si-Qi', ''], ['Wang', 'Zhe', ''], ['...","['nlin.SI', 'math-ph', 'math.MP']"
4,2305.10852,Q-SHED: Distributed Optimization at the Edge v...,,eess.SY cs.LG cs.MA cs.SY math.OC,Edge networks call for communication efficie...,2023-05-19,"[['Fabbro', 'Nicolò Dal', ''], ['Rossi', 'Mich...","['eess.SY', 'cs.LG', 'cs.MA', 'cs.SY', 'math.OC']"


In [53]:
# Row and column counts of the loaded frame.
df.shape

(20000, 8)

In [14]:
from data_utils import clean_data

In [54]:
# Run `clean_data` on every abstract and keep the result in a new column
# next to the raw text.
df['clean_abstract'] = df['abstract'].map(clean_data)

In [6]:
# Preview the first few cleaned abstracts.
df['clean_abstract'].head()

0    we consider problems of minimizing functionals...
1    the subject of these short lecture notes is a ...
2    this paper concerns a local volatility model i...
3    for an integrable hierarchy which possesses a ...
4    edge networks call for communication efficient...
Name: clean_abstract, dtype: object

In [7]:
from sentence_transformers import SentenceTransformer

In [55]:
# Take the first 1,000 cleaned abstracts as a small trial batch
# (sliced positionally before converting to a list), then display the first one.
docs = df['clean_abstract'].iloc[:1000].to_list()
docs[0]

'we consider problems of minimizing functionals of probability measures on the euclidean space to propose an accelerated gradient descent algorithm for such problems we consider gradient flow of transport maps that give push forward measures of an initial measure then we propose a deterministic accelerated algorithm by extending nesterovs acceleration technique with momentum this algorithm do not based on the wasserstein geometry furthermore to estimate the convergence rate of the accelerated algorithm we introduce new convexity and smoothness for based on transport maps as a result we can show that the accelerated algorithm converges faster than a normal gradient descent algorithm numerical experiments support this theoretical result'

In [18]:
# Load the 'all-MiniLM-L6-v2' SentenceTransformer model and embed the
# trial batch held in `docs`.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = sentence_model.encode(docs)

Encoding these 1,000 abstracts took only about 20 seconds.

In [20]:
# Check the shape of the trial-batch embeddings (documents x embedding dimensions).
embeddings.shape

(1000, 384)

In [24]:
# Scale the experiment up: first 2,000 cleaned abstracts.
docs_2k = df['clean_abstract'].iloc[:2000].to_list()

In [25]:
# Embed the 2,000-document batch.
embeddings_2k = sentence_model.encode(docs_2k)

In [27]:
# Scale up again: first 5,000 cleaned abstracts, embedded in a single call.
docs_5k = df['clean_abstract'].iloc[:5000].to_list()
embeddings_5k = sentence_model.encode(docs_5k)

In [30]:
# Confirm the shape of the 5,000-document embeddings.
embeddings_5k.shape

(5000, 384)

In [57]:
# Encode every cleaned abstract in chunks of CHUNK_SIZE rows.
# The number of chunks is derived from the data rather than hard-coded, so
# every row is covered (and no empty chunk is encoded) whatever the frame's
# length.
CHUNK_SIZE = 5000

# Build the list of documents once instead of once per chunk.
# A separate name is used so the `docs` trial batch from the earlier cell
# is not overwritten.
all_docs = df['clean_abstract'].to_list()

# `embeddings` is a list holding one encode() result per chunk, in row order.
embeddings = [
    sentence_model.encode(all_docs[start:start + CHUNK_SIZE])
    for start in range(0, len(all_docs), CHUNK_SIZE)
]


In [58]:
# Inspect the embeddings produced for the first chunk.
embeddings[0]

array([[ 0.00866919, -0.09152839,  0.00381073, ...,  0.01778578,
         0.01220374, -0.05651127],
       [-0.08063693,  0.04923205, -0.01266916, ...,  0.06128307,
         0.086666  ,  0.02369638],
       [-0.05104776,  0.05254225,  0.04349861, ..., -0.04011452,
        -0.03792777,  0.09876525],
       ...,
       [-0.04170634,  0.04043854, -0.02808688, ...,  0.05767613,
        -0.01312666,  0.03856558],
       [-0.06862701, -0.01309697, -0.01534034, ..., -0.03284089,
         0.03243253, -0.02368657],
       [-0.0557962 , -0.02677958, -0.02940826, ...,  0.0046546 ,
        -0.10675794,  0.02323357]], dtype=float32)

In [60]:
# Print the shape of each chunk's embedding array.
# Iterating over the list itself avoids repeating the chunk count here, so
# this stays correct if the number of chunks changes.
for chunk_embeddings in embeddings:
    print(chunk_embeddings.shape)

(5000, 384)
(5000, 384)
(5000, 384)
(5000, 384)


In [61]:
# Join the per-chunk arrays along the row axis into a single array.
all_embed = np.concatenate(embeddings, axis=0)

In [62]:
# Confirm the shape of the combined embeddings.
all_embed.shape

(20000, 384)

In [63]:
# Wrap the combined embeddings in a DataFrame and preview the first rows.
embed_df = pd.DataFrame(all_embed)
embed_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,0.008669,-0.091528,0.003811,-0.022832,-0.002555,0.010055,0.044759,-0.046527,0.011216,-0.018647,...,0.024168,-0.024272,-0.017991,-0.092372,-0.05276,0.053505,0.023691,0.017786,0.012204,-0.056511
1,-0.080637,0.049232,-0.012669,-0.040947,0.078689,0.017134,-0.001301,0.077127,0.05378,0.029866,...,-0.00103,0.041122,0.020294,-0.022761,0.003198,-0.020421,0.102561,0.061283,0.086666,0.023696
2,-0.051048,0.052542,0.043499,-0.052117,0.004466,-0.008125,0.021389,0.011464,0.106068,-0.047217,...,-0.038333,0.016619,0.077101,-0.044441,-0.057435,-0.038067,0.044717,-0.040115,-0.037928,0.098765
3,-0.053897,-0.045639,0.111787,-0.06316,0.055485,0.017214,-0.009675,-0.033952,-0.024027,-0.106168,...,-0.012136,0.034463,0.025183,-0.024089,-0.011249,-0.01887,0.056149,0.037615,0.074076,-0.004829
4,-0.006157,0.013721,-0.042213,-0.004302,0.011132,-0.053676,-0.024461,-0.084382,-0.04981,0.019546,...,0.040782,-0.012627,0.022656,-0.079626,0.042765,0.084743,-0.090065,-0.017706,-0.082271,0.001171


In [64]:
# Write the embeddings to a CSV file in the working directory.
# NOTE(review): the file name says "2k" but `all_embed` has 20,000 rows;
# confirm whether "20k" was intended before renaming, since other code may
# read this path.
embed_df.to_csv('embed_2k_all-MiniLM-L6-v2.csv')