In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
from tqdm import tqdm
import gc
from nltk.tokenize import sent_tokenize

In [None]:
!pip install sentence-transformers

In [None]:
!pip install ktrain

In [None]:
import json
# Upper bound on the number of metadata records read from the snapshot.
MAX_RECORDS = 200000

# The file is read as JSON Lines: each non-empty line is parsed on its own.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the result
# does not depend on the platform's default locale.
data = []
with open("../input/arxiv/arxiv-metadata-oai-snapshot.json", 'r', encoding="utf-8") as f:
    for line in f:
        # ">=" stops at exactly MAX_RECORDS; the previous ">" check let one
        # extra record through.
        if len(data) >= MAX_RECORDS:
            break
        # A blank line (e.g. a trailing newline at end of file) is not a record.
        if not line.strip():
            continue
        data.append(json.loads(line))

In [None]:
def replace_all(text):
    """Flatten ``text`` onto a single line and strip its periods.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which every "." has been removed and every
        newline has been replaced by a single space.
    """
    # Newlines become a space instead of being deleted: in hard-wrapped text,
    # deleting them glues the last word of one line to the first word of the
    # next ("an\never-growing" -> "anever-growing").
    for old, new in ((".", ""), ("\n", " ")):
        text = text.replace(old, new)
    return text

In [None]:
# Gather, for every loaded paper, its id and the cleaned "<title>.<abstract>"
# string, then expose both as the column dictionary `df`.
paper_ids = []
paper_texts = []
for paper in tqdm(data):
    paper_ids.append(paper["id"])
    combined = ".".join((paper['title'], paper['abstract']))
    paper_texts.append(replace_all(combined))
df = {'id': paper_ids, 'text': paper_texts}

In [None]:
# The raw records have already been copied into `df`, so drop the list and ask
# the garbage collector to reclaim the memory right away.
del data
gc.collect()

In [None]:
df = pd.DataFrame(df, columns=['id', 'text'])
# Work on a random subset of at most 5000 papers.
# - random_state makes Restart & Run All select the same rows every time.
# - min(...) keeps the cell working when fewer than 5000 rows were loaded
#   (sample() raises if asked for more rows than exist without replacement).
df = df.sample(n=min(5000, len(df)), random_state=42)
df.head(20)

In [None]:
# Remove rows with missing values and renumber the index from 0, so that a row's
# label equals its position (get_recommendations looks ids up with df["id"][i]).
df = df.dropna().reset_index(drop=True)

df.head()

In [None]:
from sentence_transformers import SentenceTransformer, util

# No device is hard-coded: when none is given, SentenceTransformer checks
# whether a GPU can be used and otherwise runs on the CPU, so the notebook no
# longer fails on machines without CUDA.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Inputs longer than this many tokens are truncated by the model.
model.max_seq_length = 250
# encode() is documented for a string or a list of strings, so the column is
# converted to a plain list instead of passing the pandas Series (which only
# works while the index labels happen to equal the row positions).
# normalize_embeddings=True yields unit-length vectors, so a dot product between
# two of them is their cosine similarity.
embeddings = model.encode(
    df["text"].tolist(),
    show_progress_bar=True,
    normalize_embeddings=True,
    convert_to_numpy=True,
)

# df.head()

In [None]:
# Preview the frame that was just encoded (columns: id, text).
df.head()

In [None]:
# Dimensionality of one embedding vector (number of columns of the matrix).
embeddings.shape[1]

In [None]:
# Render every embedding vector as a single-line string: str() of the array
# with the line breaks numpy inserts removed.
embeds_cleaned = [
    str(vector).replace("\n", "")
    for _, vector in tqdm(enumerate(embeddings))
]

# Replace the raw text column with the stringified embeddings, then release the
# temporary list.
df.drop(columns=["text"], inplace=True)
df["embed"] = list(embeds_cleaned)
del embeds_cleaned
gc.collect()

In [None]:
# Preview the frame that will be saved (columns: id, embed).
df.head()

In [None]:
# Persist the id/embed frame next to the notebook. Note that the "embed" column
# holds the string form of each vector (built above with str()), not the numeric
# array itself.
df.to_pickle("./embeddings.pkl")

# Inference

In [None]:
# First sample abstract (the AUTO-SKLEARN AutoML abstract). The trailing
# backslashes continue the literal across source lines, so the value is one
# single-line string without embedded newlines.
data1 = "The success of machine learning in a broad range of applications has led to an \
ever-growing demand for machine learning systems that can be used off the shelf \
by non-experts. To be effective in practice, such systems need to automatically \
choose a good algorithm and feature preprocessing steps for a new dataset at hand, \
and also set their respective hyperparameters. Recent work has started to tackle this \
automated machine learning (AutoML) problem with the help of efficient Bayesian \
optimization methods. Building on this, we introduce a robust new AutoML system \
based on scikit-learn (using 15 classifiers, 14 feature preprocessing methods, and \
4 data preprocessing methods, giving rise to a structured hypothesis space with \
110 hyperparameters). This system, which we dub AUTO-SKLEARN, improves on \
existing AutoML methods by automatically taking into account past performance \
on similar datasets, and by constructing ensembles from the models evaluated \
during the optimization. Our system won the first phase of the ongoing ChaLearn \
AutoML challenge, and our comprehensive analysis on over 100 diverse datasets \
shows that it substantially outperforms the previous state of the art in AutoML. We \
also demonstrate the performance gains due to each of our contributions and derive \
insights into the effectiveness of the individual components of AUTO-SKLEARN."

# Second sample abstract (on standardizing NLP feature specification). Being
# triple-quoted, the value starts and ends with a newline character.
data2 = """
While designing machine learning based text analytics applications, often, NLP data scientists manually determine which NLP features to use based upon their knowledge and experience with related problems. This results in increased efforts during feature engineering process and renders automated reuse of features across semantically related applications inherently difficult. In this paper, we argue for standardization in feature specification by outlining structure of a language for specifying NLP features and present an approach for their reuse across applications to increase likelihood of identifying optimal features.
"""

In [None]:
# The simulated user history: one list holding the two sample abstracts.
inp_arr = [data1, data2]

In [None]:
def get_recommendations(user_hist, df_embeddings, top_k=20, ids=None):
    """Rank indexed papers by cosine similarity to a user's reading history.

    The history texts are encoded with the notebook-level ``model``, averaged
    into one profile vector, and compared against every row of
    ``df_embeddings``.

    Args:
        user_hist: A text, or a list of texts, the user has read.
        df_embeddings: 2-D array of paper embeddings, one row per paper.
        top_k: Number of best matches to return (default 20). ``None`` returns
            every paper.
        ids: Paper ids in the same row order as ``df_embeddings``. Defaults to
            the notebook-level ``df["id"]`` column.

    Returns:
        A list of ``(paper_id, score)`` tuples sorted by descending score, where
        each score is a scalar element of the tensor returned by
        ``util.cos_sim``.

    Raises:
        ValueError: If ``user_hist`` is empty, or if there are fewer ids than
            embedding rows.
    """
    # A bare string would be encoded to a single 1-D vector, and the mean over
    # axis 0 below would then collapse it to a scalar.
    if isinstance(user_hist, str):
        user_hist = [user_hist]
    if len(user_hist) == 0:
        raise ValueError("user_hist must contain at least one text")

    # No explicit device: encode() then runs on whatever device `model` is on.
    embeddings_inp = model.encode(user_hist, normalize_embeddings=True, convert_to_numpy=True)
    embeddings_inp = np.mean(embeddings_inp, axis=0)

    # The mean of unit vectors is generally shorter than unit length, so a plain
    # dot product is no longer a cosine. cos_sim normalizes both sides; the
    # ranking is the same as before, but the scores are true cosines.
    # reshape(-1) (instead of squeeze()) keeps a 1-D result even for one paper.
    cosine_scores = util.cos_sim(embeddings_inp, df_embeddings).reshape(-1)

    # Pair by position rather than by index label, so the lookup does not
    # depend on the frame's index having been reset to 0..n-1.
    if ids is None:
        ids = df["id"]
    ids = list(ids)
    if len(ids) < len(cosine_scores):
        raise ValueError(
            "got %d ids for %d embedding rows" % (len(ids), len(cosine_scores))
        )

    ranked = sorted(zip(ids, cosine_scores), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

In [None]:
# Recommend papers for the sample history against the corpus embeddings.
get_recommendations(user_hist=inp_arr, df_embeddings=embeddings)