In [1]:
import torch
import random
import json
import os

import numpy as np
import pandas as pd
import torch.nn.functional as F

from random import randrange
# from tqdm.notebook import tqdm_no
from IPython.display import display, Markdown
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer


In [2]:
# Change to your corresponding key to download dataset from Kaggle
# https://www.kaggle.com/datasets/Cornell-University/arxiv

with open('./kaggle.json') as f:
    file = json.load(f)

os.environ["KAGGLE_KEY"] = file['key']
os.environ["KAGGLE_USERNAME"] = 'jeploretizo'

In [3]:
!kaggle datasets download -d Cornell-University/arxiv

arxiv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [7]:
! unzip "arxiv.zip"

Archive:  arxiv.zip
  inflating: arxiv-metadata-oai-snapshot.json  


In [4]:

MAX_SEQUENCE_LENGTH = 256
FILE_PATH ='./arxiv-metadata-oai-snapshot.json'
device = torch.device("mps")

In [5]:
def get_data():
    with open(FILE_PATH) as f:
        for line in f:
            yield line

In [6]:
year_limit = 2023

dataframe = {
    'id': [],
    'title': [],
    'year': [],
    'abstract': []

}

data = get_data()
for i, paper in enumerate(data):
    paper = json.loads(paper)
    try:
        date = int(paper['update_date'].split('-')[0])
        if date > year_limit:
            dataframe['title'].append(paper['title'])
            dataframe['year'].append(date)
            dataframe['abstract'].append(paper['abstract'])
            dataframe['id'].append(paper['id'])
    except: pass

In [7]:
main_df = pd.DataFrame(dataframe)

# Limit to first 500 for training purposes
df = main_df[:500]

In [25]:
import torch
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [26]:
class PapersDataset(Dataset):
    def __init__(self, tokenizer, titles, abstracts, max_length=512):
        self.tokenizer = tokenizer
        self.inputs = []
        self.attn_masks = []
        for title, abstract in zip(titles, abstracts):
            # Concatenate title and abstract with a delimiter
            text = f"Title: {title} Abstract: {abstract}"
            encodings_dict = tokenizer(text, truncation=True, max_length=max_length, padding="max_length")
            self.inputs.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        return self.inputs[idx], self.attn_masks[idx]

titles = df['title'].values
abstracts = df['abstract'].values

dataset = PapersDataset(tokenizer, titles, abstracts)
dataloader = DataLoader(dataset, batch_size=3, shuffle=True)


In [27]:

model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [1]:


for epoch in range(50): 
    model.train()
    for inputs, masks in dataloader:
        inputs = inputs.to(device)
        masks = masks.to(device)
        outputs = model(inputs, attention_mask=masks, labels=inputs)
        loss = outputs[0]
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch} finished")

model.save_pretrained('./my_fine_tuned_model')
tokenizer.save_pretrained('./my_fine_tuned_model')



In [12]:
tokenizer = GPT2Tokenizer.from_pretrained('./my_fine_tuned_model')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('./my_fine_tuned_model')

In [13]:
from transformers import pipeline

generator = pipeline('text-generation', model=model, tokenizer=tokenizer, config={'max_length':512})

In [18]:


def generate_recommendation(model, tokenizer, query, max_length=512):
  
    inputs = tokenizer.encode("Title: " + query + " Abstract:", return_tensors="pt")

    output_sequences = model.generate(
        input_ids=inputs,
        max_length=max_length,
        temperature=1.0,  # Adjust temperature for creativity
        top_k=50,         # Adjust top_k for diversity
        top_p=0.95,       # Adjust top_p for diversity
        repetition_penalty=1.2,  # Adjust repetition penalty to discourage repetition
        do_sample=True,   # Enable sampling to generate diverse recommendations
        num_return_sequences=5  # Number of recommendations to generate
    )
    
    # Decode the generated sequences to text
    recommendations = [tokenizer.decode(generated_sequence, skip_special_tokens=True) for generated_sequence in output_sequences]
    
    return recommendations



In [19]:
query = "machine learning"  # Example query
recommendations = generate_recommendation(model, tokenizer, query)

for rec in recommendations:
    print(rec)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Title: machine learning Abstract:  A detailed exposition of neural networks is presented. Each network constructs a training set, and
learns new representations such that any associated memory pool accumulates. The training sets are organized into layers or filters
whose primary aim lies within the network layer itself; in this paper we concentrate on those modules whose function it (i) Is to Understand Neural Networks by First-Generation
Deep Learning Techniques (DLG); (ii), (iii)-(iv)/(v)+ denotes an autonomous process within them which automatically executes further steps when
necessary ("incomplete" teaching methods like image labeling may be more appropriate). It then transforms the residual data from each filter into its own object so as never again does every single algorithm
backwardly miss information from all outputs created through these previously differentiated models take advantage
Title: machine learning Abstract:  The development of algorithms for classification and ana

In [20]:
def extract_titles_from_recommendations(recommendations):
    titles = []
    for rec in recommendations:
        # Split the recommendation into title and abstract parts
        parts = rec.split(" Abstract:")
        title_part = parts[0]  # This part contains "Title: [Generated Title]"
        
        # Further split to isolate the title
        title = title_part.replace("Title: ", "").strip()
        titles.append(title)
    
    return titles

In [21]:
recommendations = generate_recommendation(model, tokenizer, "machine learning")
titles = extract_titles_from_recommendations(recommendations)

for title in titles:
    print(title)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


machine learning
machine learning
machine learning
machine learning
machine learning


In [15]:
query = "Causal Inference"  # Your query here
recommendations = generate_recommendations(query)

for idx, rec in enumerate(recommendations, 1):
    print(f"Recommendation {idx}: {rec}\n")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Recommendation 1: Title: Causal Inference Abstract:   Theorem. - Theorem. - Theorem. - Let a family of finite groupoids be generated
of finite groupoids in a suitable Bayesian space, e.g. from the
morphism of the groupoid $A(k)$ to the holomorphic integer $x^q(k)$, and every
for each subgroupoid $q$ of $k the algorithm returns the results
in a straight forward, Bayesian fashion.


Recommendation 2: Title: Causal Inference Abstract:   In this article, we prove that there are three
directions to the convex graph from positive probability to Gaussian probability: a) the
decomposition of a convex graph by its points of interest; b) the identification
of a single point by its coefficients of the convex graph. In this spirit,
we give a general proof that the general probability formula ${ \cal{N}
\cap \limits_{{{\omega}(x_{i+1}\vee i+1})}{I}{{-}$ is correct for all
points on the convex graph if and only if their products (the sums
of sums) of reduced by the points of interest) are derived. A

In [40]:
recommendations

["Title: Causal Inference Abstract: The Feature Selection Library (FSLib) signifies a notable progression in\nmachine learning and data mining for MATLAB users, emphasizing the critical\nrole of Feature Selection (FS) in enhancing model efficiency and effectiveness\nby pinpointing essential features for specific tasks. FSLib's contributions are\ncomprehensive, tackling various FS challenges. It offers a wide array of FS\nalgorithms, including filter, embedded, and wrapper methods, allowing for efficient\nfeature selection. Filter fundamental features are: multi-selectivity, highlight actions,\nresponsive element, and wrapper methods that aggregate key performance indicators. Filter\nmethods prioritize intrinsic feature properties, embedded methods integrate\nselection within the training process, and wrapper methods evaluate features\nbased on model performance, catering to diverse modeling approaches. FSLib also\naddresses the curse of dimensionality by facilitating the selection of r