<div class="alert alert-block alert-warning">
<b>READ BEFORE PROCEEDING! </b> <br>Please download the model folder through this <a href="https://drive.google.com/drive/folders/1Rv618qDP-zg94laK8gIf_XFbqARNMmQI?usp=share_link">link</a>
</div>




In [None]:
import torch
import random
import json
import os

from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd
import torch.nn.functional as F

from random import randrange
# from tqdm.notebook import tqdm_no
from IPython.display import display, Markdown
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer
import torch

# add the following dependencies for streamlit backend
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader


In [None]:
# Instantiate a kaggle connection for the open database
with open('./kaggle.json') as f:
    file = json.load(f)

os.environ["KAGGLE_KEY"] = file['key']
os.environ["KAGGLE_USERNAME"] = 'jeploretizo'

In [3]:
!kaggle datasets download -d zzydipper/citation-v1

citation-v1.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
! unzip "./citation-v1.zip"

Archive:  ./citation-v1.zip
replace papers_test.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [None]:

MAX_SEQUENCE_LENGTH = 256
FILE_PATH ='./papers_test.json'
device = torch.device("mps")

In [11]:
def get_data():
    with open(FILE_PATH) as f:
        for line in f:
            yield line

In [12]:
year_limit = 1900

dataframe = {
    'id': [],
    'title': [],
    'year': [],
    'abstract': []

}

data = get_data()
for i, paper in enumerate(data):
    paper = json.loads(paper)
    try:
        date = paper['year']
        if  date > year_limit:
            # print('here')
            dataframe['title'].append(paper['title'])
            dataframe['year'].append(date)
            dataframe['abstract'].append(paper['abstract'])
            dataframe['id'].append(paper['id'])
    except: pass

In [13]:
main_df = pd.DataFrame(dataframe)


In [25]:

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [26]:
class PapersDataset(Dataset):
    def __init__(self, tokenizer, titles, abstracts, max_length=512):
        self.tokenizer = tokenizer
        self.inputs = []
        self.attn_masks = []
        for title, abstract in zip(titles, abstracts):
            # Concatenate title and abstract with a delimiter
            text = f"Title: {title} Abstract: {abstract}"
            encodings_dict = tokenizer(text, truncation=True, max_length=max_length, padding="max_length")
            self.inputs.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        return self.inputs[idx], self.attn_masks[idx]

titles = df['title'].values
abstracts = df['abstract'].values

dataset = PapersDataset(tokenizer, titles, abstracts)
dataloader = DataLoader(dataset, batch_size=3, shuffle=True)


In [27]:

model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [1]:


for epoch in range(50): 
    model.train()
    for inputs, masks in dataloader:
        inputs = inputs.to(device)
        masks = masks.to(device)
        outputs = model(inputs, attention_mask=masks, labels=inputs)
        loss = outputs[0]
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch} finished")

model.save_pretrained('./fine_tuned_gpt_model')
tokenizer.save_pretrained('./fine_tuned_gpt_model')



In [14]:
# add on the streamlit backend
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_gpt_model')
model = GPT2LMHeadModel.from_pretrained('./fine_tuned_gpt_model')
model.config.output_hidden_states = True  
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [15]:
def get_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_state = outputs.hidden_states[-1]
    paper_embedding = torch.mean(last_hidden_state, dim=1)
    return paper_embedding

def calculate_similarity(search_string, abstracts):
    search_embedding = get_embedding(search_string)
    scores = []
    for abstract in abstracts:
        abstract_embedding = get_embedding(abstract)
        similarity_score = torch.cosine_similarity(search_embedding, abstract_embedding, dim=1)
        scores.append(similarity_score.item())
    return scores

In [30]:
test_df =main_df[:50]

In [31]:
test_df['combined'] = test_df['title'] + ' ' + test_df['abstract']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['combined'] = test_df['title'] + ' ' + test_df['abstract']


In [32]:
abstracts = list(test_df['combined'].values)

In [42]:
# connect to front end search bar
search_string = "supply chain management"

In [43]:
# function to get the embedding
search_string_embedding = get_embedding(search_string)

In [44]:
scores = calculate_similarity(search_string, abstracts)

In [45]:
test_df['similarity_score'] = scores

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['similarity_score'] = scores


In [46]:
test_df.sort_values(by='similarity_score', ascending=False)

Unnamed: 0,id,title,year,abstract,combined,similarity_score
30,30,Modeling methodology b: distributed simulation...,2006,,Modeling methodology b: distributed simulation...,0.395091
22,22,Multimedia Directory 1997,1997,,Multimedia Directory 1997,0.386431
43,43,At Ease With Performa,1994,,At Ease With Performa,0.349378
2,2,Performance engineering in industry: current p...,2007,This panel session discusses performance engin...,Performance engineering in industry: current p...,0.326268
41,41,Microprogramming for the hardware engineer,1976,With the advent of the Am2901 four-bit micropr...,Microprogramming for the hardware engineer Wit...,0.32333
8,8,Type Graphics and MacIntosh,1987,,Type Graphics and MacIntosh,0.315487
37,37,A New Quadtree Decomposition Reconstruction Me...,1996,,A New Quadtree Decomposition Reconstruction Me...,0.307836
24,24,On product covering in 3-tier supply chain mod...,2006,The field of supply chain management has been ...,On product covering in 3-tier supply chain mod...,0.301731
3,3,"Dude, You Can Do It! How to Build a Sweeet PC",2005,Whether you're frustrated with current PC offe...,"Dude, You Can Do It! How to Build a Sweeet PC ...",0.287752
12,12,"Webbots, Spiders, and Screen Scrapers",2007,The Internet is bigger and better than what a ...,"Webbots, Spiders, and Screen Scrapers The Inte...",0.261013
