In [1]:
import json
import requests
import csv
import jsonlines
import sys
import numpy as np
import random
import math
import openai
import re
import pandas as pd
import os
from numpy.linalg import norm
import itertools
import string
import torch
from transformers import BertTokenizer, BertModel
import logging

In [2]:
def save_json(data, filepath=r'new_data.json'):
    """Serialize ``data`` as indented JSON and write it to ``filepath``.

    Args:
        data: Any object the ``json`` module can serialize.
        filepath: Destination path; an existing file is overwritten.

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot serialize.
            In that case ``filepath`` is left untouched.
    """
    # Encode first: opening with 'w' truncates the target, so serializing
    # inside the ``with`` block would wipe (or half-write) an existing file
    # whenever encoding fails part-way through.
    payload = json.dumps(data, indent=4)
    with open(filepath, 'w') as fp:
        fp.write(payload)

In [3]:
# Read the API key from the environment rather than pasting it into the
# notebook, so the secret never ends up in a saved or shared copy of this file.
# Falls back to the original empty string when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

In [4]:
# Load the raw dataset file. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the kernel's lifetime.
# NOTE(review): the backslash path separator only resolves to a sub-directory
# on Windows — confirm before running elsewhere.
with open(r"Data\qasper-test-v0.3.json") as fp:
    data = json.load(fp)

In [5]:
# Turn the raw {paper_id: paper_info} mapping into a list with one record per
# paper, keeping only the questions that end up with at least one answer.
clean_data = []
for paper_id, paper_info in data.items():
    doc = {'doc_id': paper_id, 'title': paper_info['title'], 'qas': []}
    for qa in paper_info['qas']:
        answers = []
        evidence = " "

        for ans in qa['answers']:
            answer = ans['answer']
            extractive_spans = answer['extractive_spans']

            # Answers with more than one span additionally get a single
            # space-joined string version of those spans.
            ext_spans_combined = " "
            if len(extractive_spans) > 1:
                ext_spans_combined += "".join(" " + es for es in extractive_spans)
            free_form_answer = answer['free_form_answer']

            # Grow the evidence text, skipping any passage that already occurs
            # (as a substring) in what has been collected for this question.
            for e in answer['evidence']:
                if e not in evidence:
                    evidence += " " + e

            # The span list takes precedence; the free-form answer is only
            # recorded when there are no extractive spans at all.
            if extractive_spans:
                if extractive_spans not in answers:
                    answers.append(extractive_spans)
            elif free_form_answer and free_form_answer not in answers:
                answers.append(free_form_answer)
            if ext_spans_combined != " " and ext_spans_combined not in answers:
                answers.append(ext_spans_combined)

        if not answers:
            continue
        # Shallow copy of the question with its answers replaced by the
        # collected list and the merged evidence text attached.
        doc['qas'].append({**qa, 'answers': answers, 'evidence': evidence})
    clean_data.append(doc)

In [6]:
# Write the cleaned per-paper records to disk.
save_json(data=clean_data, filepath=r'Data\final_data.json')

In [7]:
# Reload the cleaned records from disk; the context manager makes sure the
# file handle is closed once the JSON has been parsed.
with open(r"Data\final_data.json") as fp:
    clean_data = json.load(fp)

In [8]:
# Number of records in the cleaned dataset (displayed as the cell output).
len(clean_data)

416

In [9]:
# Work on the first 10 records only. The slice is a new list, but its elements
# are the same dict objects as in clean_data, so edits to them show up in both.
data_reduced = clean_data[:10]

In [10]:
def call_gpt(messages, model="gpt-3.5-turbo-0613"):
    """Send a chat conversation to ``openai.ChatCompletion`` and return the reply text.

    Args:
        messages: Chat messages, forwarded to the API unchanged.
        model: Name of the chat model to query.

    Returns:
        The content of the first returned choice with leading and trailing
        whitespace removed.
    """
    request = dict(
        model=model,
        messages=messages,
        n=1,              # ask for a single completion
        stop=None,
        temperature=0.0,
    )
    response = openai.ChatCompletion.create(**request)
    first_choice = response['choices'][0]
    return first_choice['message']['content'].strip()

Domain-expert, few-shot answers 

In [11]:
def get_answers_fs(content_example, contents, question):
    """Ask the chat model a question about a paper, using one worked example.

    The conversation sent to ``call_gpt`` is: the system instructions, a
    three-message few-shot example built around ``content_example``, then the
    user's paper contents and question.

    Args:
        content_example: Text shown to the model as the example's contents.
        contents: Text the model should answer from.
        question: The question to answer.

    Returns:
        Whatever ``call_gpt`` returns for the assembled conversation.
    """
    system_prompt = """
                You are an NLP domain expert question answering system.
                The user will provide you with the contents of a research paper.
                The user will also provide a question to be answered.
                Strictly answer in one sentence or less. 
                The answer should strictly be an exact extracted text from the contents.
                If an exact extracted text does not answer the question answer with a brief and precise answer derived from the contents.
                Reply in the format:
                "answer"
            """
    instructions = {"role": "system", "content": system_prompt}

    # Few-shot demonstration: example contents, example question, expected answer.
    demonstration = [
        {"role": "system", "name": "example_user", "content": f"contents:{content_example}"},
        {"role": "system", "name": "example_user", "content": "How big is the ANTISCAM dataset? "},
        {"role": "system", "name": "example_system", "content": "220 human-human dialogs"},
    ]

    # The actual request for this paper.
    request = [
        {"role": "user", "content": f"Contents of the research paper:{contents}"},
        {"role": "user", "content": f"Question related to the paper:{question}"},
    ]

    return call_gpt([instructions, *demonstration, *request])

In [12]:
# Few-shot example passage passed to get_answers_fs for every question. It
# never changes, so it is defined once here instead of on every loop iteration.
example_content = "To enrich available non-collaborative task datasets, we created a corpus of human-human anti-scam dialogs in order to learn human elicitation strategies. We chose a popular Amazon customer service scam scenario to collect dialogs between users and attackers who aim to collect users information. We posted a role-playing task on the Amazon Mechanical Turk platform and collected a typing conversation dataset named AntiScam. We collected 220 human-human dialogs. The average conversation length is 12.45 turns and the average utterance length is 11.13 words. Only 172 out of 220 users successfully identified their partner as an attacker, suggesting that the attackers are well trained and not too easily identifiable. We recruited two expert annotators who have linguistic training to annotate 3,044 sentences in 100 dialogs, achieving a 0.874 averaged weighted kappa value."

for i, datum in enumerate(data_reduced):
    print('{}/{}'.format(i, len(data_reduced)))
    for qa in datum['qas']:
        # The model is given the question's collected evidence text as the
        # "contents" to answer from.
        contents = qa['evidence']
        LLM_answer = get_answers_fs(example_content, contents, qa['question'])
        # Stored on the question dict itself, so data_reduced is modified in place.
        qa['GPT_Answer'] = LLM_answer
save_json(data_reduced, r'Data\llm_responses_final.json')

0/10
1/10
2/10
3/10
4/10
5/10
6/10
7/10
8/10
9/10


Prompt used for OpenAI embeddings

In [31]:
def merge_sentences(datum_sentences):
    """Flatten tokenized sentences into one space-separated paragraph.

    Args:
        datum_sentences: Iterable of sentences, each an iterable of word strings.

    Returns:
        A single string: words joined by spaces within each sentence, and the
        resulting sentences joined by spaces.
    """
    return " ".join(" ".join(words) for words in datum_sentences)

In [32]:
def strip_sentence(sentence):
    """Remove the leading "The article discussed [how]" phrase and all commas.

    Args:
        sentence: The sentence to clean.

    Returns:
        For a sentence starting with one of the known lead-in phrases: the
        remainder with surrounding whitespace stripped and commas removed.
        Any other sentence is reported with ``print("!!!")`` and returned
        unchanged (previously this path raised ``UnboundLocalError``).
    """
    # Longest phrase first, otherwise "... discussed how" would be matched by
    # the shorter "... discussed" and keep a stray "how".
    for prefix in ('The article discussed how', 'The article discussed'):
        if sentence.startswith(prefix):
            # Slice off only the leading phrase; str.replace would also delete
            # any later occurrence of the same words inside the sentence.
            remainder = sentence[len(prefix):].strip()
            return remainder.replace(",", "")
    print("!!!")
    return sentence

Helper Functions

In [34]:
def replace_punctuation_with_whitespace(input_string):
    """Return ``input_string`` with each ``string.punctuation`` character turned into a space.

    The replacement is one-for-one, so the result has the same length as the input.
    """
    # Translation table keyed by code point: every punctuation mark -> ' '.
    blank_out = dict.fromkeys(map(ord, string.punctuation), ' ')
    return input_string.translate(blank_out)

In [35]:
def get_top_n_values(dictionary, n):
    """Return a new dict holding the ``n`` entries of ``dictionary`` with the largest values.

    Entries are ordered from largest to smallest value; entries with equal
    values keep their original relative order. The input is not modified.
    """
    ranked = list(dictionary.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return {name: value for name, value in ranked[:n]}

In [36]:
def calculate_cosine(a,b):
    """Cosine similarity between two vectors.

    Args:
        a: First vector (anything ``np.dot`` and ``norm`` accept).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. When either vector has zero norm the
        similarity is undefined; ``0.0`` is returned in that case instead of
        the NaN (plus RuntimeWarning) a 0/0 division would produce.
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

Generating OpenAI embeddings for keywords

In [21]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` from ``openai.Embedding``.

    Args:
        text: The string to embed; newlines are replaced by spaces first.
        model: Name of the embedding model to use.

    Returns:
        The ``'embedding'`` field of the first item in the response's ``'data'`` list.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(model=model, input=[single_line])
    first_item = response['data'][0]
    return first_item['embedding']

In [26]:
# Start from the embeddings already stored on disk; the context manager closes
# the file handle once the JSON has been parsed.
with open(r'../IEEE_papers/Result/openAI_embeddings.json') as fp:
    embedds = json.load(fp)

# NOTE(review): main_chars is not defined in any visible cell of this notebook;
# it has to exist in the kernel before this cell can run — confirm where it
# comes from.
for index, i in enumerate(main_chars):
    print('{}/{}'.format(index, len(main_chars)))
    # Only call the embedding API for entries that are not cached yet.
    if i not in embedds:
        embedds[i] = get_embedding(i)

# NOTE(review): results are written to a different file than the one loaded
# above, and only after the whole loop has finished — confirm both are intended.
save_json(embedds, r'../IEEE_papers/Result/OpenAI_emb.json')

0/10542
1/10542
2/10542
3/10542
4/10542
5/10542
6/10542
7/10542
8/10542
9/10542
10/10542
11/10542
12/10542
13/10542
14/10542
15/10542
16/10542
17/10542
18/10542
19/10542
20/10542
21/10542
22/10542
23/10542
24/10542
25/10542
26/10542
27/10542
28/10542
29/10542
30/10542
31/10542
32/10542
33/10542
34/10542
35/10542
36/10542
37/10542
38/10542
39/10542
40/10542
41/10542
42/10542
43/10542
44/10542
45/10542
46/10542
47/10542
48/10542
49/10542
50/10542
51/10542
52/10542
53/10542
54/10542
55/10542
56/10542
57/10542
58/10542
59/10542
60/10542
61/10542
62/10542
63/10542
64/10542
65/10542
66/10542
67/10542
68/10542
69/10542
70/10542
71/10542
72/10542
73/10542
74/10542
75/10542
76/10542
77/10542
78/10542
79/10542
80/10542
81/10542
82/10542
83/10542
84/10542
85/10542
86/10542
87/10542
88/10542
89/10542
90/10542
91/10542
92/10542
93/10542
94/10542
95/10542
96/10542
97/10542
98/10542
99/10542
100/10542
101/10542
102/10542
103/10542
104/10542
105/10542
106/10542
107/10542
108/10542
109/10542
110/10542


Calculating TF-IDF scores for all author-defined keywords.

In [162]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string 
import itertools

# Score given to a keyword token that does not occur in the abstract vocabulary.
OOV_SCORE = 10

# Context managers close both input files as soon as they are parsed.
with open(r'../IEEE_papers/Events/articles_w_keywords.json') as fp:
    dat = json.load(fp)
with open(r'../IEEE_papers/Result/embed.json') as fp:
    embedds = json.load(fp)

# Collect every abstract and its (hyphen-free) author keyword string.
abstracts = []
keywords = []
for datum in dat:
    abstract = datum['Abstract'].strip()
    keyword = datum['AuthorKeywords'].strip()
    keyword = keyword.replace("-", " ")
    abstracts.append(abstract)
    keywords.append(keyword)
save_json(keywords, r'../IEEE_papers/Result/keys.json')


# Fit the vectorizer on the abstracts and map each vocabulary term to its IDF.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(abstracts)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on old versions that do not have it yet.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
scores = dict(zip(feature_names, tfidf.idf_))

# For every article, score each author keyword as the mean per-token score.
# NOTE: `keywords` is re-used below for the per-article keyword list; the list
# built above has already been saved at this point.
keyword_scores = {}
for datum in dat:
    abstract = datum['Abstract'].strip()
    keywords = datum['AuthorKeywords'].split(',')
    keys_score = {}
    for keys in keywords:
        keys = replace_punctuation_with_whitespace(keys)
        key = keys.split()
        score = 0.0
        cnt = 0
        for k in key:
            # Unknown tokens contribute OOV_SCORE to the running sum. The
            # previous `score = 10` discarded everything accumulated so far,
            # which made the average depend on the order of the tokens.
            score += scores.get(k.lower(), OOV_SCORE)
            cnt += 1
        if cnt != 0:
            score = score / cnt
        keys_score[keys] = score
    # Keyed by abstract text: articles with identical abstracts overwrite each other.
    keyword_scores[abstract] = keys_score
# Small sample (first 10 entries) for quick inspection.
test = dict(itertools.islice(keyword_scores.items(), 10))
save_json(keyword_scores, r'../IEEE_papers/Result/final_dict.json')

