In [None]:
from transformers import pipeline
from typing import Any, List, Dict
import pandas as pd
import seaborn as sns

In [None]:
# Load the capture workbook, restricted to the three columns that the later
# cells write out and read back.
df = pd.read_excel(
    'CCG Output Capture.xlsx',
    sheet_name='Add Output Here',
    usecols=['type', 'title', 'abstract'],
    engine='openpyxl',
)

In [None]:
# Replace embedded newlines so every abstract is a single line of text.
df['abstract'] = df['abstract'].str.replace('\n', ' ')

# Write the corpus to CSV, leaving out any row that has a missing value.
(
    df
    .dropna()
    .to_csv('corpus_type_title_abstract.csv')
)

In [None]:
# Question-answering pipeline; answer_questions calls this object by name.
question_answerer = pipeline(
    "question-answering",
    model='distilbert-base-cased-distilled-squad',
)

In [None]:
# All abstracts from the exported corpus, as a plain Python list.
corpus = (
    pd.read_csv('corpus_type_title_abstract.csv', usecols=['abstract'])
    .loc[:, 'abstract']
    .to_list()
)

In [None]:
# Reload the exported corpus with the three text columns.
df = pd.read_csv(
    'corpus_type_title_abstract.csv',
    usecols=['type', 'title', 'abstract'],
)

# Keep rows whose type contains 'Jrnl' (presumably journal articles - confirm
# against the workbook's type codes) and collapse exact duplicate rows to
# their first occurrence. drop_duplicates leaves the rows unchanged when
# there is nothing to drop, so no separate duplicated() check is needed.
articles = (
    df.loc[df['type'].str.contains('Jrnl')]
    .copy()
    .drop_duplicates(keep='first')
)

# Parallel lists, in the same row order as `articles`.
article_abstracts = articles['abstract'].to_list()
article_titles = articles['title'].to_list()

In [None]:
# Alternative phrasings of the same prompt. Each abstract is asked all of
# them and only the highest-scoring answer is kept (see answer_questions).
QUESTIONS = [
    "What is the aim?",
    "What is the objective?",
    "What are the aims of the paper?",
    "What are the objectives of the article?",
]

In [None]:
def answer_questions(questions: List, context: str) -> Dict[str, Any]:
    """Pose every question against ``context`` and keep the best-scoring answer

    Arguments
    ---------
    questions: List
        The questions to pose; each one is sent separately to the
        module-level ``question_answerer``
    context: str
        The text to query

    Returns
    -------
    Dict[str, Any]
        The result, exactly as returned by ``question_answerer``, whose
        ``'score'`` is highest. If several results share the top score, the
        one belonging to the latest question in ``questions`` is returned.

    Raises
    ------
    IndexError
        If ``questions`` is empty, because there is no result to return
    """
    candidates = [
        question_answerer(question=prompt, context=context)
        for prompt in questions
    ]
    # sorted() is ascending and stable, so the last element is the top scorer
    # (and, on ties, the one that was asked last).
    ranked = sorted(candidates, key=lambda candidate: candidate['score'])
    return ranked[-1]

In [None]:
# One best-scoring result dict per abstract, for each of the three prompts:
#   answers - what the paper aims to do
#   hows    - how it does that
#   who     - who is affected
answers = []
hows = []
who = []
for abstract in article_abstracts:
    # Best answer across the generic aim/objective phrasings.
    answer = answer_questions(QUESTIONS, abstract)
    answers.append(answer)

    # `answer` is the whole result dict. Only its extracted text belongs in
    # the follow-up prompts; interpolating the dict itself would put its
    # repr (score, offsets, ...) into the question.
    aim_text = answer['answer']

    how_questions = [f"How does it {aim_text}?"]
    how_result = answer_questions(how_questions, abstract)
    hows.append(how_result)

    who_questions = ["Who will be affected?",
                     "Who will be influenced?",
                     f"Who will {aim_text} affect?"
                     ]
    who_result = answer_questions(who_questions, abstract)
    who.append(who_result)


In [None]:
# Attach the extracted text (and, for the aim and how prompts, the score)
# from each result list as new columns, then export the annotated table.
articles = articles.assign(
    answers=[hit['answer'] for hit in answers],
    score=[hit['score'] for hit in answers],
    hows=[hit['answer'] for hit in hows],
    how_score=[hit['score'] for hit in hows],
    who=[hit['answer'] for hit in who],
)
articles.to_excel('annotated_articles.xlsx')

In [None]:
# Preview the first five annotated rows.
articles.head(n=5)

# Classification

In [None]:
# Zero-shot classification pipeline. The checkpoint is named explicitly, as
# the question-answering pipeline does, so results do not depend on
# whichever default model the installed transformers version selects.
# NOTE(review): facebook/bart-large-mnli is believed to be the checkpoint
# transformers falls back to for this task - confirm against the installed
# version before relying on identical scores.
classify = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')

In [None]:
# Classify a single example abstract (position 20 in the list).
sequence = article_abstracts[20]

candidate_labels = [
    'scenarios',
    'modelling',
    'surveys',
    'literature',
    'decarbonisation',
    'development',
    'finance',
    'green growth',
    'coal',
    'renewables',
    'natural gas',
    'data',
    'cooking',
    "energy access",
]

# multi_label=True: more than one candidate label may apply to the abstract.
results = classify(sequence, candidate_labels, multi_label=True)

# Horizontal bars: one per label, bar length is the label's score.
sns.barplot(x=results['scores'], y=results['labels'], orient='h')

In [None]:
# Show the classified abstract, then every candidate label with its score.
print(sequence)
for label, score in zip(results['labels'], results['scores']):
    print("{}: {}".format(label, score))

In [None]:
# Total of the per-label scores from the classification above.
score_total = sum(results['scores'])
score_total