## Claims Extraction
#### Based on the discourse and claim detection paper: https://arxiv.org/abs/1907.00962
#### GitHub repository of the model by the authors: https://github.com/titipata/detecting-scientific-claim

In [None]:
import pandas as pd
import os

# Run from inside the cloned repository: the relative paths used later
# ("./model_crf.tar.gz", "../Data/...", "../Output/...") are resolved from there,
# and presumably this is also what makes the repo-local modules importable.
# The guard makes this cell safe to re-run: a second unconditional chdir would
# raise FileNotFoundError because we are already inside the directory.
if os.path.basename(os.getcwd()) != "detecting-scientific-claim":
    os.chdir("detecting-scientific-claim")
import preprocess as pp

import spacy
from tqdm import tqdm

# Load the spaCy pipeline named 'en_core_web_sm'.
# NOTE(review): `nlp` is not referenced by any other cell visible in this notebook — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

import os
import sys
import json
from itertools import chain
import torch
from torch.nn import ModuleList, Linear
import torch.nn.functional as F
import numpy as np
import pandas as pd
from nltk import word_tokenize, sent_tokenize

from lxml import etree, html
import urllib

import flask
from flask import Flask, request
from gevent.pywsgi import WSGIServer

from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor
from allennlp.common.file_utils import cached_path
from allennlp.common.util import JsonDict
from allennlp.data import Instance
from allennlp.modules import Seq2VecEncoder, TimeDistributed, TextFieldEmbedder, ConditionalRandomField, FeedForward

from discourse import read_json
from discourse.dataset_readers import ClaimAnnotationReaderJSON, CrfPubmedRCTReader
from discourse.predictors import DiscourseClassifierPredictor


In [None]:
import gc
# Trigger a garbage-collection pass before the model archives are loaded in the following cells.
gc.collect()

In [None]:
class ClaimCrfPredictor(Predictor):
    """
    Predictor wrapper that passes a list of sentences to the dataset reader.

    Expects JSON input of the form ``{"sentences": [...]}``.
    """
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """Build an ``Instance`` from the ``'sentences'`` entry of ``json_dict``."""
        return self._dataset_reader.text_to_instance(sents=json_dict['sentences'])


!wget https://s3-us-west-2.amazonaws.com/pubmed-rct/model_crf.tar.gz ## the  CRF model weight

In [None]:
# The same archive is loaded twice, giving two predictors. `predictor`'s model is
# modified in the next cell (its `crf` and `label_projection_layer` are replaced),
# so presumably the second load exists to keep `discourse_predictor` on an
# unmodified copy of the discourse model.
archive = load_archive("./model_crf.tar.gz") ## available at github
predictor = Predictor.from_archive(archive, 'discourse_crf_predictor')
archive_ = load_archive("./model_crf.tar.gz")
discourse_predictor = Predictor.from_archive(archive_, 'discourse_crf_predictor')
# Collect garbage after loading the archives.
gc.collect()

In [5]:
# Weights for the claim model. Prefer the local copy; if it is missing (e.g. on a
# fresh checkout, since no cell downloads it), fall back to the published URL and
# let `cached_path` download and cache the file.
WEIGHT_URL = "https://s3-us-west-2.amazonaws.com/pubmed-rct/model_crf_tf.th"
WEIGHT_PATH = "./model_crf_tf.th"
if not os.path.exists(WEIGHT_PATH):
    WEIGHT_PATH = WEIGHT_URL

# Start from the model inside `predictor` and freeze all of its current parameters.
model = predictor._model
for param in model.parameters():
    param.requires_grad = False  ## not train weights

# Swap in a 2-class CRF and projection layer. These are new modules created after
# the freezing loop above, so that loop does not touch them.
EMBEDDING_DIM = 300
num_classes, constraints, include_start_end_transitions = 2, None, False
model.crf = ConditionalRandomField(num_classes, constraints,
                                   include_start_end_transitions=include_start_end_transitions)
model.label_projection_layer = TimeDistributed(Linear(2 * EMBEDDING_DIM, num_classes))

# Load the saved state dict into the modified model, mapping tensors to CPU.
model.load_state_dict(torch.load(cached_path(WEIGHT_PATH), map_location='cpu'))
reader = CrfPubmedRCTReader()
claim_predictor = ClaimCrfPredictor(model, dataset_reader=reader)

In [6]:
%%capture
# Sample abstract used to try the pipeline on a single document.
text_input="The present world is facing a devastating reality as drug abuse prevails in every corner of a society. The progress of a country is obstructed due to the excessive practice of taking drugs by the young generation. Like other countries, Bangladesh is also facing this dreadful situation. The multiple use of drug substances leads an individual to a sorrowful destination and for this reason, the natural behavior of human mind is disrupted. An addicted individual may regain his normal life by proper monitoring and treatment. The objective of this study is to analyze a mathematical model on the dynamics of drug abuse in the perspective of Bangladesh and reduce the harmful consequences with effective control policies using the idea of optimal control theory. The model has been solved analytically introducing a specific optimal goal. Numerical simulations have also been performed to review the behaviors of analytical findings. The analytical results have been verified with the numerical simulations. The analysis of this paper shows that it is possible to control drug addiction if there is less interaction among general people with the addicted individuals. Family based care, proper medical treatment, awareness and educational programs can be the most effective ways to reduce the adverse effects of drug addiction in a shortest possible time."
# NOTE: `%%capture` at the top of this cell captures stdout, so this print is not displayed.
print(text_input)

In [7]:
# text_input=df_agg[df_agg['section']=='abstract'].sentence.iloc[2]
#
# Wrap the sample text as an article record, then split its abstract into sentences.
article = dict(title='', abstract=text_input)

abstract = article.get('abstract', '')
sentences = sent_tokenize(abstract)
labels = []  # placeholder; reassigned from the discourse predictor's output further down

In [8]:
text_input  # display the sample abstract

'The present world is facing a devastating reality as drug abuse prevails in every corner of a society. The progress of a country is obstructed due to the excessive practice of taking drugs by the young generation. Like other countries, Bangladesh is also facing this dreadful situation. The multiple use of drug substances leads an individual to a sorrowful destination and for this reason, the natural behavior of human mind is disrupted. An addicted individual may regain his normal life by proper monitoring and treatment. The objective of this study is to analyze a mathematical model on the dynamics of drug abuse in the perspective of Bangladesh and reduce the harmful consequences with effective control policies using the idea of optimal control theory. The model has been solved analytically introducing a specific optimal goal. Numerical simulations have also been performed to review the behaviors of analytical findings. The analytical results have been verified with the numerical simul

In [10]:
%%capture
discourse_output = discourse_predictor.predict_json({'abstract': abstract})
labels = discourse_output['labels']
pred = claim_predictor.predict_json({'sentences': sentences})
best_paths = model.crf.viterbi_tags(torch.FloatTensor(pred['logits']).unsqueeze(0), 
                                    torch.LongTensor(pred['mask']).unsqueeze(0))
p_claims = 100 * np.array(best_paths[0][0])
p_claims = list(p_claims)

In [11]:
# Bundle the sentences, claim scores and discourse labels together with a few
# builtins and the article's own fields (title, abstract).
data = {
    'sents': sentences,
    'scores': p_claims,
    'labels': labels,
    'len': len,
    'enumerate': enumerate,
    'zip': zip,
    **article,
}

In [12]:
data['scores']  # one score per sentence: the decoded tag multiplied by 100

[0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 100]

In [13]:
# df=pd.read_csv("../Data/cord_titles_abstracts_conclusions.csv")
# Read the section-level table, drop the 'Unnamed: 0' column and keep a single
# row per (cord_uid, text) pair.
df = (
    pd.read_csv("../Data/section_text_with_drug_mentions_ann_200620.csv")
    .drop(columns='Unnamed: 0')
    .drop_duplicates(subset=["cord_uid", "text"])
    .reset_index(drop=True)
)

In [14]:
df.head()  # preview the first rows of the loaded table

Unnamed: 0,cord_uid,section,text,drug_terms_used
0,019rcbpg,Potential biological mechanisms of SARS-CoV-2 ...,there are indications in the literature of a n...,ifn-gamma
1,01es0zv4,Abstract,coronavirus disease 2019 has become a global p...,"chloroquine,lopinavir,remdesivir,ritonavir,azi..."
2,01es0zv4,CONCLUSION,covid-19 is a pandemic with high morbidity and...,"chloroquine,lopinavir,remdesivir,ritonavir,azi..."
3,01es0zv4,CONCLUSION:,covid-19 is a pandemic with high morbidity and...,"chloroquine,lopinavir,remdesivir,ritonavir,azi..."
4,01lyavy2,Abstract,"then, the really positive treatment could be t...",protein s


In [15]:
# Work on a renamed copy of `df` and replace each text with its list of sentences.
df_abs = (
    df.rename(columns={"text": "sentences"})
    .copy()
    .assign(sentences=lambda frame: frame["sentences"].apply(sent_tokenize))
)
# labels = []

In [16]:
df_abs.columns  # check the column names after the rename

Index(['cord_uid', 'section', 'sentences', 'drug_terms_used'], dtype='object')

In [18]:
%%capture 
## this would stop the output from getting printed
df_abs['pred'] = df_abs.sentences.apply(lambda x:claim_predictor.predict_json({'sentences': x}))
df_abs['best_paths'] = df_abs.pred.apply(lambda x: model.crf.viterbi_tags(torch.FloatTensor(x['logits']).unsqueeze(0), 
                                    torch.LongTensor(x['mask']).unsqueeze(0)))
df_abs['p_claims']=df_abs['best_paths'].apply(lambda x:100*np.array(x[0][0]))

In [19]:
def _select_claims(row):
    """Keep the sentences of a row whose matching `p_claims` entry is non-zero."""
    return np.extract(row['p_claims'], row['sentences'])


df_abs['claims'] = df_abs.apply(_select_claims, axis=1)
# df_abs['claims_id']=df_abs.apply(lambda x:np.extract(x['p_claims'],x['sentence_bid']), axis=1)

In [20]:
# df_abs.loc[df_abs.claims.str.len()==0,"claims"]=np.empty((len(df_abs.loc[df_abs.claims.str.len()==0]), 0)).tolist()
# Keep only the rows for which at least one claim sentence was extracted.
df_claims = df_abs[df_abs['claims'].str.len() != 0]

In [21]:
# One row per extracted claim (list entries become rows), de-duplicated and flagged.
df_updated = (
    df_claims[["cord_uid", "section", "claims"]]
    .explode("claims")
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(claim_flag=1)
)

In [22]:
# Attach the extracted claims to the original table; rows without a match get claim_flag 0.
# NOTE(review): the join key is (cord_uid, section), but `df` was de-duplicated on
# (cord_uid, text). If a key can occur with several texts, claims from one text are
# attached to every row sharing that key — confirm the key is unique.
df_merged = (
    df.merge(df_updated, how="left", on=["cord_uid", "section"])
    .assign(claim_flag=lambda merged: merged['claim_flag'].fillna(0))
)

In [23]:
# Write the merged table to disk. Create the output folder first so the write
# does not fail when "../Output" does not exist yet (e.g. on a fresh checkout).
os.makedirs("../Output", exist_ok=True)
df_merged.to_csv("../Output/claims_flag_each_cord_uid_200620.csv",index=False)