<a href="https://colab.research.google.com/github/ChrisBagdon/Citation_Classification/blob/main/tools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from string import punctuation
from math import log

In [2]:
text = "By clustering with lowly aggressive close kin (King 1989a,b; Viblanc et al. 2010; Arnaud, Dobson & Murie 2012), breeding females may decrease the time/energy cost of maintaining territorial boundaries (Festa-Bianchet & Boag 1982; Murie & Harris 1988), which could ultimately lead to increases in net energy income (TA) or higher allocations in somatic or reproductive functions."

In [3]:
tokens = {}

In [4]:
def tokenize(text, tokens):
  cur_token =""
  for char in text:
    # Check if is apart of token
    if char.isalnum() or char == "'":
      cur_token += char
      continue
    # Check for space
    elif char == " ":
      if len(cur_token) > 0:
        if cur_token in tokens:
          tokens[cur_token] += 1
          cur_token = ""
          continue
        else:
          tokens[cur_token] = 1
          cur_token = ""
          continue
    # Check if punctuation
    else:
      if len(cur_token) > 0:
        if cur_token in tokens:
          tokens[cur_token] += 1
          cur_token = ""
        else:
          tokens[cur_token] = 1
          cur_token = ""
      if char in tokens:
        tokens[char] += 1
        continue
      else:
        tokens[char] = 1
        continue
  if len(cur_token) > 0:
    if cur_token in tokens:
          tokens[cur_token] += 1
          cur_token = ""
    else:
      tokens[cur_token] = 1
      cur_token = ""


In [5]:
tokenize(text, tokens)

In [6]:
text_2 = "Ophthalmic symptoms are rare manifestations of the intracranial arachnoid cyst, and include unilateral exophthalmos, visual field abnormality, decreased visual acuity and isolated palsies of the third, fourth and sixth cranial nerves [1–5]."

In [7]:
tokenize(text_2, tokens)

In [8]:
print(tokens.items())

dict_items([('By', 1), ('clustering', 1), ('with', 1), ('lowly', 1), ('aggressive', 1), ('close', 1), ('kin', 1), ('(', 3), ('King', 1), ('1989a', 1), (',', 8), ('b', 1), (';', 3), ('Viblanc', 1), ('et', 1), ('al', 1), ('.', 3), ('2010', 1), ('Arnaud', 1), ('Dobson', 1), ('&', 3), ('Murie', 2), ('2012', 1), (')', 3), ('breeding', 1), ('females', 1), ('may', 1), ('decrease', 1), ('the', 3), ('time', 1), ('/', 1), ('energy', 2), ('cost', 1), ('of', 3), ('maintaining', 1), ('territorial', 1), ('boundaries', 1), ('Festa', 1), ('-', 1), ('Bianchet', 1), ('Boag', 1), ('1982', 1), ('Harris', 1), ('1988', 1), ('which', 1), ('could', 1), ('ultimately', 1), ('lead', 1), ('to', 1), ('increases', 1), ('in', 2), ('net', 1), ('income', 1), ('TA', 1), ('or', 2), ('higher', 1), ('allocations', 1), ('somatic', 1), ('reproductive', 1), ('functions', 1), ('Ophthalmic', 1), ('symptoms', 1), ('are', 1), ('rare', 1), ('manifestations', 1), ('intracranial', 1), ('arachnoid', 1), ('cyst', 1), ('and', 3), ('in

In [9]:
class naive_bayes:
  def __init__(self):
      self.labels = {}
      self.doc_count = 0
      self.bin_size = 0
  
  def train(self, X, Y):
    for string, label in zip(X, Y):
      # Count instances of labels
      if label not in self.labels:
        self.labels[label] = {'count':1, 'terms':{}}
      else:
        self.labels[label]['count'] += 1
      # Count tokens from document
      tokenize(string, self.labels[label]['terms'])
      # Increase total document count
      self.doc_count += 1
    
    # Tally bin_size for smoothing
    terms_list = []
    for label, labels_dic in self.labels.items():
      terms_list = terms_list + list(labels_dic['terms'].keys())
    self.bin_size += len(set(terms_list))
    # Calculate class statistics
    for label, labels_dic in self.labels.items():
      # Calculate label prior probability
      self.labels[label]["prior"] = labels_dic['count'] / self.doc_count
      # Save total number of tokens in label + smoothing
      self.labels[label]["term_count"] = sum(labels_dic['terms'].values())+self.bin_size
      # Calculate probability of each token in label + smoothing
      terms = labels_dic['terms'].keys()
      #for term in terms:
        #print((labels_dic['terms'][term]+1) / labels_dic["term_count"])
      self.labels[label]["term_probs"] = {term:(labels_dic['terms'][term]+1) /
                                          labels_dic["term_count"]
                                           for term in terms}

  def predict(self, X, use_log=True):
    predictions = []

    for string in X:
      tokens = {}
      tokenize(string, tokens)
      probabilities = []
      for label, label_dic in self.labels.items():
        if use_log:
          prob = sum(log(label_dic["term_probs"][token])*count 
                        if token in label_dic['terms']
                        else log(1/label_dic['term_count'])*count 
                        for token, count in tokens.items()) \
                  + log(label_dic['prior'])
        else:
          prob = label_dic['prior']
          for token, count in tokens.items():
            if token in label_dic['terms'].keys():
              prob = prob*(label_dic["term_probs"][token]**count)
            else:
              prob = prob*((1/label_dic['term_count'])**count)
              #print(prob)
        probabilities.append((label, prob))
      predictions.append(max(probabilities, key=lambda item:item[1])[0])

    return predictions

      



In [10]:
X_train = ['Red Blue Blue',
         'Blue Blue Green',
         'Blue Yellow',
         'Big Small Blue']
Y_train = ['color', 'color', 'color', 'size']
X_test = ['Blue Blue Blue Big Small']

In [11]:
model = naive_bayes()
model.train(X_train, Y_train)

In [12]:
predictions = model.predict(X_test)

In [13]:
predictions

['color']

In [14]:
model.labels

{'color': {'count': 3,
  'terms': {'Red': 1, 'Blue': 5, 'Green': 1, 'Yellow': 1},
  'prior': 0.75,
  'term_count': 14,
  'term_probs': {'Red': 0.14285714285714285,
   'Blue': 0.42857142857142855,
   'Green': 0.14285714285714285,
   'Yellow': 0.14285714285714285}},
 'size': {'count': 1,
  'terms': {'Big': 1, 'Small': 1, 'Blue': 1},
  'prior': 0.25,
  'term_count': 9,
  'term_probs': {'Big': 0.2222222222222222,
   'Small': 0.2222222222222222,
   'Blue': 0.2222222222222222}}}

In [15]:
import csv

with open('scicite/tsv/train.tsv') as train_file:
  train_data = csv.reader(train_file, delimiter="\t")
  X_train, Y_train = [],[]
  for row in train_data:
    X_train.append(row[2])
    Y_train.append(row[3])

In [16]:
Y_train

['background',
 'background',
 'background',
 'background',
 'background',
 'background',
 'background',
 'background',
 'method',
 'background',
 'background',
 'background',
 'background',
 'method',
 'background',
 'background',
 'background',
 'result',
 'background',
 'method',
 'background',
 'background',
 'result',
 'background',
 'background',
 'background',
 'background',
 'method',
 'method',
 'background',
 'background',
 'background',
 'background',
 'method',
 'result',
 'background',
 'background',
 'result',
 'result',
 'result',
 'background',
 'background',
 'background',
 'background',
 'background',
 'background',
 'background',
 'background',
 'method',
 'background',
 'background',
 'background',
 'method',
 'background',
 'method',
 'background',
 'background',
 'background',
 'background',
 'background',
 'background',
 'result',
 'background',
 'background',
 'method',
 'background',
 'background',
 'background',
 'background',
 'method',
 'background',
 'backg

In [17]:
X_train

['However, how frataxin interacts with the Fe-S cluster biosynthesis components remains unclear as direct one-to-one interactions with each component were reported (IscS [12,22], IscU/Isu1 [6,11,16] or ISD11/Isd11 [14,15]).',
 'In the study by Hickey et al. (2012), spikes were sampled from the field at the point of physiological\\nrobinson et al.: genomic regions influencing root traits in barley 11 of 13\\nmaturity, dried, grain threshed by hand, and stored at −20C to preserve grain dormancy before germination testing.',
 'The drug also reduces catecholamine secretion, thereby reducing stress and leading to a modest (10-20%) reduction in heart rate and blood pressure, which may be particularly beneficial in patients with cardiovascular disease.(7) Unlike midazolam, dexmedetomidine does not affect the ventilatory response to carbon dioxide.',
 'By clustering with lowly aggressive close kin (King 1989a,b; Viblanc et al. 2010; Arnaud, Dobson & Murie 2012), breeding females may decrease t

In [18]:
with open('scicite/tsv/dev.tsv') as dev_file:
  dev_data = csv.reader(dev_file, delimiter="\t")
  X_dev, Y_dev = [],[]
  for row in dev_data:
    X_dev.append(row[2])
    Y_dev.append(row[3])

In [19]:
model = naive_bayes()
model.train(X_train, Y_train)

In [20]:
model.labels

{'background': {'count': 4840,
  'terms': {'However': 112,
   ',': 14305,
   'how': 30,
   'frataxin': 4,
   'interacts': 16,
   'with': 1118,
   'the': 6206,
   'Fe': 14,
   '-': 3404,
   'S': 49,
   'cluster': 16,
   'biosynthesis': 12,
   'components': 42,
   'remains': 21,
   'unclear': 11,
   'as': 1081,
   'direct': 38,
   'one': 148,
   'to': 2722,
   'interactions': 35,
   'each': 66,
   'component': 19,
   'were': 322,
   'reported': 234,
   '(': 6108,
   'IscS': 3,
   '[': 2406,
   '12': 191,
   '22': 114,
   ']': 2396,
   'IscU': 2,
   '/': 423,
   'Isu1': 1,
   '6': 234,
   '11': 203,
   '16': 152,
   'or': 654,
   'ISD11': 1,
   'Isd11': 2,
   '14': 160,
   '15': 160,
   ')': 6128,
   '.': 10488,
   'In': 335,
   'study': 184,
   'by': 924,
   'Hickey': 4,
   'et': 5107,
   'al': 5107,
   '2012': 268,
   'spikes': 2,
   'sampled': 3,
   'from': 642,
   'field': 37,
   'at': 365,
   'point': 44,
   'of': 5290,
   'physiological': 21,
   '\\': 580,
   'nrobinson': 1,
   ':':

In [21]:
predictions = model.predict(X_dev)

In [22]:
import numpy as np
import pandas as pd
def evaluate(predictions, gold_standard):
    # Collect all unique labels from predictions and gold_std
    labels_set = set(predictions + gold_standard)
    labels = {}
    for i, label in enumerate(labels_set):
        labels[label] = i
    # Create confusion matrix
    confusion_matrix = np.zeros((len(labels_set),len(labels_set)))
    for pred, gold in zip(predictions, gold_standard):
        confusion_matrix[labels[pred]][labels[gold]] += 1
    labels_index = list(labels_set); labels_index.append('overall')
    columns = []
    # Create scores table
    scores = pd.DataFrame(np.zeros((len(labels_set), 3)))
    scores.columns = ['Precision', 'Recall', 'F1']
    overall_TP = 0
    # Calculate P, R, F1 and populate scores table
    for label in labels_set:
        i = labels[label]
        scores['Precision'][i] = confusion_matrix[i][i] / np.sum(confusion_matrix, axis=0)[i]
        scores['Recall'][i] = confusion_matrix[i][i] / np.sum(confusion_matrix, axis=1)[i]
        # Possible error case: P == 0 == R; divide by 0
        if scores['Precision'][i] == 0 and scores['Recall'][i] == 0:
            scores['F1'][i] = 0
        else:
            scores['F1'][i] = 2 * (scores['Precision'][i]*scores['Recall'][i]/(scores['Precision'][i]+scores['Recall'][i]))
        overall_TP += confusion_matrix[i][i]
    scores.loc[len(labels_set)] = [overall_TP / np.sum(confusion_matrix)] * 3
    scores.index = labels_index
    return (confusion_matrix, scores)

In [23]:
cf, scores = evaluate(predictions, Y_dev)

In [24]:
print(scores)

            Precision    Recall        F1
background   0.871747  0.805842  0.837500
result       0.560976  0.821429  0.666667
method       0.725490  0.740000  0.732673
overall      0.789301  0.789301  0.789301


In [25]:
background = pd.DataFrame(model.labels['background']['term_probs'].items()).sort_values(by=[1], ascending=False).reset_index()
result = pd.DataFrame(model.labels['result']['term_probs'].items()).sort_values(by=[1], ascending=False).reset_index()
method = pd.DataFrame(model.labels['method']['term_probs'].items()).sort_values(by=[1], ascending=False).reset_index()

term_prob = pd.DataFrame([background[0][:50],background[1][:50],result[0][:50],result[1][:50],method[0][:50],method[1][:50],]).transpose()

In [26]:
term_prob.columns = ['background','b prob', 'result', 'result prob', 'method', 'method prob']
term_prob.head(10)

Unnamed: 0,background,b prob,result,result prob,method,method prob
0,",",0.056401,",",0.034595,",",0.039839
1,.,0.041353,.,0.028823,.,0.030278
2,the,0.024471,the,0.018712,the,0.027081
3,),0.024163,of,0.016628,(,0.022378
4,(,0.024085,in,0.01499,),0.022297
5,and,0.022898,(,0.014887,and,0.01874
6,of,0.02086,),0.014532,of,0.017057
7,et,0.020138,and,0.014291,-,0.01483
8,al,0.020138,et,0.013181,[,0.011442
9,;,0.01603,al,0.013169,],0.011391


In [27]:
# Parsing the jsonl files
import jsonlines

train_jsonl_list = []

with jsonlines.open('scicite/jsonl/train.jsonl') as f:
    for line in f.iter():
        train_jsonl_list.append(line)

In [28]:
### Putting the jsonl files into a pandas dataframe
train_jsonl = pd.DataFrame(train_jsonl_list)
train_jsonl.head()

Unnamed: 0,source,citeEnd,sectionName,citeStart,string,label,label_confidence,citingPaperId,citedPaperId,isKeyCitation,id,unique_id,excerpt_index,label2,label2_confidence
0,explicit,175.0,Introduction,168.0,"However, how frataxin interacts with the Fe-S ...",background,1.0,1872080baa7d30ec8fb87be9a65358cd3a7fb649,894be9b4ea46a5c422e81ef3c241072d4c73fdc0,True,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,11,,
1,explicit,36.0,Novel Quantitative Trait Loci for Seminal Root...,16.0,"In the study by Hickey et al. (2012), spikes w...",background,1.0,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b,b6642e19efb8db5623b3cc4eef1c5822a6151107,True,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,2,,
2,explicit,228.0,Introduction,225.0,"The drug also reduces catecholamine secretion,...",background,1.0,9cdf605beb1aa1078f235c4332b3024daa8b31dc,4e6a17fb8d7a3cada601d942e22eb5da6d01adbd,False,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,0,,
3,explicit,110.0,Discussion,46.0,By clustering with lowly aggressive close kin ...,background,1.0,d9f3207db0c79a3b154f3875c9760cc6b056904b,2cc6ff899bf17666ad35893524a4d61624555ed7,False,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,3,,
4,explicit,239.0,Discussion,234.0,Ophthalmic symptoms are rare manifestations of...,background,1.0,88b86556857f4374842d2af2e359576806239175,a5bb0ff1a026944d2a47a155462959af2b8505a8,False,88b86556857f4374842d2af2e359576806239175>a5bb0...,88b86556857f4374842d2af2e359576806239175>a5bb0...,2,,


# Exploration of where the baseline makes mistakes

In [45]:
cf

array([[469.,  47.,  66.],
       [ 11.,  69.,   4.],
       [ 58.,   7., 185.]])

In [33]:
dev_tsv = pd.read_csv('scicite/tsv/dev.tsv', sep='\t', 
                      names=["citingPaperID", "source", "string", "true_label"])
dev_tsv.head()

Unnamed: 0,citingPaperID,source,string,true_label
0,8f1fbe460a901d994e9b81d69f77bfbe32719f4c>5e413...,explicit,These results are in contrast with the finding...,result
1,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,explicit,…nest burrows in close proximity of one anothe...,background
2,226f798d30e5523c5b9deafb826ddb04d47c11dc>None_0,explicit,This is clearly in contrast to the results of ...,result
3,59dba7cd80edcce831d20b35f9eb597bba290154>27399...,explicit,"…in a subset of alcoholics (Chen et al., 2004;...",background
4,0640f6e098a9d241cd680473e8705357ae101e04>e33da...,explicit,This result is consistent with the conclusions...,result


In [35]:
dev_tsv["prediction_label"] = predictions
dev_tsv.head()

Unnamed: 0,citingPaperID,source,string,true_label,prediction_label
0,8f1fbe460a901d994e9b81d69f77bfbe32719f4c>5e413...,explicit,These results are in contrast with the finding...,result,result
1,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,explicit,…nest burrows in close proximity of one anothe...,background,background
2,226f798d30e5523c5b9deafb826ddb04d47c11dc>None_0,explicit,This is clearly in contrast to the results of ...,result,result
3,59dba7cd80edcce831d20b35f9eb597bba290154>27399...,explicit,"…in a subset of alcoholics (Chen et al., 2004;...",background,background
4,0640f6e098a9d241cd680473e8705357ae101e04>e33da...,explicit,This result is consistent with the conclusions...,result,result


In [57]:
dev_background = dev_tsv.copy()
dev_background = dev_background[dev_background['true_label'] == "background"]
dev_background = dev_background[dev_background['prediction_label'] != "background"]
dev_background.head()

Unnamed: 0,citingPaperID,source,string,true_label,prediction_label
5,bf4f0f114016f1fd0afe1070cb98715d72278ff3>54445...,explicit,Another examples of twisted bicrossproduct Hop...,background,method
23,51ba9f12ed2558d76fb13b0332119ddc5c912071>3b7a1...,explicit,The NA-ACCORD began collecting data from multi...,background,method
24,132aee4607633866fe73bd7f50e01dd57460fcf8>a016a...,explicit,In this section we recall the standard quantiz...,background,method
28,54925222fd7e91bdede6ae219fe1adcedd3a27d3>7f6d5...,explicit,"The temperament domains are Novelty Seeking, H...",background,method
40,82fe79e6afcbd138440ff430f3d421ca57cd1c75>700a5...,explicit,Under the assumption that oxy-hemoglobin (HbO2...,background,method


In [69]:
list(dev_background[dev_background['prediction_label'] == "method"]['string'])

['Another examples of twisted bicrossproduct Hopf algebra are the null-plane quantized Poincaré algebra [6] and extended jordanian deformations of U(sl(N)).',
 'The NA-ACCORD began collecting data from multi- and singlesite interval and clinical cohorts in 2006.[46] The Institute of Medicine of the National Academies (IOM) has promulgated the NA-ACCORD, due to its size and demographic similarity with PLWHA in the U.',
 'In this section we recall the standard quantization of a scalar field used in LQG [8, 9, 10].',
 'The temperament domains are Novelty Seeking, Harm Avoidance, Reward Dependence, and Persistence, and the character domains are Self-Directedness, Cooperation, and Self-Transcendence.(101) Reliability of the Czech translation of the method is also satisfactory.',
 'Under the assumption that oxy-hemoglobin (HbO2), deoxy-hemoglobin (Hb) and water are the major absorbers in the probed tissue volume at the two wavelengths considered, we calculated tissue concentrations of HbO2 a

In [58]:
dev_result = dev_tsv.copy()
dev_result = dev_result[dev_result['true_label'] == "result"]
dev_result = dev_result[dev_result['prediction_label'] != "result"]
dev_result.head()

Unnamed: 0,citingPaperID,source,string,true_label,prediction_label
6,952660f4a305ebc8bc46ea23dd1ea41464d6681f>6b122...,explicit,Our results confirm the other studies suggesti...,result,background
11,e28904703eb7a8052f7b26618bfcaf1b64311efa>39d74...,explicit,This cell binding pattern was identical to tha...,result,background
13,55ba23b8df7dcdb3ac149a0038cf1cc2dc1541e7>cb6b3...,explicit,"Nevertheless, a response-rate of 41% was reach...",result,background
16,6824c1a5072c81ca8a55b7aa15903d76fc2b12d5>cb521...,explicit,"At present, whether the contrasting results ob...",result,background
34,00eea3644491b0fcf74c49aa211fa2a24a6e3d3e>2271b...,explicit,"…tick attachment (60% and 65% tick attachment,...",result,background


In [70]:
list(dev_result[dev_result['prediction_label'] == "method"]['string'])

['1 was refined by Bates and Mansour [1] and by Stanley and Wilf [14] who proved results that are equivalent to the following.',
 'A higher accuracy was found when the benchmark of the sequence-based prediction excluded two residues for each N- and C- end of the Q3 regions, resulting in RMSD values that are similar to those obtained with SPARTA+ (Shen and Bax 2010) and better than those associated with Camshift (Kohlhoff et\xa0al.',
 'Although the internally quenched substrate Abz-(R)4SAGnYamide was shown previously to be efficiently cleaved by DEN NS2B(H)-NS3pro, (kcat/Km: 11087 M 21 s(21)) [22], binding affinity and cleavage efficiency (kcat/Km: 19 M 21 s(21)) of this peptide was substantially lower (approx.',
 'The accuracy of the MBIM technique was validated in a previous study (Krosshaug and Bahr, 2005).',
 'This is in line with earlier reports of optimal threshold procedures overestimating the true performance [3,4,6,8,9].',
 '…the best-performing dimensionality on the clustering

In [59]:
dev_method = dev_tsv.copy()
dev_method = dev_method[dev_method['true_label'] == "method"]
dev_method = dev_method[dev_method['prediction_label'] != "method"]
dev_method.head()

Unnamed: 0,citingPaperID,source,string,true_label,prediction_label
29,204b237317b0b25a015904e45cab010bcaf87fa9>427b9...,explicit,"As opposed to earlier studies (Cook et al., 20...",method,background
33,3ccc9c6b2da5e6c8b724953c910d2539bba339aa>00bc4...,explicit,"As a proxy of fruit and vegetable intake, we u...",method,background
71,7fd57efb068707fda773060caf597a2fda33c595>81ab4...,explicit,"Also, procedures followed for assessment of LV...",method,background
99,7656852ff2a275e2bc318aec0604225e6b4d49df>a5d32...,explicit,"Furthermore, ChIP-seq data of histone modifica...",method,background
102,3611fa9de1d9cd97a8dd2b576c6faa036900b859>86918...,explicit,…that used insect larvae (most often caterpill...,method,background
