<a href="https://colab.research.google.com/github/ChrisBagdon/Citation_Classification/blob/main/tools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from string import punctuation
from math import log

In [None]:
text = "By clustering with lowly aggressive close kin (King 1989a,b; Viblanc et al. 2010; Arnaud, Dobson & Murie 2012), breeding females may decrease the time/energy cost of maintaining territorial boundaries (Festa-Bianchet & Boag 1982; Murie & Harris 1988), which could ultimately lead to increases in net energy income (TA) or higher allocations in somatic or reproductive functions."

In [25]:
tokens = {}

In [24]:
def tokenize(text, tokens):
  cur_token =""
  for char in text:
    # Check if is apart of token
    if char.isalnum() or char == "'":
      cur_token += char
      continue
    # Check for space
    elif char == " ":
      if len(cur_token) > 0:
        if cur_token in tokens:
          tokens[cur_token] += 1
          cur_token = ""
          continue
        else:
          tokens[cur_token] = 1
          cur_token = ""
          continue
    # Check if punctuation
    else:
      if len(cur_token) > 0:
        if cur_token in tokens:
          tokens[cur_token] += 1
          cur_token = ""
        else:
          tokens[cur_token] = 1
          cur_token = ""
      if char in tokens:
        tokens[char] += 1
        continue
      else:
        tokens[char] = 1
        continue
  if len(cur_token) > 0:
    if cur_token in tokens:
          tokens[cur_token] += 1
          cur_token = ""
    else:
      tokens[cur_token] = 1
      cur_token = ""


In [4]:
tokenize(text, tokens)

NameError: ignored

In [None]:
text_2 = "Ophthalmic symptoms are rare manifestations of the intracranial arachnoid cyst, and include unilateral exophthalmos, visual field abnormality, decreased visual acuity and isolated palsies of the third, fourth and sixth cranial nerves [1–5]."

In [None]:
tokenize(text_2, tokens)

In [None]:
print(tokens.items())

dict_items([('By', 1), ('clustering', 1), ('with', 1), ('lowly', 1), ('aggressive', 1), ('close', 1), ('kin', 1), ('(', 3), ('King', 1), ('1989a', 1), (',', 8), ('b', 1), (';', 3), ('Viblanc', 1), ('et', 1), ('al', 1), ('.', 3), ('2010', 1), ('Arnaud', 1), ('Dobson', 1), ('&', 3), ('Murie', 2), ('2012', 1), (')', 3), ('breeding', 1), ('females', 1), ('may', 1), ('decrease', 1), ('the', 3), ('time', 1), ('/', 1), ('energy', 2), ('cost', 1), ('of', 3), ('maintaining', 1), ('territorial', 1), ('boundaries', 1), ('Festa', 1), ('-', 1), ('Bianchet', 1), ('Boag', 1), ('1982', 1), ('Harris', 1), ('1988', 1), ('which', 1), ('could', 1), ('ultimately', 1), ('lead', 1), ('to', 1), ('increases', 1), ('in', 2), ('net', 1), ('income', 1), ('TA', 1), ('or', 2), ('higher', 1), ('allocations', 1), ('somatic', 1), ('reproductive', 1), ('functions', 1), ('Ophthalmic', 1), ('symptoms', 1), ('are', 1), ('rare', 1), ('manifestations', 1), ('intracranial', 1), ('arachnoid', 1), ('cyst', 1), ('and', 3), ('in

In [124]:
class naive_bayes:
  def __init__(self):
      self.labels = {}
      self.doc_count = 0
      self.bin_size = 0
  
  def train(self, X, Y):
    for string, label in zip(X, Y):
      # Count instances of labels
      if label not in self.labels:
        self.labels[label] = {'count':1, 'terms':{}}
      else:
        self.labels[label]['count'] += 1
      # Count tokens from document
      tokenize(string, self.labels[label]['terms'])
      # Increase total document count
      self.doc_count += 1
    
    # Tally bin_size for smoothing
    terms_list = []
    for label, labels_dic in self.labels.items():
      terms_list = terms_list + list(labels_dic['terms'].keys())
    self.bin_size += len(set(terms_list))
    # Calculate class statistics
    for label, labels_dic in self.labels.items():
      # Calculate label prior probability
      self.labels[label]["prior"] = labels_dic['count'] / self.doc_count
      # Save total number of tokens in label + smoothing
      self.labels[label]["term_count"] = sum(labels_dic['terms'].values())+self.bin_size
      # Calculate probability of each token in label + smoothing
      terms = labels_dic['terms'].keys()
      #for term in terms:
        #print((labels_dic['terms'][term]+1) / labels_dic["term_count"])
      self.labels[label]["term_probs"] = {term:(labels_dic['terms'][term]+1) /
                                          labels_dic["term_count"]
                                           for term in terms}

  def predict(self, X, use_log=True):
    predictions = []

    for string in X:
      tokens = {}
      tokenize(string, tokens)
      probabilities = []
      for label, label_dic in self.labels.items():
        if use_log:
          prob = sum(log(label_dic["term_probs"][token])*count 
                        if token in label_dic['terms']
                        else log(1/label_dic['term_count'])*count 
                        for token, count in tokens.items()) \
                  + log(label_dic['prior'])
        else:
          prob = label_dic['prior']
          for token, count in tokens.items():
            if token in label_dic['terms'].keys():
              prob = prob*(label_dic["term_probs"][token]**count)
            else:
              prob = prob*((1/label_dic['term_count'])**count)
              #print(prob)
        probabilities.append((label, prob))
      predictions.append(max(probabilities, key=lambda item:item[1])[0])

    return predictions

      



In [93]:
X_train = ['Red Blue Blue',
         'Blue Blue Green',
         'Blue Yellow',
         'Big Small Blue']
Y_train = ['color', 'color', 'color', 'size']
X_test = ['Blue Blue Blue Big Small']

In [97]:
model = naive_bayes()
model.train(X_train, Y_train)

In [98]:
predictions = model.predict(X_test)

In [99]:
predictions

[('color', -8.10769031284391)]

In [100]:
model.labels

{'color': {'count': 3,
  'prior': 0.75,
  'term_count': 14,
  'term_probs': {'Blue': 0.42857142857142855,
   'Green': 0.14285714285714285,
   'Red': 0.14285714285714285,
   'Yellow': 0.14285714285714285},
  'terms': {'Blue': 5, 'Green': 1, 'Red': 1, 'Yellow': 1}},
 'size': {'count': 1,
  'prior': 0.25,
  'term_count': 9,
  'term_probs': {'Big': 0.2222222222222222,
   'Blue': 0.2222222222222222,
   'Small': 0.2222222222222222},
  'terms': {'Big': 1, 'Blue': 1, 'Small': 1}}}

7

In [118]:
import csv

with open('drive/MyDrive/train.tsv') as train_file:
  train_data = csv.reader(train_file, delimiter="\t")
  X_train, Y_train = [],[]
  for row in train_data:
    X_train.append(row[2])
    Y_train.append(row[3])

In [None]:
Y_train

In [None]:
X_train

In [121]:
with open('drive/MyDrive/dev.tsv') as dev_file:
  dev_data = csv.reader(dev_file, delimiter="\t")
  X_dev, Y_dev = [],[]
  for row in dev_data:
    X_dev.append(row[2])
    Y_dev.append(row[3])

In [125]:
model = naive_bayes()
model.train(X_train, Y_train)

In [None]:
model.labels

In [126]:
predictions = model.predict(X_dev)

In [132]:
import numpy as np
import pandas as pd
def evaluate(predictions, gold_standard):
    # Collect all unique labels from predictions and gold_std
    labels_set = set(predictions + gold_standard)
    labels = {}
    for i, label in enumerate(labels_set):
        labels[label] = i
    # Create confusion matrix
    confusion_matrix = np.zeros((len(labels_set),len(labels_set)))
    for pred, gold in zip(predictions, gold_standard):
        confusion_matrix[labels[pred]][labels[gold]] += 1
    labels_index = list(labels_set); labels_index.append('overall')
    columns = []
    # Create scores table
    scores = pd.DataFrame(np.zeros((len(labels_set), 3)))
    scores.columns = ['Precision', 'Recall', 'F1']
    overall_TP = 0
    # Calculate P, R, F1 and populate scores table
    for label in labels_set:
        i = labels[label]
        # Possible error case (Precision): denominator == 0; divide by 0
        if np.sum(confusion_matrix, axis=0)[i] == 0:
            scores['Precision'][i] = 0
        else:
            scores['Precision'][i] = confusion_matrix[i][i] / np.sum(confusion_matrix, axis=0)[i]
        # Possible error case (Recall): denominator == 0; divide by 0
        if np.sum(confusion_matrix, axis=1)[i] == 0:
            scores['Recall'][i] = 0
        else:
            scores['Recall'][i] = confusion_matrix[i][i] / np.sum(confusion_matrix, axis=1)[i]
        # Possible error case: P == 0 == R; divide by 0
        if scores['Precision'][i] == 0 and scores['Recall'][i] == 0:
            scores['F1'][i] = 0
        else:
            scores['F1'][i] = 2 * (scores['Precision'][i]*scores['Recall'][i]/(scores['Precision'][i]+scores['Recall'][i]))
        overall_TP += confusion_matrix[i][i]
    scores.loc[len(labels_set)] = [overall_TP / np.sum(confusion_matrix)] * 3
    scores.index = labels_index
    return (confusion_matrix, scores)

In [133]:
cf, scores = evaluate(predictions, Y_dev)

In [134]:
print(scores)

            Precision    Recall        F1
result       0.560976  0.821429  0.666667
method       0.725490  0.740000  0.732673
background   0.871747  0.805842  0.837500
overall      0.789301  0.789301  0.789301
