<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/ENEE439d-TEXTML/TextML/blob/master/input.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/ENEE439d-TEXTML/TextML/blob/master/input.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

# Imports

In [81]:
!pip install -q -U "tensorflow-text==2.8.*" # A dependency of the preprocessing for BERT inputs
!pip install -q tf-models-official==2.7.0 # For adamW
!pip install focal-loss # focal loss implmnetion for tf
!pip install pdfminer.six #pdf text extratction



In [82]:
import pandas as pd #basic imports
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from keras import layers
import re

In [83]:
from sklearn.model_selection import train_test_split # https://www.tensorflow.org/text/tutorials/classify_text_with_bert
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization, bert  # to create AdamW optimizer
from focal_loss import SparseCategoricalFocalLoss
import official.nlp.bert.tokenization

In [84]:
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

In [85]:
import seaborn as sns; sns.set_theme()

# Data read in

In [86]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [87]:
filepath = '/content/drive/MyDrive/Text-ML/full_sentiment_dataset.csv' #'data.csv'
df= pd.read_csv(filepath)
df1=df.drop(['no','paper','context_a','context_b'],axis=1)
df1.head()

Unnamed: 0,cited_paper,label,text
0,A00-2024,0,We analyzed a set of articles and identified s...
1,A00-2024,0,Table 3: Example compressions Compression AvgL...
2,A00-2024,0,5.3 Related works and discussion Our two-step ...
3,A00-2024,0,(1999) proposed a summarization system based o...
4,A00-2024,0,We found that the deletion of lead parts did n...


In [88]:
df1['label'].value_counts()

 0    7627
 1     829
-1     280
Name: label, dtype: int64

# Filtering by regex

In [89]:
context=df1['text']

re1= "\(((([A-Za-z]+ *)+(, \d+))+(; )*)+\)" # matches author and author, year
re_year=",? \(?\d{4}\)?" # match , {4 digits} which may be wrapped in () 
re_and="(and|&) "
re_auth="((\w+, )*(\w+ )+)"
re_et= re_auth+"et al\. ?"+re_year # matches author et al. , year
re_2a= re_auth+"("+re_and+"((\w+ *))?)?"+re_year # matches author and author, year
re_sep="((; )|( "+re_and+"))*"# match the '; ' gap or ' and ' gap
re_para_year="\(\d{4}\)"
re_in_brack="\[*\]"
re_apa =re_in_brack+"|"+re_para_year+"|"+"\(?("+"(\(?"+re_2a+"|"+re_et+"\)?)"+re_sep+")+"
print(re_apa)

\[*\]|\(\d{4}\)|\(?((\(?((\w+, )*(\w+ )+)((and|&) ((\w+ *))?)?,? \(?\d{4}\)?|((\w+, )*(\w+ )+)et al\. ?,? \(?\d{4}\)?\)?)((; )|( (and|&) ))*)+


In [90]:
def remove_matches(text,regex=re_apa):
  text1=text
  rem_len=0
  pattern= re.compile(regex)
  while True:
    matches=pattern.search(text1)
    #print(matches)
    if matches == None:
      break

    spn=matches.span()
    text1=text1[0:spn[0]]+text1[spn[1]:-1]
    cit_len=spn[1]-spn[0]
    rem_len+=cit_len
  
  if len(text) >0:
    percent_removed=rem_len/len(text)
  else:
    percent_removed=1
  return text1,percent_removed 

# print(context[5])
# remove_citation(context[5],regex=re_apa)

In [91]:
output=df1['text'].apply(lambda x: remove_matches(text=x,regex=re_apa)) #df['col1'] = df.apply(lambda x: complex_function(x['col1']), axis=1)
df_o = pd.DataFrame(list(output), columns =['clean','p_rem'])
output_1=df_o['clean'].apply(lambda x: remove_matches(text=x,regex='[^\w_0-9 ]+')) 
df_o_1 = pd.DataFrame(list(output_1), columns =['clean','p_rem'])
#df_o.head()

df1['text_clean']=df_o_1['clean']
df1['text_clean_len']=df_o_1['clean'].apply(len)
df1['p_rem']=df_o['p_rem']

In [92]:
df1

Unnamed: 0,cited_paper,label,text,text_clean,text_clean_len,p_rem
0,A00-2024,0,We analyzed a set of articles and identified s...,We analyzed a set of articles and identified s...,425,0.098765
1,A00-2024,0,Table 3: Example compressions Compression AvgL...,Table 3 Example compressions Compression AvgLe...,229,0.260745
2,A00-2024,0,5.3 Related works and discussion Our two-step ...,53 Related works and discussion Our twostep mo...,105,0.308176
3,A00-2024,0,(1999) proposed a summarization system based o...,proposed a summarization system based on the ...,321,0.078804
4,A00-2024,0,We found that the deletion of lead parts did n...,We found that the deletion of lead parts did n...,73,0.408000
...,...,...,...,...,...,...
8731,W96-0213,1,He has achieved state-of-the art results by ap...,He has achieved stateofthe art results by appl...,139,0.151515
8732,W96-0213,0,"B = (Brill and Wu, 1998); M = (Magerman, 1995)...",B M Magerman 1995 O our data R Ratnaparkhi 1,48,0.421488
8733,W96-0213,0,The model we use is similar to that of (Ratnap...,The model we use is similar to that of Ratnapa...,55,0.000000
8734,W96-0213,1,Our model exploits the same kind of tag-n-gram...,Our model exploits the same kind of tagngram i...,157,0.000000


# Remove under and over sized samples
large samples appear to be poorly written

In [93]:
def getMidLen(data,label,labelKey='label',lenKey='text_clean_len',lowMod=1,highMod=1):
  df1 =data.loc[data[labelKey] == label]
  neu_mean=np.mean(list(df1[lenKey]))
  neu_std=np.std(list(df1[lenKey]))
  df1_no_high = df1.loc[df1[lenKey] < highMod*(neu_mean +neu_std)]
  # print(neu_mean)
  # print(neu_std)

  while neu_std > neu_mean:
    neu_mean=np.mean(list(df1_no_high['text_clean_len']))
    neu_std=np.std(list(df1_no_high['text_clean_len']))
    # print(neu_mean)
    # print(neu_std)
    df1_no_high = df1.loc[df1['text_clean_len'] < highMod*(neu_mean +neu_std)]

  df1_mid = df1_no_high.loc[df1_no_high['text_clean_len'] > lowMod*(neu_mean -neu_std)]

  return df1_mid

df2 = df1.loc[df1['p_rem'] < .5] #keep sampels with less than half of it are citation

df_neu=getMidLen(df2,0,lowMod=2)
df_pos=getMidLen(df2,1,lowMod=1,highMod=2)
df_neg=getMidLen(df2,-1,lowMod=1,highMod=2)
df3= pd.concat([df_neg,df_neu,df_pos])
df3['label'].value_counts()

 0    2524
 1     746
-1     246
Name: label, dtype: int64

In [94]:
def catagorize(data,labelKey='label'):
  rows=len(data.index)
  onehots=np.zeros((rows,3),dtype=int)
  for i,lab in enumerate(data[labelKey]):
    onehots[i][lab+1]=1
  return onehots

hots=catagorize(df3)
df3['label_onehot']=list(hots)
df3['label_index']=df3['label']+1

In [95]:
freq= np.array(list(df3['label_index'].value_counts(normalize=True,sort=False)))
print(freq)
class_ratio= 1/freq
class_ratio

[0.06996587 0.71786121 0.21217292]


array([14.29268293,  1.39302694,  4.71313673])

#Load Model

In [96]:
X_train, X_test, y_train, y_test = train_test_split(list(df3['text_clean']), list(df3['label_index']), test_size=0.2, random_state=42)
X_train= [[s] for s in X_train]
X_test= [[s] for s in X_test]
y_train=[[s] for s in list(y_train)]
y_test=[[s] for s in list(y_test)]

In [97]:
#@title Choose a BERT model to fine-tune (Taken from tutorial)

bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'  #@param ["bert_en_uncased_L-12_H-768_A-12", "bert_en_cased_L-12_H-768_A-12", "bert_multi_cased_L-12_H-768_A-12", "small_bert/bert_en_uncased_L-2_H-128_A-2", "small_bert/bert_en_uncased_L-2_H-256_A-4", "small_bert/bert_en_uncased_L-2_H-512_A-8", "small_bert/bert_en_uncased_L-2_H-768_A-12", "small_bert/bert_en_uncased_L-4_H-128_A-2", "small_bert/bert_en_uncased_L-4_H-256_A-4", "small_bert/bert_en_uncased_L-4_H-512_A-8", "small_bert/bert_en_uncased_L-4_H-768_A-12", "small_bert/bert_en_uncased_L-6_H-128_A-2", "small_bert/bert_en_uncased_L-6_H-256_A-4", "small_bert/bert_en_uncased_L-6_H-512_A-8", "small_bert/bert_en_uncased_L-6_H-768_A-12", "small_bert/bert_en_uncased_L-8_H-128_A-2", "small_bert/bert_en_uncased_L-8_H-256_A-4", "small_bert/bert_en_uncased_L-8_H-512_A-8", "small_bert/bert_en_uncased_L-8_H-768_A-12", "small_bert/bert_en_uncased_L-10_H-128_A-2", "small_bert/bert_en_uncased_L-10_H-256_A-4", "small_bert/bert_en_uncased_L-10_H-512_A-8", "small_bert/bert_en_uncased_L-10_H-768_A-12", "small_bert/bert_en_uncased_L-12_H-128_A-2", "small_bert/bert_en_uncased_L-12_H-256_A-4", "small_bert/bert_en_uncased_L-12_H-512_A-8", "small_bert/bert_en_uncased_L-12_H-768_A-12", "albert_en_base", "electra_small", "electra_base", "experts_pubmed", "experts_wiki_books", "talking-heads_base"]

map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'electra_small':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'electra_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_pubmed':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_wiki_books':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}

tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


## check model passes

In [98]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [99]:
text_test = X_train[1]
text_preprocessed = bert_preprocess_model(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

Keys       : ['input_word_ids', 'input_type_ids', 'input_mask']
Shape      : (1, 128)
Word Ids   : [  101  1999  5688 11416  6024  4275  2024  4738  2000 25845  1996  4101]
Input Mask : [1 1 1 1 1 1 1 1 1 1 1 1]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [100]:
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [101]:
bert_results = bert_model(text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Pooled Outputs Shape:(1, 512)
Pooled Outputs Values:[ 0.99835694 -0.7678578  -0.21300124  0.08411987 -0.08593949  0.98550844
  0.9732349  -0.8306031  -0.55687106 -0.95725054 -0.3960305  -0.94115573]
Sequence Outputs Shape:(1, 128, 512)
Sequence Outputs Values:[[ 3.8915187e-01  2.3267061e-01  6.1780311e-02 ... -9.2866874e-01
   4.3027106e-01  8.3279318e-01]
 [ 4.5310837e-01  7.2308904e-01 -3.2866317e-01 ... -1.1163950e-04
  -3.3482751e-01  5.7274055e-01]
 [-2.5876865e-01  1.4199525e+00 -4.2525381e-01 ...  4.9038833e-01
   1.4491324e-01  5.7048750e-01]
 ...
 [ 2.6529512e-01 -2.3668993e-01  8.4921330e-02 ...  2.0211086e-02
   3.9103544e-01  8.9449620e-01]
 [ 2.9499257e-01  5.8424294e-01 -6.7344594e-01 ... -1.6148384e+00
   1.3211843e+00 -6.0517174e-01]
 [-1.8697177e-01  5.2527428e-01  9.2377967e-01 ... -5.6088662e-01
   1.0293359e+00 -7.8856331e-01]]


## full model setup

In [102]:
def build_classifier_model():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1)(net)
  net = tf.keras.layers.Dense(3, activation='softmax', name='classifier')(net)
  return tf.keras.Model(text_input, net)

In [103]:
classifier_model = build_classifier_model()

## check loss function

In [104]:
bert_raw_result = classifier_model(tf.constant(text_test))
print(bert_raw_result)

l =  SparseCategoricalFocalLoss(gamma=2,class_weight=class_ratio)
test =tf.convert_to_tensor([1.0])
l(test,bert_raw_result)

tf.Tensor([[0.1488465 0.2751187 0.5760347]], shape=(1, 3), dtype=float32)


<tf.Tensor: shape=(), dtype=float32, numpy=0.9446458>

# Train Save and Log

In [105]:
epochs = 5
steps_per_epoch = 200 #tf.data.experimental.cardinality(X_train).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 2e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

# def auc_wrapper(y_true,y_pred):
#   print(y_true,y_pred)

#   y_true=tf.reshape(y_true,[1])
#   print(y_true)
#   y_true= tf.cast(y_true, tf.int32)
#   print(y_true)
#   y_true=tf.one_hot(y_true,depth=3)
#   print(y_true)
#   return tf.keras.metrics.AUC(multi_label=True)(y_true,y_pred)


loss =  SparseCategoricalFocalLoss(gamma=2,class_weight=class_ratio) #tf.keras.losses.MeanSquaredError()
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]#, auc_wrapper]#, tf.keras.metrics.AUC(multi_label=True)]


classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath="citation_BERT_{epoch}",
        save_best_only=True,  # Only save a model if `val_loss` has improved.
        monitor="val_sparse_categorical_accuracy",
        verbose=1
    ),
    tf.keras.callbacks.TensorBoard('./logs', update_freq=1)
]


In [106]:
# print(f'Training model with {tfhub_handle_encoder}')
# history = classifier_model.fit(x=X_train,y=y_train, validation_data=(X_test,y_test),epochs=epochs,callbacks= callbacks, verbose=True)

In [107]:
# classifier_model.save_weights(''/content/drive/MyDrive/Text-ML/checkpoint1')

# Inspect model

In [108]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [109]:
#%tensorboard --logdir=logs

In [110]:
classifier_model.load_weights('/content/drive/MyDrive/Text-ML/checkpoint')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f28fb9c16d0>

In [111]:
# preds=classifier_model.predict(X_train,verbose=1)

In [112]:
# preds_t=classifier_model.predict(X_test,verbose=1)

In [113]:
# c_mat=tf.math.confusion_matrix(np.argmax(preds,-1),y_train)
# ax = sns.heatmap(c_mat,annot=True,linewidths=.5)

In [114]:
# c_mat=tf.math.confusion_matrix(np.argmax(preds_t,-1),y_test)
# ax = sns.heatmap(c_mat,annot=True,linewidths=.5)

# Get attention colorings

https://towardsdatascience.com/deconstructing-bert-part-2-visualizing-the-inner-workings-of-attention-60a16d86b5c1


In [115]:
classifier_model.layers

[<keras.engine.input_layer.InputLayer at 0x7f28e777bd90>,
 <tensorflow_hub.keras_layer.KerasLayer at 0x7f28e77c58d0>,
 <tensorflow_hub.keras_layer.KerasLayer at 0x7f28f4fe8a90>,
 <keras.layers.core.dropout.Dropout at 0x7f28e722e3d0>,
 <keras.layers.core.dense.Dense at 0x7f28e7160b10>]

In [116]:
from official.nlp import bert 
import official.nlp.bert.tokenization

In [117]:
tokenizer = bert.tokenization.FullTokenizer(vocab_file='/content/drive/MyDrive/Text-ML/vocab.txt')
preprocesser_model = keras.Model(inputs=classifier_model.input,outputs=classifier_model.get_layer('preprocessing').output)
encoder_model = keras.Model(inputs=classifier_model.input,outputs=classifier_model.get_layer('BERT_encoder').output)

In [118]:
print("Vocab size:", len(tokenizer.vocab))

Vocab size: 30522


## Attnetion token mapping

In [119]:
def get_attn(context,prep,encoder): # assume stirng array input
  t_context=tf.convert_to_tensor(context)

  p_out=prep(t_context)
  #print(p_out)
  stop_index=0
  while(stop_index< p_out["input_mask"].shape[1] and p_out["input_mask"][0][stop_index] == 1):
    stop_index+=1
  
  if stop_index >= 128:
    stop_index=127

  output = encoder(t_context)
  #print(output["sequence_output"].shape)
  valid_entries=output["sequence_output"][:,1:stop_index-1,:]
  a=tf.math.reduce_mean(valid_entries,-1)
  mean=tf.math.reduce_mean(a,-1,keepdims=True)
  std=tf.math.reduce_std(a,-1,keepdims=True)
  a1=(a-mean)/std

  return a1

In [120]:
def get_attn_for_words(context,tokenizer,prep,encoder):
  attn = get_attn(context,prep,encoder).numpy()
  tokens = tokenizer.tokenize(context[0]) 

  indicies=np.ones((len(tokens)),dtype=int)
  for i,tok in enumerate(tokens):
    if '##' in tok:
      indicies[i]=0

  full_words=tokens.copy()
  ix=-1
  for i,tok in enumerate(tokens):
    if not indicies[i]:
      attn[0][ix]+=attn[0][i]
      full_words[ix]+=tok[2:]
    else:
      ix=i

  t_f=tf.convert_to_tensor(full_words) #stores as byte string...
  masked_f=tf.boolean_mask(t_f,indicies)
  t_a=tf.convert_to_tensor(attn)[0]
  masked_a=tf.boolean_mask(t_a[:len(indicies)],indicies)
  

  return masked_f.numpy(),masked_a.numpy()

#words, at=get_attn_for_words(processed,tokenizer,preprocesser_model,encoder_model)

## converters and annotation

In [121]:
def process_for_input(raw_context):
  c1,_ =remove_matches(text=raw_context,regex=re_apa)
  c2,_ =remove_matches(text=c1,regex='[^\w_\-0-9 ]+')
  return [c2]

#process_for_input(example)

In [122]:
def conv_bytes_strs(words):
  return [w.decode('UTF-8') for w in list(words)]

In [123]:
def conv_to_color(attn): #blue pos red neg
  rgbs = np.zeros((len(attn),3),dtype=int)
  for i,score in enumerate(attn):
    if score < 0:
      rgbs[i][0]=-255*score//2
    else:
      rgbs[i][2]=255*score//2
  
  return rgbs

In [124]:
def coloring(text,fore=None,back=None):
    txt=text
    if fore != None and fore[0] != -1:
      txt = "\033[38;2;{};{};{}m".format(fore[0], fore[1], fore[2])+txt
    if back != None and back[0] != -1:
      txt = "\033[48;2;{};{};{}m".format(back[0], back[1], back[2])+txt
    return txt

#print(coloring('Hello',back=[500,0,0]) + coloring('Hello', back=(0,0,255)))

In [125]:
def text_class(text,model,d=3):
  codes=tf.constant([[-1.0,0.0,1.0]])
  pred=model.predict(text)[0]
  i=tf.argmax(pred)
  res=codes*pred
  def roundDown(n, d=2):
    d = int('1' + ('0' * d))
    return np.floor(np.array(n) * d) / d
  pred=roundDown(pred,d)
  score=tf.math.reduce_mean(res,-1)[0].numpy()
  classification=roundDown(codes[0,i].numpy(),d)
  max_confidence=pred[i]
  return classification,max_confidence,roundDown(score,d),list(pred)

## full pipeline

In [126]:
def color_by_attn(text,toker,preper,encoder):
  all_words_original=text.split()
  all_words=text.lower().split()
  processed= process_for_input(text)

  words, at=get_attn_for_words(processed,tokenizer,preprocesser_model,encoder_model)

  total_at_abs=np.sum(np.absolute(at))

  ws=conv_bytes_strs(words)
  conv=conv_to_color(at)
  mapping=dict(zip(ws,conv))
  orig_mapping=dict(zip(all_words,all_words_original))

  for i,w in enumerate(all_words):
    if w not in mapping:
      mapping[w]=[0,0,0] #make black
    else:
      mapping[w]=list(mapping[w])

  colored=[coloring(orig_mapping[word],fore=[255,255,255],back=mapping[word]) for word in all_words]
  printed=' '.join(colored)
  return printed,total_at_abs

# PDF text extratction


In [153]:
def convert_pdf_to_string(file_path):
	output_string = StringIO()
	with open(file_path, 'rb') as in_file:
	    parser = PDFParser(in_file)
	    doc = PDFDocument(parser)
	    rsrcmgr = PDFResourceManager()
	    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
	    interpreter = PDFPageInterpreter(rsrcmgr, device)
	    for page in PDFPage.create_pages(doc):
	        interpreter.process_page(page)

	return(output_string.getvalue())
 
def sent_extract(sample):
	delims=re.findall('\. +[A-Z]',sample)
	sents=re.split('\. +[A-Z]',sample)

	sents[0]=sents[0]+'.'
	for i,s in enumerate(sents[1:]):
		sents[i+1]=delims[i][-1]+s+'.'
	for i,s in enumerate(sents):
		sents[i]=re.sub('\d+https(\w|\:|\/|\.|\?|\=|\-|\&)+','',s)
	
	return sents

def pdf_text_extract(path):
  text=convert_pdf_to_string(path)
  text1 = text.replace('\x0c','')
  text2 = text1.split('.\n\n')
  refine=[t.replace('\n',' ') for t in text2]
  r=[]
  for t in refine:
    r+=sent_extract(t)
  return r

In [154]:
def get_content(text):
  abs_end=0
  ref_start=len(text)-1
  while abs_end< len(text) and 'INTRODUCTION' not in text[abs_end]: #start at the intro
    abs_end+=1
  end_first=abs_end
  while  end_first< len(text) and '∗' not in text[end_first]: #start at the intro
    end_first+=1
  copy_mark=end_first
  while  copy_mark< len(text) and '©' not in text[copy_mark]: #start at the intro
    copy_mark+=1
  while  ref_start> 0 and 'REFERENCES' not in text[ref_start]: 
    ref_start-=1
  
  if abs_end == len(text): #fail and return
    return text
  return text[abs_end:end_first]+text[copy_mark+1:ref_start]

# Coloring of a PDF

In [129]:
path='/content/drive/MyDrive/Text-ML/phocus.pdf'
text=pdf_text_extract(path)

In [130]:
content=get_content(text)

In [160]:
def label_text(content,p=True,print_err=False):
  errs=[]
  dfs=[]
  sect='Unknown'
  cols=['sentence','section','class','confidence','net score','net attention','neg','neu','pos','text','colored']
  i=0
  for t in content:
    try:
      if ', ,' in t or len(t) < 20:
        continue
      match =re.match('^\d+ [A-Z]+ ?(\w+)? [A-Z]',t)
      sp=1
      if match != None:
        sp=match.span()[1]-2
        if p:
          print(t[:sp])
        sect=re.split('\d+ ',t[:sp])[1]
        t=t[sp:]

      c,conf,sc,raw=text_class([t],classifier_model)
      colored,abs_at=color_by_attn(t,tokenizer,preprocesser_model,encoder_model)
      df_t = pd.DataFrame([[i,sect,c,conf,sc,abs_at,raw[0],raw[1],raw[2],t,colored]],columns=cols)
      if p:
        print(i,c,conf,colored)
      dfs.append(df_t)
      i+=1
        
    except Exception:
      if print_err:
        print('err:', t)
  
  return pd.concat(dfs)

In [132]:
# df_phocus=label_text(content)

In [133]:
# df_phocus

## Full paper labeling pipeline

In [138]:
def label(path,p=True,print_err=False):
  text=pdf_text_extract(path)
  content=get_content(text)
  return text,content,label_text(content,p,print_err)

In [162]:
#@title Click the run button to upload the pdf you want to scan.
from google.colab import files
! cd "/content"
uploaded = files.upload()

pdf = "/content/" + list(uploaded.keys())[0]

Saving bhi-ar-2016.pdf to bhi-ar-2016 (2).pdf


In [163]:
a,b,c=label(pdf)

0 0.0 0.659 [48;2;0;0;40m[38;2;255;255;255mINTRODUCTION [48;2;0;0;139m[38;2;255;255;255mActivity [48;2;0;0;191m[38;2;255;255;255mrecognition [48;2;318;0;0m[38;2;255;255;255mresearch [48;2;250;0;0m[38;2;255;255;255moriginally [48;2;0;0;227m[38;2;255;255;255mutilized [48;2;75;0;0m[38;2;255;255;255mspecially [48;2;199;0;0m[38;2;255;255;255mengineered [48;2;48;0;0m[38;2;255;255;255mdevices [48;2;0;0;45m[38;2;255;255;255mdistributed [48;2;0;0;2m[38;2;255;255;255macross [48;2;0;0;115m[38;2;255;255;255ma [48;2;0;0;0m[38;2;255;255;255msubject’s [48;2;118;0;0m[38;2;255;255;255mbody [48;2;0;0;0m[38;2;255;255;255m[2, [48;2;0;0;0m[38;2;255;255;255m6, [48;2;0;0;0m[38;2;255;255;255m11] [48;2;26;0;0m[38;2;255;255;255mto [48;2;182;0;0m[38;2;255;255;255midentify [48;2;0;0;186m[38;2;255;255;255mthe [48;2;0;0;0m[38;2;255;255;255msubject’s [48;2;0;0;66m[38;2;255;255;255mphysical [48;2;0;0;0m[38;2;255;255;255mactivities, [48;2;88;0;0m[38;2;255;255;255mbut [4

In [161]:
c

Unnamed: 0,sentence,section,class,confidence,net score,net attention,neg,neu,pos,text,colored
0,0,ABSTRACT,0.0,0.797,0.066,21.192471,0.001,0.797,0.200,We propose a novel loss we term the Focal Loss...,[48;2;205;0;0m[38;2;255;255;255mwe [48;2;0;...
0,1,ABSTRACT,1.0,0.953,0.315,22.240728,0.007,0.039,0.953,"As our experiments will demonstrate, the propo...",[48;2;33;0;0m[38;2;255;255;255mAs [48;2;105...
0,2,ABSTRACT,1.0,0.668,0.193,31.278921,0.089,0.242,0.668,"In contrast, one-stage detectors that are appl...",[48;2;0;0;289m[38;2;255;255;255mIn [48;2;0;...
0,3,ABSTRACT,0.0,0.502,0.102,7.899637,0.095,0.502,0.402,"In this paper, we investigate why this is the ...",[48;2;0;0;51m[38;2;255;255;255mIn [48;2;0;0...
0,4,ABSTRACT,0.0,0.615,0.116,19.203249,0.018,0.615,0.366,We discover that the ex- treme foreground-back...,[48;2;319;0;0m[38;2;255;255;255mWe [48;2;99...
...,...,...,...,...,...,...,...,...,...,...,...
0,295,ABSTRACT,1.0,0.749,0.249,9.156259,0.001,0.249,0.749,Rapid object detection using a boosted cascad...,[48;2;75;0;0m[38;2;255;255;255mRapid [48;2;...
0,296,ABSTRACT,0.0,0.523,0.104,4.544265,0.081,0.523,0.394,"In CVPR, 2001. 2, 3 [38] S.",[48;2;0;0;29m[38;2;255;255;255mIn [48;2;0;0...
0,297,ABSTRACT,0.0,0.814,0.057,5.918126,0.005,0.814,0.179,Aggregated residual transformations for deep n...,[48;2;73;0;0m[38;2;255;255;255mAggregated [...
0,298,ABSTRACT,0.0,0.458,0.115,4.598346,0.097,0.458,0.443,"In CVPR, 2017. 8 [39] C.",[48;2;0;0;12m[38;2;255;255;255mIn [48;2;0;0...


In [None]:
df_phocus.sort_values(by=['net score'])

Unnamed: 0,sentence,section,class,confidence,net score,net attention,neg,neu,pos,text,colored
0,31,RELATED WORK,-1.0,0.924,-0.305,19.884472,0.924,0.066,0.009,"However, this method generates the overall sen...","[48;2;0;0;0m[38;2;255;255;255mHowever, [48;..."
0,123,EXPERIMENTS,-1.0,0.917,-0.297,16.340233,0.917,0.055,0.027,"However, we cannot complete this job yet out o...","[48;2;0;0;0m[38;2;255;255;255mHowever, [48;..."
0,10,INTRODUCTION,-1.0,0.764,-0.226,12.599020,0.764,0.145,0.089,Quantitative metrics could not evaluate the re...,[48;2;0;0;89m[38;2;255;255;255mQuantitative ...
0,141,EXPERIMENTS,-1.0,0.597,-0.192,10.496807,0.597,0.378,0.023,Those metrics are derived from citations and d...,[48;2;0;0;5m[38;2;255;255;255mThose [48;2;4...
0,77,RELATED WORK,-1.0,0.597,-0.192,10.496807,0.597,0.378,0.023,Those metrics are derived from citations and d...,[48;2;0;0;5m[38;2;255;255;255mThose [48;2;4...
...,...,...,...,...,...,...,...,...,...,...,...
0,0,INTRODUCTION,1.0,0.909,0.296,8.146835,0.020,0.070,0.909,The number of papers published each year has ...,[48;2;72;0;0m[38;2;255;255;255mThe [48;2;8;...
0,115,METHODOLOGY,1.0,0.907,0.299,7.527385,0.009,0.082,0.907,The main idea is shown in Figure 3.,[48;2;0;0;85m[38;2;255;255;255mThe [48;2;11...
0,125,EXPERIMENTS,1.0,0.915,0.303,16.606018,0.004,0.079,0.915,"Besides, we also compare our modules to other ...","[48;2;0;0;0m[38;2;255;255;255mBesides, [48;..."
0,142,EXPERIMENTS,1.0,0.934,0.303,6.461682,0.023,0.041,0.934,Semantic Scholar makes the first step towards ...,[48;2;72;0;0m[38;2;255;255;255mSemantic [48...


In [None]:
df_phocus.sort_values(by=['net attention'])

Unnamed: 0,sentence,section,class,confidence,net score,net attention,neg,neu,pos,text,colored
0,129,EXPERIMENTS,-1.0,0.437,-0.011,3.302047,0.437,0.158,0.404,"As we emphasize, Pat.",[48;2;123;0;0m[38;2;255;255;255mAs [48;2;0;...
0,119,METHODOLOGY,1.0,0.733,0.237,4.644296,0.020,0.246,0.733,The first one is margin effects.,[48;2;83;0;0m[38;2;255;255;255mThe [48;2;21...
0,74,RELATED WORK,0.0,0.599,0.055,4.656850,0.116,0.599,0.283,higher or equal to ℎ.,[48;2;199;0;0m[38;2;255;255;255mhigher [48;...
0,172,CONCLUSION,1.0,0.829,0.248,4.685307,0.083,0.086,0.829,Phocus still need improvements.,[48;2;0;0;182m[38;2;255;255;255mPhocus [48;...
0,5,INTRODUCTION,-1.0,0.631,-0.153,4.988726,0.631,0.193,0.174,They state this opinion in two aspects.,[48;2;0;0;39m[38;2;255;255;255mThey [48;2;2...
...,...,...,...,...,...,...,...,...,...,...,...
0,117,METHODOLOGY,0.0,0.533,0.154,46.593880,0.001,0.533,0.464,"Then, the academic influential factor of 𝐴 is:...","[48;2;0;0;0m[38;2;255;255;255mThen, [48;2;6..."
0,116,METHODOLOGY,0.0,0.940,0.019,58.259689,0.000,0.940,0.058,𝐴 denote a citing paper with academic influent...,[48;2;0;0;0m[38;2;255;255;255m𝐴 [48;2;24;0;...
0,144,EXPERIMENTS,0.0,0.940,0.012,60.146317,0.010,0.940,0.048,The features Seman- tic Scholar use are total ...,[48;2;3;0;0m[38;2;255;255;255mthe [48;2;0;0...
0,81,RELATED WORK,0.0,0.935,0.015,62.142685,0.009,0.935,0.054,The features Semantic Scholar use are the tota...,[48;2;0;0;34m[38;2;255;255;255mthe [48;2;0;...


In [None]:
overall_paper_sentiment=np.mean(df_phocus['net attention'])
overall_paper_sentiment

18.59335708618164

In [None]:
overall_paper_sentiment=np.mean(df_phocus['net score'])
overall_paper_sentiment

0.08068181818181822