<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/ENEE439d-TEXTML/TextML/blob/master/input.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/ENEE439d-TEXTML/TextML/blob/master/input.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

## Imports and datset setup

In [None]:
!pip install -q -U "tensorflow-text==2.8.*" # A dependency of the preprocessing for BERT inputs
!pip install -q tf-models-official==2.7.0 # For adamW
!pip install focal-loss # focal loss implmnetion for tf



In [59]:
import pandas as pd #basic imports
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from keras import layers
import re

In [None]:
from sklearn.model_selection import train_test_split # https://www.tensorflow.org/text/tutorials/classify_text_with_bert
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization, bert  # to create AdamW optimizer
from focal_loss import SparseCategoricalFocalLoss
import official.nlp.bert.tokenization

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
filepath = '/content/drive/MyDrive/Text-ML/full_sentiment_dataset.csv' #'data.csv'
df= pd.read_csv(filepath)
df1=df.drop(['no','paper','context_a','context_b'],axis=1)
df1.head()

Unnamed: 0,cited_paper,label,text
0,A00-2024,0,We analyzed a set of articles and identified s...
1,A00-2024,0,Table 3: Example compressions Compression AvgL...
2,A00-2024,0,5.3 Related works and discussion Our two-step ...
3,A00-2024,0,(1999) proposed a summarization system based o...
4,A00-2024,0,We found that the deletion of lead parts did n...


In [None]:
df1['label'].value_counts()

 0    7627
 1     829
-1     280
Name: label, dtype: int64

In [None]:
context=df1['text']

re1= "\(((([A-Za-z]+ *)+(, \d+))+(; )*)+\)" # matches author and author, year
re_year=",? \(?\d{4}\)?" # match , {4 digits} which may be wrapped in () 
re_and="(and|&) "
re_auth="((\w+, )*(\w+ )+)"
re_et= re_auth+"et al\. ?"+re_year # matches author et al. , year
re_2a= re_auth+"("+re_and+"((\w+ *))?)?"+re_year # matches author and author, year
re_sep="((; )|( "+re_and+"))*"# match the '; ' gap or ' and ' gap
re_para_year="\(\d{4}\)"
re_in_brack="\[*\]"
re_apa =re_in_brack+"|"+re_para_year+"|"+"\(?("+"(\(?"+re_2a+"|"+re_et+"\)?)"+re_sep+")+"
print(re_apa)

\[*\]|\(\d{4}\)|\(?((\(?((\w+, )*(\w+ )+)((and|&) ((\w+ *))?)?,? \(?\d{4}\)?|((\w+, )*(\w+ )+)et al\. ?,? \(?\d{4}\)?\)?)((; )|( (and|&) ))*)+


In [None]:
def remove_matches(text,regex=re_apa):
  text1=text
  rem_len=0
  pattern= re.compile(regex)
  while True:
    matches=pattern.search(text1)
    #print(matches)
    if matches == None:
      break

    spn=matches.span()
    text1=text1[0:spn[0]]+text1[spn[1]:-1]
    cit_len=spn[1]-spn[0]
    rem_len+=cit_len
  
  if len(text) >0:
    percent_removed=rem_len/len(text)
  else:
    percent_removed=1
  return text1,percent_removed 

# print(context[5])
# remove_citation(context[5],regex=re_apa)

In [None]:
output=df1['text'].apply(lambda x: remove_matches(text=x,regex=re_apa)) #df['col1'] = df.apply(lambda x: complex_function(x['col1']), axis=1)
df_o = pd.DataFrame(list(output), columns =['clean','p_rem'])
output_1=df_o['clean'].apply(lambda x: remove_matches(text=x,regex='[^\w_0-9 ]+')) 
df_o_1 = pd.DataFrame(list(output_1), columns =['clean','p_rem'])
#df_o.head()

df1['text_clean']=df_o_1['clean']
df1['text_clean_len']=df_o_1['clean'].apply(len)
df1['p_rem']=df_o['p_rem']

In [None]:
df1

Unnamed: 0,cited_paper,label,text,text_clean,text_clean_len,p_rem
0,A00-2024,0,We analyzed a set of articles and identified s...,We analyzed a set of articles and identified s...,425,0.098765
1,A00-2024,0,Table 3: Example compressions Compression AvgL...,Table 3 Example compressions Compression AvgLe...,229,0.260745
2,A00-2024,0,5.3 Related works and discussion Our two-step ...,53 Related works and discussion Our twostep mo...,105,0.308176
3,A00-2024,0,(1999) proposed a summarization system based o...,proposed a summarization system based on the ...,321,0.078804
4,A00-2024,0,We found that the deletion of lead parts did n...,We found that the deletion of lead parts did n...,73,0.408000
...,...,...,...,...,...,...
8731,W96-0213,1,He has achieved state-of-the art results by ap...,He has achieved stateofthe art results by appl...,139,0.151515
8732,W96-0213,0,"B = (Brill and Wu, 1998); M = (Magerman, 1995)...",B M Magerman 1995 O our data R Ratnaparkhi 1,48,0.421488
8733,W96-0213,0,The model we use is similar to that of (Ratnap...,The model we use is similar to that of Ratnapa...,55,0.000000
8734,W96-0213,1,Our model exploits the same kind of tag-n-gram...,Our model exploits the same kind of tagngram i...,157,0.000000


# Remove under and over sized samples
large samples appear to be poorly written

In [None]:
def getMidLen(data,label,labelKey='label',lenKey='text_clean_len',lowMod=1,highMod=1):
  df1 =data.loc[data[labelKey] == label]
  neu_mean=np.mean(list(df1[lenKey]))
  neu_std=np.std(list(df1[lenKey]))
  df1_no_high = df1.loc[df1[lenKey] < highMod*(neu_mean +neu_std)]
  # print(neu_mean)
  # print(neu_std)

  while neu_std > neu_mean:
    neu_mean=np.mean(list(df1_no_high['text_clean_len']))
    neu_std=np.std(list(df1_no_high['text_clean_len']))
    # print(neu_mean)
    # print(neu_std)
    df1_no_high = df1.loc[df1['text_clean_len'] < highMod*(neu_mean +neu_std)]

  df1_mid = df1_no_high.loc[df1_no_high['text_clean_len'] > lowMod*(neu_mean -neu_std)]

  return df1_mid

df2 = df1.loc[df1['p_rem'] < .5] #keep sampels with less than half of it are citation

df_neu=getMidLen(df2,0,lowMod=2)
df_pos=getMidLen(df2,1,lowMod=1,highMod=2)
df_neg=getMidLen(df2,-1,lowMod=1,highMod=2)
df3= pd.concat([df_neg,df_neu,df_pos])
df3['label'].value_counts()

 0    2524
 1     746
-1     246
Name: label, dtype: int64

In [None]:
def catagorize(data,labelKey='label'):
  rows=len(data.index)
  onehots=np.zeros((rows,3),dtype=int)
  for i,lab in enumerate(data[labelKey]):
    onehots[i][lab+1]=1
  return onehots

hots=catagorize(df3)
df3['label_onehot']=list(hots)
df3['label_index']=df3['label']+1

In [None]:
freq= np.array(list(df3['label_index'].value_counts(normalize=True,sort=False)))
print(freq)
class_ratio= 1/freq
class_ratio

[0.06996587 0.71786121 0.21217292]


array([14.29268293,  1.39302694,  4.71313673])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(list(df3['text_clean']), list(df3['label_index']), test_size=0.2, random_state=42)
X_train= [[s] for s in X_train]
X_test= [[s] for s in X_test]
y_train=[[s] for s in list(y_train)]
y_test=[[s] for s in list(y_test)]

In [None]:
#X_train

In [None]:
#@title Choose a BERT model to fine-tune (Taken from tutorial)

bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'  #@param ["bert_en_uncased_L-12_H-768_A-12", "bert_en_cased_L-12_H-768_A-12", "bert_multi_cased_L-12_H-768_A-12", "small_bert/bert_en_uncased_L-2_H-128_A-2", "small_bert/bert_en_uncased_L-2_H-256_A-4", "small_bert/bert_en_uncased_L-2_H-512_A-8", "small_bert/bert_en_uncased_L-2_H-768_A-12", "small_bert/bert_en_uncased_L-4_H-128_A-2", "small_bert/bert_en_uncased_L-4_H-256_A-4", "small_bert/bert_en_uncased_L-4_H-512_A-8", "small_bert/bert_en_uncased_L-4_H-768_A-12", "small_bert/bert_en_uncased_L-6_H-128_A-2", "small_bert/bert_en_uncased_L-6_H-256_A-4", "small_bert/bert_en_uncased_L-6_H-512_A-8", "small_bert/bert_en_uncased_L-6_H-768_A-12", "small_bert/bert_en_uncased_L-8_H-128_A-2", "small_bert/bert_en_uncased_L-8_H-256_A-4", "small_bert/bert_en_uncased_L-8_H-512_A-8", "small_bert/bert_en_uncased_L-8_H-768_A-12", "small_bert/bert_en_uncased_L-10_H-128_A-2", "small_bert/bert_en_uncased_L-10_H-256_A-4", "small_bert/bert_en_uncased_L-10_H-512_A-8", "small_bert/bert_en_uncased_L-10_H-768_A-12", "small_bert/bert_en_uncased_L-12_H-128_A-2", "small_bert/bert_en_uncased_L-12_H-256_A-4", "small_bert/bert_en_uncased_L-12_H-512_A-8", "small_bert/bert_en_uncased_L-12_H-768_A-12", "albert_en_base", "electra_small", "electra_base", "experts_pubmed", "experts_wiki_books", "talking-heads_base"]

map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'electra_small':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'electra_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_pubmed':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_wiki_books':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}

tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


In [None]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [None]:
text_test = X_train[1]
text_preprocessed = bert_preprocess_model(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

Keys       : ['input_word_ids', 'input_type_ids', 'input_mask']
Shape      : (1, 128)
Word Ids   : [  101  1999  5688 11416  6024  4275  2024  4738  2000 25845  1996  4101]
Input Mask : [1 1 1 1 1 1 1 1 1 1 1 1]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [None]:
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [None]:
bert_results = bert_model(text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Pooled Outputs Shape:(1, 512)
Pooled Outputs Values:[ 0.99835694 -0.7678578  -0.21300124  0.08411987 -0.08593949  0.98550844
  0.9732349  -0.8306031  -0.55687106 -0.95725054 -0.3960305  -0.94115573]
Sequence Outputs Shape:(1, 128, 512)
Sequence Outputs Values:[[ 3.8915187e-01  2.3267061e-01  6.1780311e-02 ... -9.2866874e-01
   4.3027106e-01  8.3279318e-01]
 [ 4.5310837e-01  7.2308904e-01 -3.2866317e-01 ... -1.1163950e-04
  -3.3482751e-01  5.7274055e-01]
 [-2.5876865e-01  1.4199525e+00 -4.2525381e-01 ...  4.9038833e-01
   1.4491324e-01  5.7048750e-01]
 ...
 [ 2.6529512e-01 -2.3668993e-01  8.4921330e-02 ...  2.0211086e-02
   3.9103544e-01  8.9449620e-01]
 [ 2.9499257e-01  5.8424294e-01 -6.7344594e-01 ... -1.6148384e+00
   1.3211843e+00 -6.0517174e-01]
 [-1.8697177e-01  5.2527428e-01  9.2377967e-01 ... -5.6088662e-01
   1.0293359e+00 -7.8856331e-01]]


In [None]:
def build_classifier_model():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1)(net)
  net = tf.keras.layers.Dense(3, activation='softmax', name='classifier')(net)
  return tf.keras.Model(text_input, net)

In [None]:
classifier_model = build_classifier_model()

In [None]:
bert_raw_result = classifier_model(tf.constant(text_test))
print(bert_raw_result)

l =  SparseCategoricalFocalLoss(gamma=2,class_weight=class_ratio)
test =tf.convert_to_tensor([1.0])
l(test,bert_raw_result)

tf.Tensor([[0.4861831  0.43295795 0.08085901]], shape=(1, 3), dtype=float32)


<tf.Tensor: shape=(), dtype=float32, numpy=0.37495145>

In [None]:
epochs = 5
steps_per_epoch = 200 #tf.data.experimental.cardinality(X_train).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 2e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

# def auc_wrapper(y_true,y_pred):
#   print(y_true,y_pred)

#   y_true=tf.reshape(y_true,[1])
#   print(y_true)
#   y_true= tf.cast(y_true, tf.int32)
#   print(y_true)
#   y_true=tf.one_hot(y_true,depth=3)
#   print(y_true)
#   return tf.keras.metrics.AUC(multi_label=True)(y_true,y_pred)


loss =  SparseCategoricalFocalLoss(gamma=2,class_weight=class_ratio) #tf.keras.losses.MeanSquaredError()
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]#, auc_wrapper]#, tf.keras.metrics.AUC(multi_label=True)]


classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath="citation_BERT_{epoch}",
        save_best_only=True,  # Only save a model if `val_loss` has improved.
        monitor="val_sparse_categorical_accuracy",
        verbose=1
    ),
    tf.keras.callbacks.TensorBoard('./logs', update_freq=1)
]


In [None]:
# print(f'Training model with {tfhub_handle_encoder}')
# history = classifier_model.fit(x=X_train,y=y_train, validation_data=(X_test,y_test),epochs=epochs,callbacks= callbacks, verbose=True)

In [None]:
# classifier_model.save_weights(''/content/drive/MyDrive/Text-ML/checkpoint1')

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
#%tensorboard --logdir=logs

https://towardsdatascience.com/deconstructing-bert-part-2-visualizing-the-inner-workings-of-attention-60a16d86b5c1


In [None]:
classifier_model.layers

[<keras.engine.input_layer.InputLayer at 0x7f9151ed3e90>,
 <tensorflow_hub.keras_layer.KerasLayer at 0x7f9151ee8b10>,
 <tensorflow_hub.keras_layer.KerasLayer at 0x7f915242de50>,
 <keras.layers.core.dropout.Dropout at 0x7f914d8d8d10>,
 <keras.layers.core.dense.Dense at 0x7f914b96d290>]

In [None]:
classifier_model.load_weights('/content/drive/MyDrive/Text-ML/checkpoint')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f914d946690>

In [None]:
preprocesser_model = keras.Model(inputs=classifier_model.input,
                                       outputs=classifier_model.get_layer('preprocessing').output)

encoder_model = keras.Model(inputs=classifier_model.input,
                                       outputs=classifier_model.get_layer('BERT_encoder').output)

text_test = X_train[1]
test_tensor=tf.convert_to_tensor(text_test)
print("input:",test_tensor)

p_out=preprocesser_model(test_tensor)
text_preprocessed = bert_preprocess_model(text_test)
print(text_preprocessed["input_mask"])
stop_index=0
while(text_preprocessed["input_mask"][0][stop_index] == 1):
  stop_index+=1
print("stop id index:",stop_index)

#when using the tokenizer below it ignores the sep and clr 
input_id_list = text_preprocessed["input_word_ids"][:,1:stop_index-1]
print(input_id_list)

output = encoder_model(test_tensor)
print(output["sequence_output"].shape)
valid_entries=output["sequence_output"][:,1:stop_index-1,:]
#print(valid_entries)
a=tf.math.reduce_mean(valid_entries,-1)
print(a)
mean=tf.math.reduce_mean(a,-1,keepdims=True)
print(mean)
std=tf.math.reduce_std(a,-1,keepdims=True)
print(std)
a1=(a-mean)/std
print(a1)

input: tf.Tensor([b'In contrast generative models are trained to maximize the joint probability of the training  used transformationbased learning Brill 1995 which for the present purposes can be tought of as a classi cationbased '], shape=(1,), dtype=string)
tf.Tensor(
[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]], shape=(1, 128), dtype=int32)
stop id index: 43
tf.Tensor(
[[ 1999  5688 11416  6024  4275  2024  4738  2000 25845  1996  4101  9723
   1997  1996  2731  2109  8651 15058  2094  4083  7987  8591  2786  2029
   2005  1996  2556  5682  2064  2022  7823  2102  1997  2004  1037  2465
   2072  4937  3258 15058  2094]], shape=(1, 41), dtype=int32)
(1, 128, 512)
tf.Tensor(
[[-0.01570772 -0.01725309 -0.01938405 -0.01798658 -0.01953679 -0.0186075
  -0.02059441 -0.0194

In [None]:
from official.nlp import bert 
import official.nlp.bert.tokenization

In [None]:
tokenizer = bert.tokenization.FullTokenizer(
    vocab_file='/content/drive/MyDrive/Text-ML/vocab.txt')


In [None]:
print("Vocab size:", len(tokenizer.vocab))


Vocab size: 30522


In [None]:
tokens = tokenizer.tokenize(text_test[0]) 
t_tok=tf.convert_to_tensor(tokens)

indicies=np.ones((len(tokens)),dtype=int)
for i,tok in enumerate(tokens):
  if '##' in tok:
    indicies[i]=0
print(indicies)

tokens[3][2:]
full_words=tokens.copy()
a2=((a-mean)/std).numpy()
ix=-1
for i,tok in enumerate(tokens):
  if not indicies[i]:
    a2[0][ix]+=a2[0][i]
    full_words[ix]+=tok[2:]
  else:
    ix=i

print(full_words)

t_full=tf.boolean_mask(tf.convert_to_tensor(full_words,dtype=tf.string),indicies)
t_a=tf.boolean_mask(tf.convert_to_tensor(a2),[indicies]).numpy()
t_full
    



[1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0
 1 0 0 0]
['in', 'contrast', 'generative', '##tive', 'models', 'are', 'trained', 'to', 'maximize', 'the', 'joint', 'probability', 'of', 'the', 'training', 'used', 'transformationbased', '##base', '##d', 'learning', 'brill', '##ill', '1995', 'which', 'for', 'the', 'present', 'purposes', 'can', 'be', 'tought', '##t', 'of', 'as', 'a', 'classi', '##i', 'cationbased', '##ion', '##base', '##d']


<tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'in', b'contrast', b'generative', b'models', b'are', b'trained',
       b'to', b'maximize', b'the', b'joint', b'probability', b'of',
       b'the', b'training', b'used', b'transformationbased', b'learning',
       b'brill', b'1995', b'which', b'for', b'the', b'present',
       b'purposes', b'can', b'be', b'tought', b'of', b'as', b'a',
       b'classi', b'cationbased'], dtype=object)>

In [None]:
t_full[0].numpy()


"b'in'"

In [None]:
def coloring(text,fore=None,back=None):
    txt=text
    if fore != None:
      txt = "\033[38;2;{};{};{}m".format(fore[0], fore[1], fore[2])+txt
    if back != None:
      txt = "\033[48;2;{};{};{}m".format(back[0], back[1], back[2])+txt
    return txt

print(coloring('Hello',back=(255,0,0)) + coloring('Hello', back=(0,0,255)))

[48;2;255;0;0mHello[48;2;0;0;255mHello


Testing PDF extraction and writing


In [1]:
pip install PyPDF2 #package to read in

Collecting PyPDF2
  Downloading PyPDF2-1.27.12-py3-none-any.whl (80 kB)
[?25l[K     |████                            | 10 kB 19.3 MB/s eta 0:00:01[K     |████████▏                       | 20 kB 6.0 MB/s eta 0:00:01[K     |████████████▎                   | 30 kB 7.8 MB/s eta 0:00:01[K     |████████████████▍               | 40 kB 7.0 MB/s eta 0:00:01[K     |████████████████████▌           | 51 kB 5.6 MB/s eta 0:00:01[K     |████████████████████████▋       | 61 kB 6.5 MB/s eta 0:00:01[K     |████████████████████████████▋   | 71 kB 6.3 MB/s eta 0:00:01[K     |████████████████████████████████| 80 kB 3.2 MB/s 
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-1.27.12


In [3]:
import PyPDF2

In [8]:
path='/content/drive/MyDrive/Text-ML/phocus.pdf'
reader = PyPDF2.PdfFileReader(path)

In [9]:
reader.documentInfo

{'/CreationDate': 'D:20220117013737Z',
 '/Creator': 'LaTeX with acmart  and hyperref 2020-05-15 v7.00e Hypertext links for LaTeX',
 '/ModDate': 'D:20220117013737Z',
 '/PTEX.Fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2',
 '/Producer': 'pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2',
 '/Subject': '',
 '/Title': 'Phocus: Picking Valuable Research from a Sea of Citations',
 '/Trapped': '/False'}

In [10]:
reader.numPages
#reader.getPage(1)

7

In [15]:
reader.getPage(6).extractText() #works ok

"P ho cus: Picking Valuable Research from a Sea of Citations\n, ,\n[14]\nZhengjie Gao, Ao Feng, Xinyu Song, and Xi Wu. 2019. Target-Dep endent Senti-\nment Classi˙cation With BERT.\nIEEE Access\n7 (2019), 154290\x15154299.\n[15]\nBorja Gonzalez-Pereira, Vicente Guerrero-Bote, and Felix Moya-Anegon.\n2009. The SJR indicator: A new indicator of journals' scienti˙c prestige.\narXiv:0912.4141 [cs.DL]\n[16]\nKaiming He, Xiangyu Zhang, Shao qing Ren, and Jian Sun. 2015. De ep Residual\nLearning for Image Re cognition. arXiv:1512.03385 [cs.CV]\n[17]\nJorge E. Hirsch. 2005. An index to quantify an individual's scienti˙c research\noutput.\nPro c. Natl. Acad. Sci. USA\n102 (2005), 16569\x1516572.\n[18]\nMickel Hoang, Oskar Alija Bihorac, and Jacob o Rouces. 2019. Asp e ct-Base d\nSentiment Analysis using BERT. In\nNODALIDA\n.\n[19]\nMinghao Hu, Yuxing Peng, Zhen Huang, Dongsheng Li, and Yiwei Lv. 2019.\nOp en-Domain Targete d Sentiment Analysis via Span-Base d Extraction and Clas-\nsi˙cation. In

In [None]:
writer = PyPDF2.PdfFileWriter()

In [16]:
pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer.six-20220319-py3-none-any.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 7.8 MB/s 
Collecting cryptography
  Downloading cryptography-37.0.1-cp36-abi3-manylinux_2_24_x86_64.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 37.9 MB/s 
Installing collected packages: cryptography, pdfminer.six
Successfully installed cryptography-37.0.1 pdfminer.six-20220319


In [17]:
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

In [32]:
def convert_pdf_to_string(file_path):
	output_string = StringIO()
	with open(file_path, 'rb') as in_file:
	    parser = PDFParser(in_file)
	    doc = PDFDocument(parser)
	    rsrcmgr = PDFResourceManager()
	    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
	    interpreter = PDFPageInterpreter(rsrcmgr, device)
	    for page in PDFPage.create_pages(doc):
	        interpreter.process_page(page)

	return(output_string.getvalue())


In [53]:
text=convert_pdf_to_string(path)
text1 = text.replace('\x0c','')
text2 = text1.split('.\n\n')
refine=[t.replace('\n',' ') for t in text2]

In [60]:
sents=re.split('^(al)',refine[10])

In [61]:
sents

['2.1 Citation Classification In fact, there are already many kinds of research that have focused on citation classification. For example, Teufel et al',
 ' classify citation intents into 12 classes, using simple regular match to ex- tract features. Valenzuela et al',
 ' divide citations into 4 classes: highly influential, background, method and results citations, using SVM with an RBF kernel and random forests, taking 13 features into consideration: total number of direct citations, number of direct cita- tions per section, the total number of indirect citations and number of indirect citations per section, author overlap, is considered help- ful, citation appears in table and caption, 1/number of references, number of paper citations/all citations, the similarity between ab- stracts, PageRank[28], number of total citing papers after transitive closure, and field of the cited paper. While Jurgens et al',
 ' define 7 classes of citation intents: background, motivation, uses, extension,

In [62]:
pip install tabula-py

Collecting tabula-py
  Downloading tabula_py-2.3.0-py3-none-any.whl (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 8.0 MB/s 
Collecting distro
  Downloading distro-1.7.0-py3-none-any.whl (20 kB)
Installing collected packages: distro, tabula-py
Successfully installed distro-1.7.0 tabula-py-2.3.0
