In [None]:
import nltk
# Fetch the 'punkt' tokenizer data at notebook start-up; presumably this is what
# nltk.tokenize.word_tokenize (used by preprocess_text) relies on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
# Single Porter stemmer instance used by preprocess_text for every token.
ps = PorterStemmer()
# Words that preprocess_text drops before stemming.
# NOTE(review): preprocess_text strips every character except a-z, space and '-'
# before the stop-word filter runs, so the entries 'paper,' and 'i.e.' can never
# match a token there.
stopwords=['the','of','and',  'in',  'is',  'to',  'for',  'we',  'are',  'that',  'with',  'this',  'on',  'by',  'an',  'as',  'be',  'which',  
'it',  'problem',  'two',  'from',  'can',  'results',  'paper',  'method',  'these',  'some',  'also',  'model',  'based',  'at',  'one',  
'show',  'such',  'using',  'or',  'has',  'time',  'system',  'order',  'new',  'solution',  'not',  'have',  'set',  'function',  'if',  'all',  
'our',  'finite',  'space',  'algorithm',  'its',  'number',  'solutions',  'problems',  'used',  'between',  'given',  'equations',  'where',  
'under',  'prove',  'functions',  'proposed',  'non',  'case',  'when',  'paper,',  'conditions',  'their',  'then',  'dimensional',  'class',  
'first',  'theory',  'general',  'well', 'other', 'models', 'may',  'were', 'they', 'so', 'et',   'al', 'no', 'very', 'those',   'due', 'however', 
'di',    'moreover', 'here', 'i.e.']
def preprocess_text(text):
    """Turn a raw text field into a space-separated string of word stems.

    Processing order: remove LaTeX math, URLs, bracketed and braced spans and
    begin/end environments; lower-case; delete every character that is not
    a-z, a space or a hyphen; tokenize; drop stop words; Porter-stem; drop
    stems shorter than two characters.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned text as a single string with stems separated by one space.
    """
    # Raw strings: the patterns are unchanged, but no longer depend on Python
    # passing unknown string escapes such as "\$" or "\[" through untouched
    # (a SyntaxWarning on recent Python versions).
    text = re.sub(r"\$\$.*?\$\$", "", text)       # display math $$ ... $$
    text = re.sub(r"http[^ ]*", "", text)         # URLs up to the next space
    text = re.sub(r"\$.*?\$", "", text)           # inline math $ ... $
    text = re.sub(r"\\\(.*?\\\)", "", text)       # inline math \( ... \)
    text = re.sub(r"\\\[.*?\\\]", "", text)       # display math \[ ... \]
    text = re.sub(r"\[.*?\]", "", text)           # bracketed spans
    text = re.sub(r"{.*?}", "", text)             # braced spans
    # Runs after the brace removal, so "\begin{env} ... \end{env}" has already
    # been reduced to "\begin ... \end" by the time this pattern is applied.
    text = re.sub(r"\\begin.*?\\end", "", text)
    text = text.lower()
    # NOTE(review): characters are deleted, not replaced by a space, so words
    # separated only by a newline, tab or punctuation mark are fused together.
    text = re.sub(r"[^a-z \-]", "", text)

    # Set membership instead of scanning the stop-word list once per token.
    stopword_set = set(stopwords)
    stems = [ps.stem(word) for word in word_tokenize(text) if word not in stopword_set]
    return ' '.join(stem for stem in stems if len(stem) >= 2)
def preprocess_keywords(text):
    """Normalise a keyword string: lower-case, letters/spaces/hyphens only.

    Args:
        text: The raw keyword string.

    Returns:
        The lower-cased string with every character other than a-z, space and
        hyphen deleted, and every space-separated piece shorter than two
        characters removed.
    """
    text = text.lower()
    # Raw string: same pattern as before, without the invalid "\-" string escape.
    # NOTE(review): separators such as ',' or ';' are deleted rather than turned
    # into spaces, so "a,b" becomes "ab".
    text = re.sub(r"[^a-z \-]", "", text)
    return ' '.join(item for item in text.split(' ') if len(item) >= 2)

In [None]:
import json


def _read_jsonl(path):
    """Read a JSON-Lines file and return its records as a list.

    Args:
        path: Path of a text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.
    """
    # Explicit UTF-8: the decoding no longer depends on the platform's locale.
    with open(path, "rt", encoding="utf-8") as infile:
        # Blank lines (e.g. a trailing newline at end of file) are skipped.
        return [json.loads(line) for line in infile if line.strip()]


train = _read_jsonl("springer_train_processed.jsonl")
valid = _read_jsonl("springer_valid_processed.jsonl")
test = _read_jsonl("springer_test_processed.jsonl")
# Sorted so the category order is the same on every run; iterating a set of
# strings directly gives an order that can change between kernel sessions.
journals = sorted({item["journal"] for item in train})

In [None]:

import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
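# Feature-extraction helpers.
# `dataset` is the data: a list of records (dicts), each with a 'journal' label.
# `feature` is the key of the text field to use, e.g. one of abstract, title, keywords.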
def _feature_names(vectorizer):
    """Return the fitted vocabulary terms of ``vectorizer`` as a list, in column order."""
    # get_feature_names_out() is the current scikit-learn API; fall back to the
    # older get_feature_names() when running on a release that predates it.
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


def chi(dataset, feature, categories):
    """Compute a chi-square score for every term of every category.

    For each category the documents are split into those whose 'journal' equals
    the category and all others. Using binary (presence/absence) term counts,
    with the vocabulary taken from the in-category documents:

        A = in-category documents containing the term
        B = other documents containing the term
        C = in-category documents without the term
        D = other documents without the term

        chi2 = (A+B+C+D) * (A*D - B*C)**2 / ((A+C) * (B+D) * (A+B) * (C+D))

    Args:
        dataset: Sequence of records (dicts) with a 'journal' key and ``feature``.
        feature: Key of the text field to analyse.
        categories: Iterable of journal names to score.

    Returns:
        A list with one entry per category, in the order of ``categories``; each
        entry is a list of ``(term, chi2)`` tuples for that category's vocabulary.
    """
    vocabulary = []
    n_total = len(dataset)

    for category in categories:
        doc_cate, doc_not_cate = [], []
        for item in dataset:
            if item['journal'] == category:
                doc_cate.append(item[feature])
            else:
                doc_not_cate.append(item[feature])

        counter_cate = CountVectorizer(binary=True, lowercase=True, ngram_range=(1, 1))
        X_dtm = counter_cate.fit_transform(doc_cate)
        terms = _feature_names(counter_cate)
        # Count the same terms, in the same column order, in the other documents.
        counter_not_cate = CountVectorizer(binary=True, lowercase=True, ngram_range=(1, 1),
                                           vocabulary=terms)

        # Column sums are taken on the sparse matrices directly; converting the
        # whole document-term matrix to a dense array first is not needed.
        A = np.asarray(X_dtm.sum(axis=0), dtype=np.float64).ravel()
        B = np.asarray(counter_not_cate.fit_transform(doc_not_cate).sum(axis=0),
                       dtype=np.float64).ravel()
        n_item_cate = len(doc_cate)
        C = n_item_cate - A
        D = (n_total - n_item_cate) - B

        numerator = (A + B + C + D) * (A * D - B * C) ** 2
        denominator = (A + C) * (B + D) * (A + B) * (C + D)
        # A zero denominator (term present in every document, or no documents
        # outside the category) would give 0/0 = NaN; score such terms 0 instead.
        result = np.divide(numerator, denominator,
                           out=np.zeros_like(numerator), where=denominator > 0)

        vocabulary.append(list(zip(terms, result)))
    return vocabulary

def get_top_vocabulary(categories, chisquares, topk=50, threshold=200):
    """Select the highest-scoring terms of every category.

    Args:
        categories: Sequence of categories; only its length is used, to decide
            how many leading entries of ``chisquares`` are read.
        chisquares: ``chisquares[i]`` is an iterable of ``(word, value)`` pairs
            for the i-th category.
        topk: Maximum number of terms taken per category, highest value first.
        threshold: A term is kept only if its value is strictly greater.

    Returns:
        The union of the selected terms over all categories, without
        duplicates, as an alphabetically sorted list.
    """
    selected = set()
    for ii in range(len(categories)):
        ranked = sorted(chisquares[ii], key=lambda item: item[1], reverse=True)[:topk]
        selected.update(word for word, val in ranked if val > threshold)
    # Sorted rather than list(set(...)): the result fixes the column order of the
    # downstream feature matrix, so it must not depend on set iteration order.
    return sorted(selected)


In [None]:
def tf_idf(train, valid, test, feature, vocabuary):
    """Build dense TF-IDF matrices for the three data splits.

    A unigram ``TfidfVectorizer`` restricted to ``vocabuary`` is fitted on the
    training texts only and then applied to every split.

    Args:
        train, valid, test: Sequences of records (dicts) containing ``feature``.
        feature: Key of the text field to vectorize (a string).
        vocabuary: Terms that define the feature columns.

    Returns:
        ``(train_matrix, valid_matrix, test_matrix)`` as dense arrays.
    """
    vectorizer = TfidfVectorizer(vocabulary=vocabuary, ngram_range=(1, 1), dtype=np.float32)

    def texts_of(split):
        # Pull the requested text field out of every record of a split.
        return [record[feature] for record in split]

    vectorizer.fit(texts_of(train))

    dense_train = vectorizer.transform(texts_of(train)).toarray()
    dense_test = vectorizer.transform(texts_of(test)).toarray()
    dense_valid = vectorizer.transform(texts_of(valid)).toarray()
    return dense_train, dense_valid, dense_test

In [None]:
import pickle
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load a chi-square cache that you produced yourself.
with open("springer_abstract_chisquares.pickle", "rb") as infile:
    chisquares = pickle.load(infile)

# threshold=0.0 keeps every top-50 term with a positive score.
vocab = get_top_vocabulary(journals, chisquares, topk=50, threshold=0.0)
# A bare `len(vocab)` in the middle of a cell is evaluated and thrown away;
# print it so the vocabulary size actually shows up in the cell output.
print("vocabulary size:", len(vocab))
train_abstract, valid_abstract, test_abstract = tf_idf(train, valid, test, "abstract", vocab)

In [None]:
# Journal names in lower case. A journal's position in this list is the integer
# class id assigned to it by journal_to_idx, so the order must not be changed.
springer_labels = [
  'environmental modeling & assessment',
  'korean journal of computational and applied mathematics',
  'annali dell’università di ferrara',
  'differential equations and dynamical systems',
  'journal of applied and industrial mathematics',
  'set-valued analysis',
  'geometric & functional analysis gafa',
  'computational particle mechanics',
  'information systems frontiers',
  'computational and applied mathematics',
  'semigroup forum',
  'moscow university mathematics bulletin',
  'computational mechanics',
  'educational studies in mathematics',
  'applied mathematics',
  'memetic computing',
  'nonrenewable resources',
  'telecommunication systems',
  'annals of operations research',
  'journal of automated reasoning',
  'quarterly journal of the belgian, french and italian operations research societies',
  'integral equations and operator theory',
  'computing and visualization in science',
  'allgemeines statistisches archiv',
  'logica universalis',
  'proceedings of the steklov institute of mathematics',
  'acta applicandae mathematica',
  'fuzzy optimization and decision making',
  'evolutionary intelligence',
  'journal of geometry',
  'rendiconti del circolo matematico di palermo',
  'opsearch',
  'mathematics in computer science',
  'automation and remote control',
  'top',
  'bulletin of the malaysian mathematical sciences society',
  'mathematical models and computer simulations',
  'journal of optimization theory and applications',
  'minds and machines',
  'mathematics of control, signals and systems',
  'journal of soviet mathematics',
  'queueing systems',
  'racsam - revista de la real academia de ciencias exactas, fisicas y naturales. serie a. matematicas',
  'calcolo',
  'potential analysis',
  'doklady mathematics',
  'inventiones mathematicae',
  "publications mathématiques de l'institut des hautes études scientifiques",
  'operations-research-spektrum',
  'nonlinear differential equations and applications nodea',
  'p-adic numbers, ultrametric analysis, and applications',
  'sema journal',
  'journal of fourier analysis and applications',
  'arabian journal of mathematics',
  'analysis and mathematical physics',
  'mediterranean journal of mathematics',
  'computational optimization and applications',
  'siberian advances in mathematics',
  'journal of algebraic combinatorics',
  'the journal of the astronautical sciences',
  'general relativity and gravitation',
  'environmentalist',
  'foundations of computational mathematics',
  'revista matemática complutense',
  'science in china series a: mathematics',
  'annals of combinatorics',
  'ricerche di matematica',
  'numerical algorithms',
  'structural optimization',
  'journal of theoretical probability',
  'algebra and logic',
  'algebra universalis',
  'theoretical and mathematical physics',
  'russian mathematics',
  'communications in mathematical physics',
  'mathematical programming computation',
  'journal of global optimization',
  'annali di matematica pura ed applicata',
  'letters in mathematical physics',
  'jahresbericht der deutschen mathematiker-vereinigung',
  'statistical inference for stochastic processes',
  'zdm',
  'calculus of variations and partial differential equations',
  'journal of control theory and applications',
  'statistische hefte',
  'vietnam journal of mathematics',
  'mathematical programming',
  'energy systems',
  'boletín de la sociedad matemática mexicana',
  'journal of evolution equations',
  'journal of nonlinear science',
  'international journal of applied and computational mathematics',
  'lobachevskii journal of mathematics',
  'mathematics and financial economics',
  'complex analysis and operator theory',
  'computational statistics',
  'metrika',
  'computational complexity',
  'unternehmensforschung',
  'annales des télécommunications',
  'foundations of science',
  'experimental economics',
  'optimization and engineering',
  'operational research',
  'computational geosciences',
  'journal of fixed point theory and applications',
  'discrete event dynamic systems',
  'advances in applied clifford algebras',
  'collectanea mathematica',
  'computational methods and function theory',
  'international journal of game theory',
  'rendiconti del seminario matematico e fisico di milano',
  'combinatorica',
  'computational mathematics and mathematical physics',
  'acta mathematica sinica',
  'annals of finance',
  'journal of combinatorial optimization',
  'neural computing & applications',
  'japan journal of applied mathematics',
  'mathematical sciences',
  'dynamic games and applications',
  'cryptography and communications',
  'constraints',
  'advances in computational mathematics',
  'analysis mathematica',
  'applied mathematics and mechanics',
  'engineering with computers',
  'beiträge zur algebra und geometrie / contributions to algebra and geometry',
  'journal of engineering mathematics',
  'journal d’analyse mathématique',
  'european actuarial journal',
  'journal of scheduling',
  'annals of mathematics and artificial intelligence',
  'mathematische zeitschrift',
  'international journal of fuzzy systems',
  'journal of scientific computing',
  'zeitschrift für nationalökonomie',
  'modeling earth systems and environment',
  'numerische mathematik',
  'journal of dynamical and control systems',
  'theoretical and computational fluid dynamics',
  'interdisciplinary sciences: computational life sciences',
  'acta mathematica vietnamica',
  'journal of statistical theory and practice',
  'soviet applied mechanics',
  'discrete & computational geometry',
  'the ramanujan journal',
  'positivity',
  'mathematische annalen',
  'qualitative theory of dynamical systems',
  'regular and chaotic dynamics',
  'journal of cryptology',
  'israel journal of mathematics',
  'journal of mathematical biology',
  'social network analysis and mining',
  'results in mathematics',
  'journal of heuristics',
  'annales henri poincaré',
  'journal of systems science and complexity',
  'multibody system dynamics',
  'soft computing',
  'mathematical physics, analysis and geometry',
  'journal of mathematical imaging and vision',
  'selecta mathematica',
  'kn - journal of cartography and geographic information',
  'journal of dynamics and differential equations',
  'periodica mathematica hungarica',
  'computational management science',
  'journal of the operations research society of china',
  "bollettino dell'unione matematica italiana",
  'siberian mathematical journal',
  'numerical analysis and applications',
  'the journal of geometric analysis',
  'journal of quantitative economics',
  'computational mathematics and modeling',
  'mathematical notes of the academy of sciences of the ussr',
  'european journal of mathematics',
  'transformation groups',
  'cybernetics',
  'quantum information processing',
  'monatshefte für mathematik und physik',
  'afrika matematika',
  'archiv für mathematische logik und grundlagenforschung',
  'optimization letters',
  'economic theory bulletin',
  'constructive approximation',
  'functional analysis and its applications',
  'theory in biosciences',
  'journal of pseudo-differential operators and applications',
  'stochastic hydrology and hydraulics',
  'moscow university computational mathematics and cybernetics',
  'theory and decision',
  'vestnik st. petersburg university: mathematics',
  'bit numerical mathematics',
  'applied mathematics and optimization',
  'celestial mechanics']

# Map each journal name to its position in springer_labels (its class id).
journal_to_idx = dict(zip(springer_labels, range(len(springer_labels))))


def _label_ids(records):
    """Return the class ids of ``records`` as an array, looked up by lower-cased journal name."""
    return np.asarray([journal_to_idx[record["journal"].lower()] for record in records])


y_train = _label_ids(train)
y_valid = _label_ids(valid)
y_test = _label_ids(test)

In [None]:
import numpy as np

def get_accuracy(y_true, y_pred):
    """Compute top-1, top-3, top-5 and top-10 accuracy.

    Args:
        y_true: Integer class ids, one per sample.
        y_pred: 2-D array of per-class scores, one row per sample; a higher
            score means a more likely class.

    Returns:
        ``[top1, top3, top5, top10]``: for each k, the fraction of samples whose
        true class is among the k highest-scoring classes. When there are fewer
        than k classes, all classes count as the top k.
    """
    labels = np.asarray(y_true).reshape(-1, 1)
    # Rank the classes once, best first, instead of re-sorting for every k.
    ranked = np.argsort(y_pred, axis=-1)[:, ::-1]

    accuracies = []
    for k in (1, 3, 5, 10):
        # Slicing clamps to the number of classes, so the comparison always
        # broadcasts (n, min(k, n_classes)) against (n, 1).
        hits = (ranked[:, :k] == labels).any(axis=1)
        accuracies.append(np.mean(hits))
    return accuracies
def softmax_func(logits):
    """Apply a row-wise softmax to a 2-D array of logits.

    Args:
        logits: Array of shape (n_samples, n_classes).

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    logits = np.asarray(logits)
    # Subtracting the row maximum leaves the softmax unchanged mathematically
    # but keeps exp() from overflowing to inf (and the result from becoming NaN)
    # when logits are large.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)

# Per-model class scores for the validation and test splits; the lists follow
# the model order of the loop below.
valid_softmaxes = []
test_softmaxes = []
features = "abstract"
# Outputs of these models are passed through softmax_func after loading.
# NOTE(review): presumably they were saved as raw logits while the scibert
# outputs were saved already normalised — confirm against the export code.
RAW_LOGIT_MODELS = ('xlnet', 'electra')
for model in ['scibert', 'xlnet', 'electra']:
    # One code path for both splits instead of a copy-pasted load per split.
    for split, collected in (("valid", valid_softmaxes), ("test", test_softmaxes)):
        softmax = np.load("softmaxes/springer-{}-{}-{}.npy".format(model, features, split)).astype(np.float32)
        if model in RAW_LOGIT_MODELS:
            softmax = softmax_func(softmax)
        collected.append(softmax)
    

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Lambda, Concatenate, Reshape, Multiply
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

# Feed-forward classifier on the TF-IDF abstract features: one hidden ReLU layer
# followed by a softmax over the journals.
input_tfidf = Input(shape=(train_abstract.shape[1], ))
xx = Dense(units=300, activation='relu')(input_tfidf)
# One output unit per journal. The width is derived from the label list (it was
# hard-coded as 196) so it cannot drift from the class ids in y_train/y_valid.
transfer_softmax = Dense(units=len(springer_labels), activation='softmax')(xx)
transfer_model = Model(inputs=[input_tfidf], outputs=transfer_softmax)

# Sparse loss: the targets are integer class ids, not one-hot vectors.
transfer_model.compile(optimizer = tf.keras.optimizers.Adam(),
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

transfer_model.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        [(None, 7534)]            0         
_________________________________________________________________
dense_12 (Dense)             (None, 300)               2260500   
_________________________________________________________________
dense_13 (Dense)             (None, 196)               58996     
Total params: 2,319,496
Trainable params: 2,319,496
Non-trainable params: 0
_________________________________________________________________


In [None]:
class ModelCheckpoint(tf.keras.callbacks.Callback):
    """Early stopping on validation accuracy that restores the best weights.

    After every epoch the validation accuracy is compared with the best value
    seen so far. A value that is at least as good stores a copy of the model
    weights and resets the patience counter; otherwise the counter grows, and
    once it exceeds ``max_no_improvements`` training is stopped. When training
    ends, the stored best weights are loaded back into the model.

    Note: this class has the same name as ``tf.keras.callbacks.ModelCheckpoint``
    but does not write anything to disk.

    Args:
        max_no_improvements: Number of consecutive non-improving epochs that is
            tolerated; training stops on the next one.
    """

    # Keras reports the validation accuracy under "val_acc" in older releases
    # and under "val_accuracy" in newer ones (for metrics=["accuracy"]).
    _VAL_ACC_KEYS = ("val_acc", "val_accuracy")

    def __init__(self, max_no_improvements=20):
        super(ModelCheckpoint, self).__init__()
        self.best_val_acc = 0.0
        self.best_weights = None
        self.max_no_improvements = max_no_improvements
        self.n_no_improvements = 0

    def on_epoch_end(self, epoch, logs=None):
        """Track the best validation accuracy and stop when patience runs out.

        Raises:
            KeyError: If ``logs`` contains no validation-accuracy entry.
        """
        logs = logs or {}
        for key in self._VAL_ACC_KEYS:
            if key in logs:
                val_acc = logs[key]
                break
        else:
            raise KeyError(
                "no validation accuracy in logs (expected one of {}); got {}".format(
                    self._VAL_ACC_KEYS, sorted(logs)))

        # ">=" means a tie with the best value keeps the later epoch's weights.
        if self.best_weights is None or val_acc >= self.best_val_acc:
            self.best_weights = self.model.get_weights()
            self.best_val_acc = val_acc
            self.n_no_improvements = 0
        else:
            self.n_no_improvements += 1
            if self.n_no_improvements > self.max_no_improvements:
                self.model.stop_training = True

    def on_train_end(self, logs=None):
        """Load the best weights back into the model, if any were recorded."""
        # best_weights is still None when training ended before the first epoch
        # completed; set_weights(None) would fail in that case.
        if self.best_weights is not None:
            self.model.set_weights(self.best_weights)

Train on 193892 samples, validate on 14945 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000


<tensorflow.python.keras.callbacks.History at 0x7f7b77429ac8>

In [None]:
# epochs=1000 is only an upper bound: the ModelCheckpoint callback ends the run
# once validation accuracy stops improving.
transfer_model.fit(
    x=[train_abstract],
    y=y_train,
    batch_size=128,
    epochs=1000,
    validation_data=([valid_abstract], y_valid),
    callbacks=[ModelCheckpoint()],
)
# get_accuracy(y_train, transfer_model.predict([train_abstract]))