# Loading the data + importing basic packages

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [3]:
## Only needed if running it on Google Colab ##

# import nltk
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

# from google.colab import drive
# drive.mount('/content/drive')

# import os
# os.getcwd()
# os.chdir('/content/drive/MyDrive/raw_data')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
data = pd.read_csv('../raw_data/data_3k.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,id,title,text,topic,subtopic
0,0,https://openalex.org/W2100716186,Large-scale screening of hypothetical metal-or...,PUBLISHED ONLINE: 6 NOVEMBER 2011 | DOI: 10.10...,Chemistry,Nanotechnology
1,1,https://openalex.org/W2109370530,Routing and wavelength assignment in all-optic...,"[H+./ACM TRANSACTIONS ON NETWORKING, VO1. 3, N...",Computer science,Telecommunications
2,2,https://openalex.org/W2131625303,Potassium channel antibody-associated encephal...,"DOI: 10.1093/brain/awh077 Brain (2004), 127, ...",Medicine,Pediatrics
3,3,https://openalex.org/W2131736388,"MODBASE, a database of annotated comparative p...","D336–D346 Nucleic Acids Research, 2014, Vol. 4...",Computer science,Bioinformatics
4,4,https://openalex.org/W2144966005,Regional brain volume abnormalities and long-t...,ORIGINAL CONTRIBUTION Regional Brain Volume A...,Medicine,Internal medicine


## 1. Preprocessing

In [5]:
data = data.drop(columns='Unnamed: 0')

In [6]:
import string
from nltk.corpus import stopwords 
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))

def preprocessing(pdf):
    # Turn all text to lowercase
    pdf = str(pdf).lower()
    
    # Removing anything apart from lower case letters
    pdf = re.sub(r'[^a-z]', ' ', str(pdf))
    
    # Removing everything before the abstract
    pdf = re.sub(r"^.+?(?=abstract)", "", str(pdf))
    pdf = pdf.lstrip('abstract')
    pdf = pdf.lstrip()
    
    # Removing single letter words
    pdf = ' '.join(w for w in pdf.split() if len(w)>1)
    
    # Removing whitespace
    pdf = pdf.strip()

    # To save computational costs, we will implement a longest_token_size here
    longest_token_size = 1000 
    num_of_words_to_keep = int(longest_token_size*5) # adding some margin, because in spotword removal and word2vec embedding some will be removed
    pdf = ' '.join(pdf.split()[:num_of_words_to_keep])
    
    # Turn words into numerical tokens and remove stopwords
    word_tokens = word_tokenize(pdf)
    pdf = [w for w in word_tokens if not w in stop_words]
    
    # Lemmatizing words(Turning each verb and noun into its core)
    pdf = [WordNetLemmatizer().lemmatize(word, pos = "v") for word in pdf]
    pdf = [WordNetLemmatizer().lemmatize(word, pos = "n") for word in pdf]
    return ' '.join(pdf)

In [7]:
data['clean_text'] = data['text'].map(lambda x: preprocessing(x))
data

Unnamed: 0,id,title,text,topic,subtopic,clean_text
0,https://openalex.org/W2100716186,Large-scale screening of hypothetical metal-or...,PUBLISHED ONLINE: 6 NOVEMBER 2011 | DOI: 10.10...,Chemistry,Nanotechnology,publish online november doi nchem article larg...
1,https://openalex.org/W2109370530,Routing and wavelength assignment in all-optic...,"[H+./ACM TRANSACTIONS ON NETWORKING, VO1. 3, N...",Computer science,Telecommunications,acm transaction network vo october rout wavele...
2,https://openalex.org/W2131625303,Potassium channel antibody-associated encephal...,"DOI: 10.1093/brain/awh077 Brain (2004), 127, ...",Medicine,Pediatrics,doi brain awh brain potassium channel antibody...
3,https://openalex.org/W2131736388,"MODBASE, a database of annotated comparative p...","D336–D346 Nucleic Acids Research, 2014, Vol. 4...",Computer science,Bioinformatics,modbase http salilab org modbase database anno...
4,https://openalex.org/W2144966005,Regional brain volume abnormalities and long-t...,ORIGINAL CONTRIBUTION Regional Brain Volume A...,Medicine,Internal medicine,original contribution regional brain volume ab...
...,...,...,...,...,...,...
2915,https://openalex.org/W2122538288,Tissue-specific expression and regulation of s...,"The Plant Cell, Vol. 5, 9-23, January 1993...",Biology,Genetics,plant cell vol january american society plant ...
2916,https://openalex.org/W2133765154,Interleukin-23 drives innate and T cell–mediat...,Downloaded from genesdev.cshlp.org on Sept...,Medicine,Immunology,download genesdev cshlp org september publish ...
2917,https://openalex.org/W2142740946,Isolation with Migration Models for More Than ...,Energy & Environmental Science PAPER Entropi...,Biology,Evolutionary biology,energy environmental science paper entropic st...
2918,https://openalex.org/W2152560313,The Impact of Electronic Health Records on Tim...,"Coffelt, S. B., Wellenstein, M. D., and de Vis...",Medicine,Medical emergency,offelt wellenstein de visser neutrophil cancer...


**Filtering the topics and the subtopics**

In [8]:
# Filtering for those topics that occurr more commonly in our data        <----- percentile can be adjusted
common_topics = (data['topic'].value_counts() > np.percentile(data['topic'].value_counts(), 25)) # topic occurrence until 25th percentile
filtered_topics = common_topics[common_topics == True].index

# Filtering for those subtopics that occurr more commonly in our data     <----- instead of cutting of at the mean, can also be adjusted like above
# common_subtopics = (data['subtopic'].value_counts() > data['subtopic'].value_counts().mean())
# filtered_subtopics = common_subtopics[common_subtopics == True].index

# Filtering data according to the topics and subtopics that are more common
data = data[data['topic'].isin(list(filtered_topics))]

In [9]:
data['topic'].value_counts()

Biology                  940
Medicine                 701
Chemistry                259
Psychology               235
Computer science         231
Physics                  113
Materials science        102
Mathematics               78
Environmental science     70
Economics                 52
Business                  42
Geology                   39
Geography                 21
Sociology                 13
Name: topic, dtype: int64

In [10]:
num_of_topics = common_topics.value_counts()[1]
num_of_topics

14

## 2. Creating the models

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from tensorflow.keras import layers, Sequential

In [12]:
topic_df = data.copy()

**Checking the subtopics for each topic**

In [13]:
from collections import Counter

print(Counter(data[data['topic'] == 'Physics']['subtopic']))

Counter({'Astronomy': 32, 'Quantum mechanics': 8, 'Classical mechanics': 7, 'Astrophysics': 6, 'Theoretical physics': 5, 'Neuroscience': 4, 'Computational physics': 4, 'Mechanics': 4, 'Condensed matter physics': 4, 'Statistical physics': 3, 'Nuclear physics': 3, 'Optics': 3, 'Geometry': 2, 'Systems engineering': 2, 'Atomic physics': 2, 'Mathematical physics': 2, 'Astrobiology': 2, 'Quantum electrodynamics': 2, 'Particle physics': 2, 'Biological system': 2, 'Chemical physics': 1, 'Electrical engineering': 1, 'Simulation': 1, 'Applied mathematics': 1, 'Electronic engineering': 1, 'Nuclear medicine': 1, 'Optoelectronics': 1, 'Nanotechnology': 1, 'Molecular physics': 1, 'Physical medicine and rehabilitation': 1, 'Geochemistry': 1, 'Pure mathematics': 1, 'Meteorology': 1, 'Thermodynamics': 1})


### 2.1 Creating the main model for the topics

**Encoding the target, splitting the data into train/test and padding**

In [14]:
enc = OneHotEncoder(sparse = False, handle_unknown='ignore')
y_topic = enc.fit_transform(topic_df[['topic']])
new_column_names = enc.get_feature_names_out()
y_topic = pd.DataFrame(y_topic)
y_topic.columns = new_column_names

In [15]:
X = topic_df['clean_text']

X_topic_train, X_topic_test, y_topic_train, y_topic_test = train_test_split(
    X, y_topic, test_size=0.3, random_state=1)

**Rebalancing the dataset by giving each category weighting based on its occurrence**

In [16]:
from sklearn.utils.class_weight import compute_class_weight

y_integers = np.argmax(np.array(y_topic_train), axis=1)
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_integers), y=y_integers)
y_topic_class_weights = dict(enumerate(class_weights))
y_topic_class_weights

{0: 0.2197051810101886,
 1: 4.670506912442396,
 2: 0.7868788819875776,
 3: 0.8618197278911565,
 4: 3.619642857142857,
 5: 3.080547112462006,
 6: 20.683673469387756,
 7: 4.9926108374384235,
 8: 1.9833659491193738,
 9: 2.5854591836734695,
 10: 0.30038529934795494,
 11: 1.7656794425087108,
 12: 0.8992901508429458,
 13: 18.098214285714285}

In [17]:
tk = Tokenizer()
tk.fit_on_texts(X_topic_train)
sequences = tk.texts_to_sequences(X_topic_train)

In [18]:
topic_vocab_size = len(tk.word_index)
X_pad_topic = pad_sequences(sequences, dtype='float32', padding='post', maxlen=1500)
print(X_pad_topic.shape, topic_vocab_size)

(2027, 1500) 98258


**Building the Recurrent neural network**

In [29]:
# Size of your embedding space = size of the vector representing each word
embedding_size = 50

topic_model = Sequential()
topic_model.add(layers.Embedding(
    input_dim=topic_vocab_size+1,
    output_dim=embedding_size,
    mask_zero=True, # Built-in masking layer :)
))

topic_model.add(layers.LSTM(20))
topic_model.add(layers.Dense(num_of_topics, activation="softmax"))
topic_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 50)          4912950   
                                                                 
 lstm_2 (LSTM)               (None, 20)                5680      
                                                                 
 dense_2 (Dense)             (None, 14)                294       
                                                                 
Total params: 4,918,924
Trainable params: 4,918,924
Non-trainable params: 0
_________________________________________________________________


**Compiling and training the model**

In [20]:
# Defining the F1 score manually
from keras import backend as K
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall
def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision
def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [21]:
y_topic_train.shape

(2027, 14)

In [30]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience=5,restore_best_weights=True, monitor='val_accuracy')

topic_model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy',f1_m])

history = topic_model.fit(X_pad_topic, y_topic_train, 
                          class_weight=y_topic_class_weights, 
                          epochs=20, validation_split=0.3, batch_size=32, 
                          verbose=1, callbacks=[es])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


**Padding the test set and evaluating it**

In [31]:
sequences2 = tk.texts_to_sequences(X_topic_test)
X_pad_topic2 = pad_sequences(sequences2, dtype='float32', padding='post')

In [32]:
topic_model.evaluate(X_pad_topic2,y_topic_test)



[2.612478017807007, 0.11737629771232605, 0.0]

### 2.2 Creating the second model for the subtopics 