# First we have to prepare the dataset

**Importing necessary packages and loading the data**

In [1]:
# Enable IPython's autoreload extension in mode 2.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
## Only needed if running it on Google Colab

# from google.colab import drive
# drive.mount('/content/drive')

# import os
# os.getcwd()
# os.chdir('/content/drive/MyDrive/raw_data')

Mounted at /content/drive


In [4]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../raw_data/small_dataset.csv')
# Preview the first rows to check what was loaded.
data.head()

Unnamed: 0.1,Unnamed: 0,0,subtopic
0,0,2 2 0 2 g u A 6 2 ] E H . h p - o r t s a [ ...,Astrophysics
1,1,"Draft version August 29, 2022 Typeset using LA...",Astrophysics
2,2,Astronomy & Astrophysics manuscript no. 41891c...,Astrophysics
3,3,Astronomy & Astrophysics manuscript no. aa Aug...,Astrophysics
4,4,2 2 0 2 g u A 6 2 ] R S . h p - o r t s a [...,Astrophysics


## 1. Preprocessing the data

**Check for duplicates**

In [5]:
# Flag rows that are exact copies of an earlier row, then count the flags.
is_repeated_row = data.duplicated()
duplicate_count = is_repeated_row.sum()
duplicate_count

0

**Check for missing data**

In [6]:
# Share of missing values per column, columns with the most gaps first.
missing_per_column = data.isna().sum()
missing_per_column.sort_values(ascending=False) / len(data)

Unnamed: 0    0.0
0             0.0
subtopic      0.0
dtype: float64

**Rename columns and remove useless columns**

In [7]:
# Drop the leftover index column from the CSV export and give the text column
# (named '0' in the file) a descriptive name.
data = (
    data
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'0': 'pdf_content'})
)

In [8]:
# Display the frame after dropping and renaming columns.
data

Unnamed: 0,pdf_content,subtopic
0,2 2 0 2 g u A 6 2 ] E H . h p - o r t s a [ ...,Astrophysics
1,"Draft version August 29, 2022 Typeset using LA...",Astrophysics
2,Astronomy & Astrophysics manuscript no. 41891c...,Astrophysics
3,Astronomy & Astrophysics manuscript no. aa Aug...,Astrophysics
4,2 2 0 2 g u A 6 2 ] R S . h p - o r t s a [...,Astrophysics
...,...,...
2030,2 2 0 2 g u A 3 2 ] A F . h t a m [ 1 v 5...,Symplectic Geometry
2031,UNIVERSIDAD COMPLUTENSE DE MADRID FACULTAD DE...,Symplectic Geometry
2032,2 2 0 2 g u A 0 2 ] G S . h t a m [ 1 v 4...,Symplectic Geometry
2033,2 2 0 2 g u A 6 1 ] G S . h t a m [ 1 v 4...,Symplectic Geometry


In [None]:
# data['pdf_content'][0]

**Removing whitespace from the start and the end of each string**

In [10]:
# Strip leading/trailing whitespace from every document.
# Assigning the whole column at once avoids the chained-indexing assignment
# `data['pdf_content'][i] = ...`, which raises SettingWithCopyWarning and does
# not modify the frame when pandas copy-on-write is enabled.
data['pdf_content'] = data['pdf_content'].str.strip()

**Turning every letter into lowercase**

In [11]:
# Lowercase every document.
# Whole-column assignment instead of chained indexing (`data[col][i] = ...`),
# which is unreliable in pandas and a no-op under copy-on-write.
data['pdf_content'] = data['pdf_content'].str.lower()

**Removing digits**

In [12]:
# Remove every character for which str.isdigit() is true.
# The per-document function is applied with .map and the result is assigned to
# the whole column, instead of writing through chained indexing
# (`data[col][i] = ...`), which pandas does not guarantee to work.
data['pdf_content'] = data['pdf_content'].map(
    lambda text: ''.join(char for char in text if not char.isdigit())
)

**Removing punctuation**

In [13]:
# Delete every character listed in string.punctuation.
# A single translation table replaces the nested loop (one str.replace per
# punctuation mark per document), and the whole-column assignment avoids
# chained indexing (`data[col][i] = ...`).
punctuation_table = str.maketrans('', '', string.punctuation)
data['pdf_content'] = data['pdf_content'].str.translate(punctuation_table)

**Removing single letter words**

In [14]:
# Keep only whitespace-separated words longer than one character; the words
# are re-joined with single spaces.
# Whole-column assignment instead of chained indexing (`data[col][i] = ...`).
data['pdf_content'] = data['pdf_content'].map(
    lambda text: ' '.join(word for word in text.split() if len(word) > 1)
)

**Removing all of the text before the word "abstract", because the actual content of every PDF starts with its abstract**

In [15]:
# Remove everything up to and including the first occurrence of "abstract",
# then strip the whitespace that follows. Documents that do not contain the
# word are only left-stripped.
#
# Two fixes compared with the previous loop:
#   * str.lstrip('abstract') strips any leading run of the characters
#     {a, b, s, t, r, c}, not the literal prefix, so it could also eat the
#     first letters of the real text. The word is now removed by the regex.
#   * The result is assigned to the whole column instead of through chained
#     indexing (`data[col][i] = ...`).
data['pdf_content'] = (
    data['pdf_content']
    .str.replace(r'^.*?abstract', '', n=1, regex=True)
    .str.lstrip()
)

**This regular expression replaces every character that is not a lowercase letter (a-z) with a space; in this case it is used to remove mathematical symbols**

In [16]:
# Replace every character outside a-z with a space.
# Vectorised regex replace on the whole column instead of chained indexing
# (`data[col][i] = ...`).
data['pdf_content'] = data['pdf_content'].str.replace(r'[^a-z]', ' ', regex=True)

**Replacing multiple whitespace characters with a single space**

In [17]:
# Collapse every run of whitespace into a single space: split on whitespace,
# then join the pieces back together.
# Whole-column assignment instead of chained indexing (`data[col][i] = ...`).
data['pdf_content'] = data['pdf_content'].str.split().str.join(' ')

In [18]:
# Display the frame after the text-cleaning steps.
data

Unnamed: 0,pdf_content,subtopic
0,we present numerical simulation results for th...,Astrophysics
1,forecasting solar energetic particles seps and...,Astrophysics
2,context the spacebased multiband astronomical ...,Astrophysics
3,context in cold and shielded environments mole...,Astrophysics
4,we report the discovery of new example of the ...,Astrophysics
...,...,...
2030,finite frames or spanning sets for nitedimensi...,Symplectic Geometry
2031,due to the emergence of symplectic geometry th...,Symplectic Geometry
2032,in this article we modify the classical floer ...,Symplectic Geometry
2033,we prove one deformation theoretic extension o...,Symplectic Geometry


In [19]:
# List the distinct values of the subtopic column.
data['subtopic'].unique()

array(['Astrophysics', 'Condensed Matter',
       'General Relativity and Quantum Cosmology',
       'High Energy Physics - Experiment',
       'High Energy Physics - Lattice',
       'High Energy Physics - Phenomenology',
       'High Energy Physics - Theory', 'Mathematical Physics',
       'Nonlinear Sciences', 'Nuclear Experiment', 'Nuclear Theory',
       'Quantum Physics', 'Algebraic Geometry', 'Algebraic Topology',
       'Analysis of PDEs', 'Category Theory',
       'Classical Analysis and ODEs', 'Combinatorics',
       'Commutative Algebra', 'Complex Variables',
       'Differential Geometry', 'Dynamical Systems',
       'Functional Analysis', 'General Mathematics', 'General Topology',
       'Geometric Topology', 'Group Theory', 'History and Overview',
       'Information Theory', 'K-Theory and Homology', 'Logic',
       'Metric Geometry', 'Number Theory', 'Numerical Analysis',
       'Operator Algebras', 'Optimization and Control', 'Probability',
       'Quantum Algebra', 'Re

In [20]:
# Subtopics that are labelled with the top-level topic 'Physics';
# every other subtopic is labelled 'Mathematics'.
physics_subtopics = [
    'Astrophysics',
    'Condensed Matter',
    'General Relativity and Quantum Cosmology',
    'High Energy Physics - Experiment',
    'High Energy Physics - Lattice',
    'High Energy Physics - Phenomenology',
    'High Energy Physics - Theory',
    'Mathematical Physics',
    'Nonlinear Sciences',
    'Nuclear Experiment',
    'Nuclear Theory',
    'Physics',
    'Quantum Physics',
]

In [21]:
# Derive the top-level topic from the subtopic: 'Physics' when the subtopic is
# listed in physics_subtopics, 'Mathematics' otherwise.
# The column is built in one vectorised step; the previous row-by-row
# `data['topic'][i] = ...` was a chained-indexing assignment, which raises
# SettingWithCopyWarning and does not write to the frame under copy-on-write.
data['topic'] = np.where(
    data['subtopic'].isin(physics_subtopics), 'Physics', 'Mathematics'
)

**Tokenizing the pdf content**

In [22]:
# Only needed if running the Notebook from Google Colab

# import nltk
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [23]:
# Work on a copy so `data` keeps the cleaned, untokenized text.
tokenized_df = data.copy()

# Split every document into a list of tokens with NLTK's word_tokenize.
# .apply + whole-column assignment replaces the chained-indexing assignment
# `tokenized_df['pdf_content'][i] = ...`.
tokenized_df['pdf_content'] = tokenized_df['pdf_content'].apply(word_tokenize)

**Remove stopwords**

In [24]:
# A set gives constant-time membership checks while filtering.
stop_words = set(stopwords.words('english'))

# Drop English stop words from every token list.
# .apply + whole-column assignment replaces the chained-indexing assignment
# `tokenized_df['pdf_content'][i] = ...`.
tokenized_df['pdf_content'] = tokenized_df['pdf_content'].apply(
    lambda tokens: [w for w in tokens if w not in stop_words]
)
tokenized_df['pdf_content']

0       [present, numerical, simulation, results, prop...
1       [forecasting, solar, energetic, particles, sep...
2       [context, spacebased, multiband, astronomical,...
3       [context, cold, shielded, environments, molecu...
4       [report, discovery, new, example, rare, class,...
                              ...                        
2030    [finite, frames, spanning, sets, nitedimension...
2031    [due, emergence, symplectic, geometry, geometr...
2032    [article, modify, classical, floer, complex, c...
2033    [prove, one, deformation, theoretic, extension...
2034    [tqft, formalism, moore, tachikawa, describing...
Name: pdf_content, Length: 2035, dtype: object

**Lemmatizing the tokens**

In [25]:
# One lemmatizer instance is reused for all words; previously a new
# WordNetLemmatizer() was created for every single word.
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Lemmatize each token as a verb first, then lemmatize the result as a noun."""
    return [
        lemmatizer.lemmatize(lemmatizer.lemmatize(word, pos="v"), pos="n")
        for word in tokens
    ]


# Work on a copy so tokenized_df keeps the unlemmatized tokens.
lemmatized_df = tokenized_df.copy()
# Whole-column assignment instead of the chained-indexing assignment
# `lemmatized_df['pdf_content'][i] = ...`.
lemmatized_df['pdf_content'] = lemmatized_df['pdf_content'].apply(_lemmatize_tokens)

In [26]:
# lemmatized_df['pdf_content'][0]

## 2. Creating the ML model

In [27]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [28]:
# Turn every token list back into one space-separated string, which is the
# input format passed to the Keras Tokenizer in the following cells.
joined_lemmatized_df = lemmatized_df.copy()
# ' '.join is applied per row and assigned to the whole column, replacing the
# chained-indexing assignment `joined_lemmatized_df['pdf_content'][i] = ...`.
joined_lemmatized_df['pdf_content'] = joined_lemmatized_df['pdf_content'].apply(' '.join)


In [29]:
# Show the first preprocessed document as a sanity check.
joined_lemmatized_df['pdf_content'][0]

'present numerical simulation result propagation alfv n wave charge starvation regime regime plasma density critical value require supply current wave analyze conservative scenario alfv n wave pick charge region charge density exceed critical value advect along high lorentz factor system consist alfv n wave charge carry call chargecarrying alfv n wave ccaw move medium small nonzero plasma density nd interaction ccaw stationary medium stream like instability lead emergence strong electric eld along direction unperturbed magnetic eld growth rate instability order plasma frequency medium encounter ccaw numerical code follow system hundred wave period numerical calculation suggest nal strength electric eld order percent alfv n wave amplitude little radiation produce sinusoidally oscillate current associate instability linear growth phase however nonlinear phase uctuating current density produce strong em radiation near plasma frequency limit growth instability key word fast radio burst sta

In [30]:

# Build the word -> integer index vocabulary from the documents and encode each
# document as a list of word indices.
# NOTE: the tokenizer is fitted on the whole column here; the train/test split
# for this model only happens in a later cell.
tk = Tokenizer()
tk.fit_on_texts(joined_lemmatized_df['pdf_content'])
sequences = tk.texts_to_sequences(joined_lemmatized_df['pdf_content'])

# print(sequences)

In [31]:
# Pad every sequence with trailing zeros up to the length of the longest one.
X_pad = pad_sequences(sequences, padding='post', dtype='float32')
# Number of distinct words known to the tokenizer.
vocab_size = len(tk.word_index)
print(X_pad.shape, vocab_size)

(2035, 45744) 182837


**Encoding the categorical column topic into binary**

In [32]:
from sklearn.preprocessing import OrdinalEncoder

# Encode the two topic labels as numbers; the double brackets keep the input
# two-dimensional, as the encoder is given a one-column DataFrame.
enc = OrdinalEncoder()
y = enc.fit_transform(data[['topic']])

In [33]:
# Inspect the encoded target array.
y

array([[1.],
       [1.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]])

In [34]:
### Binary topic classifier: Embedding -> LSTM -> sigmoid output
from tensorflow.keras import layers, Sequential

# Dimension of the vector that represents each word
embedding_size = 10

model = Sequential([
    layers.Embedding(
        input_dim=vocab_size + 1,   # +1: index 0 is reserved for padding
        output_dim=embedding_size,
        mask_zero=True,             # padded (zero) positions are masked
    ),
    layers.LSTM(10),
    layers.Dense(1, activation="sigmoid"),
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 10)          1828380   
                                                                 
 lstm (LSTM)                 (None, 10)                840       
                                                                 
 dense (Dense)               (None, 1)                 11        
                                                                 
Total params: 1,829,231
Trainable params: 1,829,231
Non-trainable params: 0
_________________________________________________________________


In [35]:
from sklearn.model_selection import train_test_split

# Hold out 30 % of the documents for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_pad,
    y,
    random_state=1,
    test_size=0.3,
)

In [36]:
# Configure training for the binary classifier; accuracy is tracked as metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Training call, currently disabled:
# history = model.fit(X_train, y_train, epochs=5, batch_size=16, verbose=1)

In [37]:
# model.evaluate(X_test, y_test)

In [38]:
import matplotlib.pyplot as plt

def plot_history(history):
    """Plot the training loss stored in ``history.history['loss']``.

    Parameters
    ----------
    history : object
        Object whose ``history`` attribute is a mapping with a ``'loss'``
        entry, e.g. the value returned by ``model.fit``.
    """
    train_loss = history.history['loss']
    plt.plot(train_loss)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Train loss')
    plt.show()

In [39]:
# plot_history(history)

In [40]:
# model.predict(np.expand_dims(X_test[9], axis=0))

## 3. Creating models for the Subtopics

In [41]:
# Two independent copies of the preprocessed frame, one per subtopic model.
# Both still contain every row here; they are filtered by topic in a later cell.
math_df = joined_lemmatized_df.copy()
physics_df = joined_lemmatized_df.copy()

In [42]:
# Display the copy (not yet filtered by topic).
math_df

Unnamed: 0,pdf_content,subtopic,topic
0,present numerical simulation result propagatio...,Astrophysics,Physics
1,forecast solar energetic particle sep identify...,Astrophysics,Physics
2,context spacebased multiband astronomical vari...,Astrophysics,Physics
3,context cold shield environment molecule freez...,Astrophysics,Physics
4,report discovery new example rare class highly...,Astrophysics,Physics
...,...,...,...
2030,finite frame span set nitedimensional hilbert ...,Symplectic Geometry,Mathematics
2031,due emergence symplectic geometry geometric tr...,Symplectic Geometry,Mathematics
2032,article modify classical floer complex cf pair...,Symplectic Geometry,Mathematics
2033,prove one deformation theoretic extension grom...,Symplectic Geometry,Mathematics


In [43]:
# Number of documents per subtopic (physics_df is not yet filtered by topic).
physics_df['subtopic'].value_counts()

(2035, 3)

In [None]:
# Number of documents per subtopic (math_df is not yet filtered by topic).
math_df['subtopic'].value_counts()

In [44]:
# Merge the fine-grained math subtopics into a few broader groups.
# Subtopics that are not listed below keep their original name.
math_subtopic_groups = {
    'Algebra, Geometry and Topology': [
        'Algebraic Geometry',
        'Algebraic Topology',
        'Commutative Algebra',
        'Differential Geometry',
        'General Topology',
        'Geometric Topology',
        'Group Theory',
        'Symplectic Geometry',
        'Representation Theory',
        'K-Theory and Homology',
        'Category Theory',
        'Quantum Algebra',
        'Spectral Theory',
        'Rings and Algebras',
        'Operator Algebras',
    ],
    'Classic and Numerical Analysis': [
        'Analysis of PDEs',
        'Classical Analysis and ODEs',
        'Functional Analysis',
        'General Mathematics',
        'Numerical Analysis',
    ],
    'Optimization': [
        'Dynamical Systems',
        'Logic',
        'Optimization and Control',
    ],
    'Statistics and Probability': [
        'Combinatorics',
        'Probability',
        'Statistics Theory',
    ],
    'Cross-disciplines': [
        # Fixed typo: was 'Mathematical Physic', which could never match the
        # 'Mathematical Physics' value present in the data.
        'Mathematical Physics',
        'Information Theory',
    ],
}

# Invert to a flat {original subtopic: group} lookup table.
math_subtopic_to_group = {
    subtopic: group
    for group, subtopics in math_subtopic_groups.items()
    for subtopic in subtopics
}

# One whole-column assignment replaces the row-by-row chained-indexing
# assignment `math_df['subtopic'][i] = ...`, which raises
# SettingWithCopyWarning and does not write to the frame under copy-on-write.
math_df['subtopic'] = math_df['subtopic'].map(
    lambda subtopic: math_subtopic_to_group.get(subtopic, subtopic)
)

In [45]:
# Keep only the rows of the matching top-level topic in each frame and
# renumber the rows from 0 (the old index is discarded).
math_df = math_df.loc[math_df['topic'] == 'Mathematics'].reset_index(drop=True)
physics_df = physics_df.loc[physics_df['topic'] == 'Physics'].reset_index(drop=True)

### 3.1 Creating the Math model

In [46]:
# Display the math-only frame with the grouped subtopics.
math_df

Unnamed: 0,pdf_content,subtopic,topic
0,obtain list automorphism group smooth plane se...,"Algebra, Geometry and Topology",Mathematics
1,concept algebraic geometry commutative algebra...,"Algebra, Geometry and Topology",Mathematics
2,explore maximum likelihood degree homogeneous ...,"Algebra, Geometry and Topology",Mathematics
3,la memoire alberto collino prove rationally co...,"Algebra, Geometry and Topology",Mathematics
4,let polarization connect kernel super special ...,"Algebra, Geometry and Topology",Mathematics
...,...,...,...
1091,finite frame span set nitedimensional hilbert ...,"Algebra, Geometry and Topology",Mathematics
1092,due emergence symplectic geometry geometric tr...,"Algebra, Geometry and Topology",Mathematics
1093,article modify classical floer complex cf pair...,"Algebra, Geometry and Topology",Mathematics
1094,prove one deformation theoretic extension grom...,"Algebra, Geometry and Topology",Mathematics


In [47]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the math subtopics into a dense array.
# scikit-learn renamed the `sparse` argument to `sparse_output` (1.2) and later
# removed the old name, so try the new spelling first and fall back to the old
# one on versions that do not know it yet.
try:
    enc2 = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    enc2 = OneHotEncoder(sparse=False, handle_unknown='ignore')

y_math_cat = enc2.fit_transform(math_df[['subtopic']])
# Column names of the form 'subtopic_<category>'.
new_column_names1 = enc2.get_feature_names_out()
y_math_cat = pd.DataFrame(y_math_cat, columns=new_column_names1)

In [48]:
# Inspect the last 20 rows of the one-hot encoded math targets.
y_math_cat.tail(20)

Unnamed: 0,"subtopic_Algebra, Geometry and Topology",subtopic_Classic and Numerical Analysis,subtopic_Complex Variables,subtopic_Cross-disciplines,subtopic_History and Overview,subtopic_Metric Geometry,subtopic_Number Theory,subtopic_Optimization,subtopic_Statistics and Probability
1076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1080,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1081,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [49]:
# Features are the preprocessed texts; hold out 30 % for testing with a fixed
# random_state so the split is reproducible.
X = math_df['pdf_content']

X_math_train, X_math_test, y_math_train, y_math_test = train_test_split(
    X,
    y_math_cat,
    random_state=1,
    test_size=0.3,
)

In [50]:
# Fresh tokenizer fitted on the math training texts only; this rebinds `tk`
# and `sequences`, which were used for the topic model above.
tk = Tokenizer()
tk.fit_on_texts(X_math_train)
sequences = tk.texts_to_sequences(X_math_train)

In [51]:
# Pad the math training sequences with trailing zeros to a common length.
X_pad = pad_sequences(sequences, padding='post', dtype='float32')
# Vocabulary size of the tokenizer fitted on the math training texts.
vocab_size = len(tk.word_index)
print(X_pad.shape, vocab_size)

(767, 36372) 91277


In [52]:
# Dimension of the vector that represents each word
embedding_size = 25
# One output unit per one-hot column of the target, instead of a hard-coded 9,
# so the model keeps matching the labels if the subtopic grouping changes.
n_math_classes = y_math_train.shape[1]

math_model = Sequential()
math_model.add(layers.Embedding(
    input_dim=vocab_size+1,     # +1: index 0 is reserved for padding
    output_dim=embedding_size,
    mask_zero=True,             # padded (zero) positions are masked
))

math_model.add(layers.LSTM(20))
math_model.add(layers.Dense(n_math_classes, activation="softmax"))
math_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 25)          2281950   
                                                                 
 lstm_1 (LSTM)               (None, 20)                3680      
                                                                 
 dense_1 (Dense)             (None, 9)                 189       
                                                                 
Total params: 2,285,819
Trainable params: 2,285,819
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Multi-class setup: categorical cross-entropy on the one-hot targets.
math_model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 15 epochs, passing validation_split=0.3 to Keras.
history = math_model.fit(
    X_pad,
    y_math_train,
    batch_size=16,
    epochs=15,
    validation_split=0.3,
    verbose=1,
)

Epoch 1/15


In [None]:
# Encode the held-out math texts with the tokenizer currently bound to `tk`,
# then pad them with trailing zeros.
sequences2 = tk.texts_to_sequences(X_math_test)
X_pad2 = pad_sequences(sequences2, padding='post', dtype='float32')

In [None]:
# Evaluate the math model on the held-out set.
math_model.evaluate(X_pad2,y_math_test)

### 3.2 Creating the Physics model

In [None]:
# Display the physics-only frame.
physics_df

In [None]:
# Merge the four "High Energy Physics - ..." subtopics and the two nuclear
# subtopics into one class each; all other subtopics keep their name.
physics_subtopic_to_group = {
    'High Energy Physics - Experiment': 'High Energy Physics',
    'High Energy Physics - Lattice': 'High Energy Physics',
    'High Energy Physics - Phenomenology': 'High Energy Physics',
    'High Energy Physics - Theory': 'High Energy Physics',
    'Nuclear Experiment': 'Nuclear Physics',
    'Nuclear Theory': 'Nuclear Physics',
}

# One whole-column assignment replaces the row-by-row chained-indexing
# assignment `physics_df['subtopic'][i] = ...`, which raises
# SettingWithCopyWarning and does not write to the frame under copy-on-write.
physics_df['subtopic'] = physics_df['subtopic'].map(
    lambda subtopic: physics_subtopic_to_group.get(subtopic, subtopic)
)

In [None]:
# One-hot encode the physics subtopics into a dense array.
# scikit-learn renamed the `sparse` argument to `sparse_output` (1.2) and later
# removed the old name, so try the new spelling first and fall back to the old
# one on versions that do not know it yet.
try:
    enc3 = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    enc3 = OneHotEncoder(sparse=False, handle_unknown='ignore')

y_phys = enc3.fit_transform(physics_df[['subtopic']])
# Column names of the form 'subtopic_<category>'.
new_column_names2 = enc3.get_feature_names_out()
y_phys = pd.DataFrame(y_phys, columns=new_column_names2)

In [None]:
# Inspect the one-hot encoded physics targets.
y_phys

In [None]:
# Features are the preprocessed physics texts (this rebinds `X`); hold out
# 30 % for testing with a fixed random_state so the split is reproducible.
X = physics_df['pdf_content']

X_phys_train, X_phys_test, y_phys_train, y_phys_test = train_test_split(
    X,
    y_phys,
    random_state=1,
    test_size=0.3,
)

In [None]:
# Fresh tokenizer fitted on the physics training texts only; this rebinds `tk`,
# so the tokenizer fitted on the math texts is no longer reachable via `tk`.
tk = Tokenizer()
tk.fit_on_texts(X_phys_train)
sequences3 = tk.texts_to_sequences(X_phys_train)

In [None]:
# Vocabulary size of the tokenizer fitted on the physics training texts.
vocab_size2 = len(tk.word_index)
# Pad the physics training sequences with trailing zeros to a common length.
X_pad_phys = pad_sequences(sequences3, dtype='float32', padding='post')
# Report the physics vocabulary (vocab_size2); the previous code printed
# `vocab_size`, which still holds the math tokenizer's vocabulary size.
print(X_pad_phys.shape, vocab_size2)

In [None]:
# Dimension of the vector that represents each word
embedding_size = 25
# One output unit per one-hot column of the target, instead of a hard-coded 8.
n_phys_classes = y_phys_train.shape[1]

phys_model = Sequential()
phys_model.add(layers.Embedding(
    # Use the physics vocabulary (vocab_size2). The previous code used
    # `vocab_size`, i.e. the math vocabulary, so word indices produced by the
    # physics tokenizer could fall outside the embedding table.
    input_dim=vocab_size2+1,    # +1: index 0 is reserved for padding
    output_dim=embedding_size,
    mask_zero=True,             # padded (zero) positions are masked
))

phys_model.add(layers.LSTM(20))
phys_model.add(layers.Dense(n_phys_classes, activation="softmax"))
phys_model.summary()

In [None]:
# Multi-class setup: categorical cross-entropy on the one-hot targets.
phys_model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 15 epochs, passing validation_split=0.3 to Keras.
history = phys_model.fit(
    X_pad_phys,
    y_phys_train,
    batch_size=16,
    epochs=15,
    validation_split=0.3,
    verbose=1,
)

In [None]:
# Encode the held-out physics texts with the tokenizer currently bound to `tk`,
# then pad them with trailing zeros.
sequences4 = tk.texts_to_sequences(X_phys_test)
X_pad_phys2 = pad_sequences(sequences4, padding='post', dtype='float32')

In [None]:
# Evaluate the physics model on the held-out set.
phys_model.evaluate(X_pad_phys2, y_phys_test)