# First we have to prepare the dataset

**Importing necessary packages and loading the data**

In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
import pandas as pd
import numpy as np
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [6]:
data = pd.read_csv('../raw_data/small_dataset.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,0,subtopic
0,0,2 2 0 2 g u A 6 2 ] E H . h p - o r t s a [ ...,Astrophysics
1,1,"Draft version August 29, 2022 Typeset using LA...",Astrophysics
2,2,Astronomy & Astrophysics manuscript no. 41891c...,Astrophysics
3,3,Astronomy & Astrophysics manuscript no. aa Aug...,Astrophysics
4,4,2 2 0 2 g u A 6 2 ] R S . h p - o r t s a [...,Astrophysics


## 1. Preprocessing the data

**Check for duplicates**

In [7]:
duplicate_count = data.duplicated().sum()
duplicate_count

0

**Check for missing data**

In [8]:
data.isnull().sum().sort_values(ascending=False)/len(data) 

Unnamed: 0    0.0
0             0.0
subtopic      0.0
dtype: float64

**Rename columns and remove useless columns**

In [9]:
data = data.drop(columns=['Unnamed: 0'])
data = data.rename(columns={'0':'pdf_content'})

In [10]:
data

Unnamed: 0,pdf_content,subtopic
0,2 2 0 2 g u A 6 2 ] E H . h p - o r t s a [ ...,Astrophysics
1,"Draft version August 29, 2022 Typeset using LA...",Astrophysics
2,Astronomy & Astrophysics manuscript no. 41891c...,Astrophysics
3,Astronomy & Astrophysics manuscript no. aa Aug...,Astrophysics
4,2 2 0 2 g u A 6 2 ] R S . h p - o r t s a [...,Astrophysics
...,...,...
2030,2 2 0 2 g u A 3 2 ] A F . h t a m [ 1 v 5...,Symplectic Geometry
2031,UNIVERSIDAD COMPLUTENSE DE MADRID FACULTAD DE...,Symplectic Geometry
2032,2 2 0 2 g u A 0 2 ] G S . h t a m [ 1 v 4...,Symplectic Geometry
2033,2 2 0 2 g u A 6 1 ] G S . h t a m [ 1 v 4...,Symplectic Geometry


In [None]:
data['pdf_content'][0]

**Removing whitespace from the start and the end of each string**

In [12]:
for i in range(len(data)):
    data['pdf_content'][i] = data['pdf_content'][i].strip()
# data['pdf_content'][0]

**Turning every letter into lowercase**

In [13]:
for i in range(len(data)):
    data['pdf_content'][i] = data['pdf_content'][i].lower()
# data['pdf_content'][0]

**Removing digits**

In [14]:
for i in range(len(data)):
    data['pdf_content'][i] = ''.join(char for char in data['pdf_content'][i] if not char.isdigit())
# data['pdf_content'][0]

**Removing punctuation**

In [15]:
for i in range(len(data)):
    for punctuation in string.punctuation:
        data['pdf_content'][i] = data['pdf_content'][i].replace(punctuation, '')
# data['pdf_content'][0]

**Removing single letter words**

In [16]:
for i in range(len(data)):
    data['pdf_content'][i] = ' '.join(w for w in data['pdf_content'][i].split() if len(w)>1)
# data['pdf_content'][0]

**Removing all of the text before abstract, because that is where all of the PDFs start**

In [17]:
for i in range(len(data)):
    data['pdf_content'][i] = re.sub(r"^.+?(?=abstract)", "", data['pdf_content'][i])
    data['pdf_content'][i] = data['pdf_content'][i].lstrip('abstract')
    data['pdf_content'][i] = data['pdf_content'][i].lstrip()
# data['pdf_content'][0]

**This regular expression technically removes every character that is not lowercase alphabet(a-z), but in this case it is used to remove mathematical characters**

In [18]:
for i in range(len(data)):
    data['pdf_content'][i] = re.sub(r'[^a-z]', ' ',data['pdf_content'][i])
# data['pdf_content'][0]

**Removing multiple whitespace with one whitespace**

In [19]:
for i in range(len(data)):
    data['pdf_content'][i] = ' '.join(data['pdf_content'][i].split())
# data['pdf_content'][0]

In [20]:
data

Unnamed: 0,pdf_content,subtopic
0,we present numerical simulation results for th...,Astrophysics
1,forecasting solar energetic particles seps and...,Astrophysics
2,context the spacebased multiband astronomical ...,Astrophysics
3,context in cold and shielded environments mole...,Astrophysics
4,we report the discovery of new example of the ...,Astrophysics
...,...,...
2030,finite frames or spanning sets for nitedimensi...,Symplectic Geometry
2031,due to the emergence of symplectic geometry th...,Symplectic Geometry
2032,in this article we modify the classical floer ...,Symplectic Geometry
2033,we prove one deformation theoretic extension o...,Symplectic Geometry


In [26]:
data['subtopic'].unique()

array(['Astrophysics', 'Condensed Matter',
       'General Relativity and Quantum Cosmology',
       'High Energy Physics - Experiment',
       'High Energy Physics - Lattice',
       'High Energy Physics - Phenomenology',
       'High Energy Physics - Theory', 'Mathematical Physics',
       'Nonlinear Sciences', 'Nuclear Experiment', 'Nuclear Theory',
       'Quantum Physics', 'Algebraic Geometry', 'Algebraic Topology',
       'Analysis of PDEs', 'Category Theory',
       'Classical Analysis and ODEs', 'Combinatorics',
       'Commutative Algebra', 'Complex Variables',
       'Differential Geometry', 'Dynamical Systems',
       'Functional Analysis', 'General Mathematics', 'General Topology',
       'Geometric Topology', 'Group Theory', 'History and Overview',
       'Information Theory', 'K-Theory and Homology', 'Logic',
       'Metric Geometry', 'Number Theory', 'Numerical Analysis',
       'Operator Algebras', 'Optimization and Control', 'Probability',
       'Quantum Algebra', 'Re

In [27]:
physics_subtopics = ['Astrophysics', 
                     'Condensed Matter', 
                     'General Relativity and Quantum Cosmology',
                     'High Energy Physics - Experiment',
                     'High Energy Physics - Lattice',
                     'High Energy Physics - Phenomenology',
                     'High Energy Physics - Theory',
                     'Mathematical Physics',
                     'Nonlinear Sciences',
                     'Nuclear Experiment',
                     'Nuclear Theory',
                     'Physics',
                     'Quantum Physics']

In [29]:
data['topic'] = ''

for i in range(len(data)):
    if data['subtopic'][i] in physics_subtopics:
        data['topic'][i] = 'Physics'
    else:
        data['topic'][i] = 'Mathematics'

**Tokenizing the pdf content**

In [33]:
tokenized_df = data.copy()

for i in range(len(data)):
    tokenized_df['pdf_content'][i] = word_tokenize(tokenized_df['pdf_content'][i])
# tokenized_df['pdf_content'][0]

**Remove stopwords**

In [None]:
stop_words = set(stopwords.words('english'))

for i in range(len(data)):
    tokenized_df['pdf_content'][i] = [w for w in tokenized_df['pdf_content'][i] if w not in stop_words]
tokenized_df['pdf_content']

**Lemmatizing the tokens**

In [36]:
lemmatized_df = tokenized_df.copy()

for i in range(len(tokenized_df)):
    lemmatized_df['pdf_content'][i] = [WordNetLemmatizer().lemmatize(word, pos = "v")  # v --> verbs
              for word in lemmatized_df['pdf_content'][i]]
for i in range(len(tokenized_df)):
    lemmatized_df['pdf_content'][i] = [WordNetLemmatizer().lemmatize(word, pos = "n")  # n --> nouns
              for word in lemmatized_df['pdf_content'][i]]

In [None]:
lemmatized_df['pdf_content'][0]

## 2. Creating the ML model

In [42]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [50]:
joined_lemmatized_df = lemmatized_df.copy()

for i in range(len(lemmatized_df)):
    joined_lemmatized_df['pdf_content'][i] = ' '.join(word for word in joined_lemmatized_df['pdf_content'][i])


In [None]:
joined_lemmatized_df['pdf_content'][0]

In [None]:

tk = Tokenizer()
tk.fit_on_texts(joined_lemmatized_df['pdf_content'])
sequences = tk.texts_to_sequences(joined_lemmatized_df['pdf_content'])

print(sequences)

In [55]:
vocab_size = len(tk.word_index)
X_pad = pad_sequences(sequences, dtype='float32', padding='post')
print(X_pad.shape, vocab_size)

(2035, 45744) 182837


**Encoding the categorical column topic into binary**

In [66]:
from sklearn.preprocessing import OrdinalEncoder


y = data[['topic']]
enc = OrdinalEncoder()

y = enc.fit_transform(y)

In [67]:
y

array([[1.],
       [1.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]])

In [68]:
### Let's build the neural network now
from tensorflow.keras import layers, Sequential

# Size of your embedding space = size of the vector representing each word
embedding_size = 10

model = Sequential()
model.add(layers.Embedding(
    input_dim=vocab_size+1,
    output_dim=embedding_size, # 100
    mask_zero=True, # Built-in masking layer :)
))

model.add(layers.LSTM(10))
model.add(layers.Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 50)          9141900   
                                                                 
 lstm_4 (LSTM)               (None, 10)                2440      
                                                                 
 dense_4 (Dense)             (None, 1)                 11        
                                                                 
Total params: 9,144,351
Trainable params: 9,144,351
Non-trainable params: 0
_________________________________________________________________


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_pad, y, test_size=0.3, random_state=1)

In [69]:


model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=5, batch_size=16, verbose=1)

Epoch 1/5
  1/128 [..............................] - ETA: 4:14:14 - loss: 0.6927 - accuracy: 0.6250

In [None]:
model.evaluate(X_test, y_test)