In [2]:
import pandas as pd
import numpy as np
import os
import string


# 1. reading .csv + preparing df

## 1.1. reading

In [3]:
pre_path = "../raw_data/"
path = "small_dataset.csv"
## creating an absolute path to be able to run the code on every machine
abs_path = os.path.abspath(pre_path + path)


df = pd.read_csv(abs_path)
df.head()

Unnamed: 0.1,Unnamed: 0,0,subtopic
0,0,2 2 0 2 g u A 6 2 ] E H . h p - o r t s a [ ...,Astrophysics
1,1,"Draft version August 29, 2022 Typeset using LA...",Astrophysics
2,2,Astronomy & Astrophysics manuscript no. 41891c...,Astrophysics
3,3,Astronomy & Astrophysics manuscript no. aa Aug...,Astrophysics
4,4,2 2 0 2 g u A 6 2 ] R S . h p - o r t s a [...,Astrophysics


## 1.2. preparing df

In [4]:
df.rename(
    columns={"Unnamed: 0":"Index",
        "0":"paper_text"}
          ,inplace=True)
df.set_index("Index",inplace=True)

In [5]:
df[['subtopic']].value_counts()

subtopic                                
Mathematical Physics                        140
High Energy Physics - Theory                100
Astrophysics                                100
Quantum Physics                             100
Condensed Matter                            100
High Energy Physics - Phenomenology          99
General Relativity and Quantum Cosmology     95
Combinatorics                                90
Optimization and Control                     86
Probability                                  85
Analysis of PDEs                             84
Nonlinear Sciences                           72
Numerical Analysis                           67
Algebraic Geometry                           59
Dynamical Systems                            58
Information Theory                           56
High Energy Physics - Experiment             56
Number Theory                                50
Differential Geometry                        45
Nuclear Theory                               42

## 1.3. Categories filter

In [6]:
math_lst_topics = ["Algebraic Geometry",
"Algebraic Topology",
"Analysis of PDEs", 
"Category Theory", 
"Classical Analysis and ODEs", 
"Combinatorics", 
"Commutative Algebra", 
"Complex Variables", 
"Differential Geometry", 
"Dynamical Systems", 
"Functional Analysis", 
"General Mathematics", 
"General Topology", 
"Geometric Topology", 
"Group Theory", 
"History and Overview", 
"Information Theory", 
"K-Theory and Homology", 
"Logic; Mathematical Physics", 
"Metric Geometry", 
"Number Theory", 
"Numerical Analysis", 
"Operator Algebras", 
"Optimization and Control",
"Probability; Quantum Algebra", 
"Representation Theory", 
"Rings and Algebras", 
"Spectral Theory", 
"Statistics Theory", 
"Symplectic Geometry"]


In [64]:
lst_math_algebra = [
    "Algebraic Geometry",
    "Algebraic Geometry",
    "Algebraic Topology",
    "Commutative Algebra",
    "Differential Geometry",
    "General Topology",
    "Geometric Topology",
    "Group Theory",
    "Symplectic Geometry",
    "Representation Theory",
    "K-Theory and Homology",
    "Category Theory",
    "Quantum Algebra",
    "Spectral Theory",
    "Rings and Algebra",
    "Operator Algebra"
]
lst_math_num_analysis = [
    "Analysis of PDE",
    "Classical Analysis and ODE",
    "Functional Analysis",
    "General Mathematics",
    "Numerical Analysis"
]
lst_math_opti = [
    "Dynamical Systems",
    "Logic",
    "Optimization and Control"
]
lst_math_stat = [
    "Combinatorics",
    "Probability",
    "Statistics Theory"
]
lst_math_rest = [
    "Mathematical Physic",
    "Information Theory"
]



## 1.4. add main topic column

In [7]:
df['topic'] = df['subtopic'].apply(lambda x: 'mathematic' if x in math_lst_topics else 'physics')

In [8]:
df[['topic']].value_counts()

topic     
physics       1053
mathematic     982
dtype: int64

## 1.5. test loading CSV

In [9]:
#df['paper_text'][0]

# 2. Preprocessing

## 2.1. Cutting the abstract

In [10]:
def cutting_abs(text):
    return text[text.find("ABSTRACT")+8:]

#df['paper_text'][0][df['paper_text'][0].find('ABSTRACT')+8:]

In [11]:
#cutting_abs(df['paper_text'][0])

## 2.2. lowercase 

In [12]:
def lowercase(text):
    return text.lower()

In [13]:
lowercase('chdcdGhHnjcndHDHEBH')

'chdcdghhnjcndhdhebh'

## 2.3. Removing digits

In [14]:
def remove_digit(text):
    cleaned_text = ''.join(char for char in text if not char.isdigit())
    return cleaned_text

In [15]:
remove_digit('73838bhccfhdcbhd3njcdj344ncjdknkj576')

'bhccfhdcbhdnjcdjncjdknkj'

## 2.4. Removing Punctuaction

In [16]:
def remove_punctuation(text):
    cleaned_text = ''.join(char for char in text if char not in string.punctuation)
    
    return cleaned_text

In [17]:
remove_punctuation(':;:;:;))) hd jcjd!!("')
#string.punctuation

' hd jcjd'

## 2.5. Removing special characters 

In [18]:
import re
def try_with_regular_expression(text):
    new_string = re.sub(r"[^a-z]"," ",text)
    return new_string

In [19]:
try_with_regular_expression('cndjncjd-)àe"ç"à"à"-')

'cndjncjd   e        '

In [20]:
## must pip install cleantext
#import cleantext
#def normalize_space(text):
#    normalized_string = cleantext.clean(string, normalize_whitespace=True)
#    return normalized_string

## 2.6. Normalize space

In [21]:
## doing it manually with split and strip
def normalize_space_man(text):
    lst_word = text.split()
    return " ".join(word for word in lst_word)
    
    

In [22]:
test  = 'Bonjour   Je suis un    adulte'
normalize_space_man(test)

'Bonjour Je suis un adulte'

## 2.7. Removing single words

In [23]:
## remove single character word
def remove_single(text):
    lst_word = text.split()
    return " ".join(word for word in lst_word if len(word)>1)

In [24]:
remove_single('je suis e un z adulte')

'je suis un adulte'

# 3. Global cleaning

In [25]:
## function all together
def pre_clean_data(text):
    cleaned_text = remove_single(normalize_space_man(try_with_regular_expression(remove_punctuation(remove_digit(lowercase(cutting_abs(text)))))))
    return cleaned_text

In [26]:
#pre_clean_data(df['paper_text'][0])

# 4. Tokkenize, Lemmatize, remove stopsword

## 4.1. tokkenizing lemm stopswords

In [27]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk import WordNetLemmatizer

In [28]:
def tokkenize_words(text):
    return word_tokenize(text)

In [29]:
#tokkenize_words(df['paper_text'][0])

In [30]:
def remove_stopwords(lst_word):
    stop_words = set(stopwords.words('english'))
    return [word for word in lst_word if not word in stop_words] 

In [31]:
def lemmatize(lst_word):
    # Lemmatizing the verbs
    
    verb_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v")  # v --> verbs
                  for word in lst_word]

    # 2 - Lemmatizing the nouns
    noun_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "n")  # n --> nouns
                  for word in verb_lemmatized]
    return noun_lemmatized

## 4.2. All together

In [32]:
def tok_sw_lem(text):
    return lemmatize(remove_stopwords(tokkenize_words(text)))

In [33]:
## put it back into a string for the deeplearning model
def get_text_back(lst_word):
    return " ".join(word for word in lst_word)

In [34]:
#get_text_back(tok_sw_lem(pre_clean_data(df['paper_text'][0])))

In [35]:
def final_pre_pro(text):
    return get_text_back(tok_sw_lem(pre_clean_data(text)))

# 5. Creating the list of pdf X and the target y For the main model

In [36]:
#df['paper_text'][:3]
X = df['paper_text'].apply(lambda x: final_pre_pro(x))


In [37]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

In [38]:
enc = OrdinalEncoder()
y = enc.fit_transform(df[['topic']])

# 6. First model

In [40]:
1+1

2

In [41]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [50]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)


## Vectorizing data 
X_vect = pad_sequences(tokenizer.texts_to_sequences(X), padding="post", value=0.)



print(X_vect[:3])


X_vect.shape


[[  144   318   328 ...     0     0     0]
 [ 6090  1977  4965 ...     0     0     0]
 [  820  8136 28213 ...     0     0     0]]


(2035, 45511)

In [49]:
X_train_vect, X_test_vect, y_train, y_test = train_test_split(X_vect, y, test_size=0.33, random_state=42)

In [51]:
X_train_vect.shape,X_test_vect.shape, y_train.shape

((1363, 45511), (672, 45511), (1363, 1))

In [58]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, Conv1D, Dense, Flatten
len(tokenizer.word_index)

185616

In [63]:

embed_len = 50

# Conv1D
cnn = Sequential([
    Embedding(input_dim=len(tokenizer.word_index)+1, input_length=X_train_vect.shape[1], output_dim=embed_len, mask_zero=True),
    Conv1D(20, kernel_size=3),
    Flatten(),
    Dense(1, activation="sigmoid"),
])

cnn.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 45511, 50)         9280850   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 45509, 20)         3020      
_________________________________________________________________
flatten_4 (Flatten)          (None, 910180)            0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 910181    
Total params: 10,194,051
Trainable params: 10,194,051
Non-trainable params: 0
_________________________________________________________________


In [None]:
cnn.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = cnn.fit(X_train_vect, y_train, epochs=5, batch_size=16, verbose=1)

In [None]:
cnn.evaluate(X_test_vect,y_test)