In [1]:
import pandas as pd
import numpy as np
import os
import string


# 1. Loading Data

## 1.1. Loading csv

In [3]:
def load_csv(path = '../raw_data/small_dataset.csv'):
    '''
    Read the CSV file found at `path` and return it as a DataFrame.

    The path is first expanded with os.path.abspath (a relative path is
    resolved against the current working directory) and the result is
    handed to pandas.read_csv.
    '''
    return pd.read_csv(os.path.abspath(path))

In [4]:
# Load the dataset from the default path and preview its first rows.
df = load_csv()
df.head()

Unnamed: 0.1,Unnamed: 0,0,subtopic
0,0,2 2 0 2 g u A 6 2 ] E H . h p - o r t s a [ ...,Astrophysics
1,1,"Draft version August 29, 2022 Typeset using LA...",Astrophysics
2,2,Astronomy & Astrophysics manuscript no. 41891c...,Astrophysics
3,3,Astronomy & Astrophysics manuscript no. aa Aug...,Astrophysics
4,4,2 2 0 2 g u A 6 2 ] R S . h p - o r t s a [...,Astrophysics


## 1.2. Preparing DataFrame

In [5]:
def preparing_dataframe(df):
    '''
    Give the raw columns readable names and index the frame by "Index".

    "Unnamed: 0" is renamed to "Index" and "0" to "paper_text"; the "Index"
    column is then moved into the index. Both steps modify `df` in place and
    the same object is returned.

    Note: because "Index" stops being a column after the first call, calling
    this a second time on the same frame fails in set_index.
    '''
    new_names = {"Unnamed: 0": "Index", "0": "paper_text"}
    df.rename(columns=new_names, inplace=True)
    df.set_index("Index", inplace=True)
    return df
    

In [6]:
# Count how many papers fall under each subtopic (class balance check).
df[['subtopic']].value_counts()

subtopic                                
Mathematical Physics                        140
High Energy Physics - Theory                100
Astrophysics                                100
Quantum Physics                             100
Condensed Matter                            100
High Energy Physics - Phenomenology          99
General Relativity and Quantum Cosmology     95
Combinatorics                                90
Optimization and Control                     86
Probability                                  85
Analysis of PDEs                             84
Nonlinear Sciences                           72
Numerical Analysis                           67
Algebraic Geometry                           59
Dynamical Systems                            58
Information Theory                           56
High Energy Physics - Experiment             56
Number Theory                                50
Differential Geometry                        45
Nuclear Theory                               42

In [7]:
# Rename the columns and set the index; df itself is modified in place,
# so the returned frame is only displayed, not reassigned.
preparing_dataframe(df)


Unnamed: 0_level_0,paper_text,subtopic
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2 2 0 2 g u A 6 2 ] E H . h p - o r t s a [ ...,Astrophysics
1,"Draft version August 29, 2022 Typeset using LA...",Astrophysics
2,Astronomy & Astrophysics manuscript no. 41891c...,Astrophysics
3,Astronomy & Astrophysics manuscript no. aa Aug...,Astrophysics
4,2 2 0 2 g u A 6 2 ] R S . h p - o r t s a [...,Astrophysics
...,...,...
2030,2 2 0 2 g u A 3 2 ] A F . h t a m [ 1 v 5...,Symplectic Geometry
2031,UNIVERSIDAD COMPLUTENSE DE MADRID FACULTAD DE...,Symplectic Geometry
2032,2 2 0 2 g u A 0 2 ] G S . h t a m [ 1 v 4...,Symplectic Geometry
2033,2 2 0 2 g u A 6 1 ] G S . h t a m [ 1 v 4...,Symplectic Geometry


In [86]:
# NOTE(review): this cell carries execution count 86 and its saved output
# already shows a 'topic' column, which is only created by add_topic further
# down. Presumably it was executed out of order; verify with
# Restart Kernel -> Run All.
df.head()

Unnamed: 0_level_0,paper_text,subtopic,topic
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2 2 0 2 g u A 6 2 ] E H . h p - o r t s a [ ...,Astrophysics,physics
1,"Draft version August 29, 2022 Typeset using LA...",Astrophysics,physics
2,Astronomy & Astrophysics manuscript no. 41891c...,Astrophysics,physics
3,Astronomy & Astrophysics manuscript no. aa Aug...,Astrophysics,physics
4,2 2 0 2 g u A 6 2 ] R S . h p - o r t s a [...,Astrophysics,physics


## 1.3. Add the topic column


In [8]:
# Subtopics that count as mathematics. add_topic labels a paper 'mathematic'
# when its subtopic is in this list and 'physics' otherwise, so every entry
# must be exactly one subtopic name, spelled as it appears in the data.
# "Logic" / "Mathematical Physics" and "Probability" / "Quantum Algebra" were
# previously fused into single "A; B" strings that could never match a
# subtopic; they are now separate entries.
math_lst_topics = [
    "Algebraic Geometry",
    "Algebraic Topology",
    "Analysis of PDEs",
    "Category Theory",
    "Classical Analysis and ODEs",
    "Combinatorics",
    "Commutative Algebra",
    "Complex Variables",
    "Differential Geometry",
    "Dynamical Systems",
    "Functional Analysis",
    "General Mathematics",
    "General Topology",
    "Geometric Topology",
    "Group Theory",
    "History and Overview",
    "Information Theory",
    "K-Theory and Homology",
    "Logic",
    "Mathematical Physics",
    "Metric Geometry",
    "Number Theory",
    "Numerical Analysis",
    "Operator Algebras",
    "Optimization and Control",
    "Probability",
    "Quantum Algebra",
    "Representation Theory",
    "Rings and Algebras",
    "Spectral Theory",
    "Statistics Theory",
    "Symplectic Geometry",
]

In [43]:
# Finer grouping of the mathematics subtopics. Names are spelled exactly as
# in math_lst_topics / the 'subtopic' column so membership tests can match
# (e.g. "Analysis of PDEs", "Mathematical Physics", "Rings and Algebras").
# NOTE(review): "Complex Variables", "History and Overview", "Metric Geometry"
# and "Number Theory" are not assigned to any group below - TODO decide where
# they belong.
lst_math_algebra = [
    "Algebraic Geometry",
    "Algebraic Topology",
    "Commutative Algebra",
    "Differential Geometry",
    "General Topology",
    "Geometric Topology",
    "Group Theory",
    "Symplectic Geometry",
    "Representation Theory",
    "K-Theory and Homology",
    "Category Theory",
    "Quantum Algebra",
    "Spectral Theory",
    "Rings and Algebras",
    "Operator Algebras"
]
lst_math_num_analysis = [
    "Analysis of PDEs",
    "Classical Analysis and ODEs",
    "Functional Analysis",
    "General Mathematics",
    "Numerical Analysis"
]
lst_math_opti = [
    "Dynamical Systems",
    "Logic",
    "Optimization and Control"
]
lst_math_stat = [
    "Combinatorics",
    "Probability",
    "Statistics Theory"
]
lst_math_rest = [
    "Mathematical Physics",
    "Information Theory"
]


In [9]:
def add_topic(df,lst_math_topics):
    '''
    Label every paper with a coarse topic derived from its subtopic.

    A 'topic' column is written onto `df` (in place): 'mathematic' when the
    row's 'subtopic' is found in `lst_math_topics`, 'physics' otherwise.
    The same `df` object is returned.
    '''
    def label_for(subtopic):
        if subtopic in lst_math_topics:
            return 'mathematic'
        return 'physics'

    df['topic'] = df['subtopic'].apply(label_for)
    return df

In [10]:
# Add the 'topic' column to df in place; the returned frame is only displayed.
add_topic(df,math_lst_topics)

Unnamed: 0_level_0,paper_text,subtopic,topic
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2 2 0 2 g u A 6 2 ] E H . h p - o r t s a [ ...,Astrophysics,physics
1,"Draft version August 29, 2022 Typeset using LA...",Astrophysics,physics
2,Astronomy & Astrophysics manuscript no. 41891c...,Astrophysics,physics
3,Astronomy & Astrophysics manuscript no. aa Aug...,Astrophysics,physics
4,2 2 0 2 g u A 6 2 ] R S . h p - o r t s a [...,Astrophysics,physics
...,...,...,...
2030,2 2 0 2 g u A 3 2 ] A F . h t a m [ 1 v 5...,Symplectic Geometry,mathematic
2031,UNIVERSIDAD COMPLUTENSE DE MADRID FACULTAD DE...,Symplectic Geometry,mathematic
2032,2 2 0 2 g u A 0 2 ] G S . h t a m [ 1 v 4...,Symplectic Geometry,mathematic
2033,2 2 0 2 g u A 6 1 ] G S . h t a m [ 1 v 4...,Symplectic Geometry,mathematic


In [11]:
# Confirm that df itself now carries the 'topic' column.
df.head()

Unnamed: 0_level_0,paper_text,subtopic,topic
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2 2 0 2 g u A 6 2 ] E H . h p - o r t s a [ ...,Astrophysics,physics
1,"Draft version August 29, 2022 Typeset using LA...",Astrophysics,physics
2,Astronomy & Astrophysics manuscript no. 41891c...,Astrophysics,physics
3,Astronomy & Astrophysics manuscript no. aa Aug...,Astrophysics,physics
4,2 2 0 2 g u A 6 2 ] R S . h p - o r t s a [...,Astrophysics,physics


In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

def _one_hot_frame(labels):
    '''
    One-hot encode a single-column DataFrame of labels.

    Returns a dense DataFrame that keeps the index of `labels` and whose
    columns are named by OneHotEncoder.get_feature_names_out().
    '''
    encoder = OneHotEncoder(handle_unknown='ignore')
    # The encoder returns a sparse matrix by default; densify it explicitly
    # instead of passing `sparse=False`, a keyword that newer scikit-learn
    # releases renamed to `sparse_output` and no longer accept.
    encoded = encoder.fit_transform(labels).toarray()
    return pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(),
        index=labels.index,  # keep rows aligned with the source frame
    )


def def_X_y(df, test_size=0.33, random_state=None):
    '''
    Build the feature frame and the one-hot targets, then split them into
    train and test parts.

    Parameters
    ----------
    df : DataFrame with the columns 'paper_text', 'topic' and 'subtopic'.
    test_size : forwarded to train_test_split (default 0.33, as before).
    random_state : forwarded to train_test_split; pass an int to make the
        split reproducible. The default None keeps the split random.

    Returns
    -------
    X_train, X_test, y_topic_train, y_subtopic_train, y_topic_test,
    y_subtopic_test : DataFrames. The X frames hold only 'paper_text'; the y
    frames hold the one-hot columns of 'topic' / 'subtopic'.

    The targets are built on the index of `df` and split together with the
    features in a single train_test_split call, so every target row belongs
    to the feature row with the same index label. (Joining one-hot frames
    that carry a fresh 0..n-1 index onto `df` only lines up when the index of
    `df` happens to be exactly 0..n-1.)
    '''
    X = df[['paper_text']]
    y_topic_cat = _one_hot_frame(df[['topic']])
    y_subtopic_cat = _one_hot_frame(df[['subtopic']])

    # train_test_split applies the same row selection to every array passed.
    (X_train, X_test,
     y_topic_train, y_topic_test,
     y_subtopic_train, y_subtopic_test) = train_test_split(
        X, y_topic_cat, y_subtopic_cat,
        test_size=test_size, random_state=random_state)

    return X_train,X_test,y_topic_train,y_subtopic_train,y_topic_test,y_subtopic_test

In [13]:
# Split df into train/test features and one-hot targets (topic and subtopic).
X_train,X_test,y_topic_train,y_subtopic_train,y_topic_test,y_subtopic_test = def_X_y(df)

In [18]:
# Sanity check: train and test parts must have matching row counts across
# features and both targets.
print(X_train.shape)
print(X_test.shape)
print(y_topic_train.shape)
print(y_subtopic_train.shape)
print(y_topic_test.shape)
print(y_subtopic_test.shape)

(1363, 1)
(672, 1)
(1363, 2)
(1363, 43)
(672, 2)
(672, 43)


## 1.4. All together

In [85]:
def preparing_df(path,df,lst_math_topics):
    '''
    Load the CSV at `path`, rename/index its columns and add the 'topic'
    column, returning the resulting DataFrame.

    NOTE(review): the `df` argument is never read by the body; the frame is
    always loaded from `path`.
    '''
    loaded = load_csv(path)
    prepared = preparing_dataframe(loaded)
    return add_topic(prepared, lst_math_topics)

# 2. Preprocessing

## 2.1. Cutting the abstract

In [12]:
def cutting_abs(text):
    '''
    Drop everything up to and including the first "ABSTRACT" marker.

    The search is case-sensitive. When the marker does not occur, the text is
    returned unchanged; without this guard str.find's -1 result would turn
    the slice into text[7:] and silently cut the first seven characters.
    '''
    marker = "ABSTRACT"
    position = text.find(marker)
    if position == -1:
        return text
    return text[position + len(marker):]

## 2.2. lowercase 

In [13]:
def lowercase(text):
    '''
    Return a lower-cased copy of `text`.
    '''
    lowered = text.lower()
    return lowered

## 2.3. Removing digits

In [14]:
def remove_digit(text):
    '''
    Return `text` without any character for which str.isdigit() is true.
    '''
    kept = []
    for symbol in text:
        if symbol.isdigit():
            continue
        kept.append(symbol)
    return ''.join(kept)

## 2.4. Removing punctuation

In [15]:
def remove_punctuation(text):
    '''
    Return `text` with every character listed in string.punctuation deleted.
    '''
    # A translation table whose third argument lists the characters to drop.
    drop_table = str.maketrans('', '', string.punctuation)
    return text.translate(drop_table)

## 2.5. Removing special characters 

In [16]:
import re
def try_with_regular_expression(text):
    '''
    Replace every character that is not a lowercase ASCII letter (a-z)
    with a single space, using a regular expression.
    '''
    not_lower_letter = re.compile(r"[^a-z]")
    return not_lower_letter.sub(" ", text)

## 2.6. Normalize space

In [17]:
def normalize_space_man(text):
    '''
    Collapse every run of whitespace into one space and strip the ends.
    '''
    # split() without arguments discards leading/trailing/repeated whitespace.
    return " ".join(text.split())
    

## 2.7. Removing single-character words

In [18]:
def remove_single(text):
    '''
    Drop the one-character words from `text`.

    Preprocessing leaves many of them behind (remnants of mathematical
    formulas) and they are not wanted. The surviving words are joined back
    with single spaces.
    '''
    survivors = []
    for token in text.split():
        if len(token) > 1:
            survivors.append(token)
    return " ".join(survivors)

# 3. Global cleaning

In [19]:
def pre_clean_data(text):
    '''
    Run the complete character-level cleaning chain on `text`.

    The steps are applied in this order: cut everything before the abstract,
    lowercase, remove digits, remove punctuation, blank out remaining non
    a-z characters, normalize whitespace, drop one-character words.
    '''
    steps = (
        cutting_abs,
        lowercase,
        remove_digit,
        remove_punctuation,
        try_with_regular_expression,
        normalize_space_man,
        remove_single,
    )
    cleaned_text = text
    for step in steps:
        cleaned_text = step(cleaned_text)
    return cleaned_text

# 4. Tokenize, lemmatize, remove stopwords

## 4.1. Tokenizing, lemmatizing, removing stopwords

In [20]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk import WordNetLemmatizer

In [21]:
def tokkenize_words(text):
    '''
    Split `text` into tokens with NLTK's word_tokenize and return them.
    '''
    tokens = word_tokenize(text)
    return tokens

In [22]:
def remove_stopwords(lst_word):
    '''
    Return the words of `lst_word` that are not in NLTK's English stopword
    list, keeping their original order.
    '''
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for word in lst_word:
        if word not in english_stopwords:
            kept.append(word)
    return kept

In [23]:
def lemmatize(lst_word):
    '''
    Lemmatize every word twice: first as a verb, then as a noun.

    Takes a list of word strings and returns a new list with one lemmatized
    entry per input word, in the same order.
    '''
    # One lemmatizer serves the whole list; building it once instead of twice
    # per word yields the same output.
    lemmatizer = WordNetLemmatizer()
    return [
        # inner call: verb lemma (pos "v"); outer call: noun lemma (pos "n")
        lemmatizer.lemmatize(lemmatizer.lemmatize(word, pos = "v"), pos = "n")
        for word in lst_word
    ]

In [24]:
def final_pre_pro(text:str)->str:
    '''
    Turn a raw paper text into its cleaned form.

    The text goes through character-level cleaning, tokenization, stopword
    removal and lemmatization; the remaining words are joined back into one
    space-separated string.
    '''
    cleaned = pre_clean_data(text)
    tokens = tokkenize_words(cleaned)
    content_words = remove_stopwords(tokens)
    lemmas = lemmatize(content_words)
    return " ".join(lemmas)

In [29]:
#final_pre_pro(df['paper_text'][0])

# 5. Creating pre-processor pipeline

In [25]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer

# from WorkingPaper.script.pre_pro import final_pre_pro

In [97]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, Conv1D, Dense, Flatten
#len(tokenizer.word_index)

In [92]:

def preprocess_features(X):
    '''
    Clean the 'paper_text' column of `X` and return the cleaned values.

    final_pre_pro is mapped over the column inside a one-step scikit-learn
    pipeline (a FunctionTransformer).
    '''
    def clean_series(sentence):
        return sentence.map(final_pre_pro)

    cleaning_pipe = make_pipeline(FunctionTransformer(clean_series))
    return cleaning_pipe.fit_transform(X['paper_text'])

In [95]:
#X['paper_text'][0]
# Clean every training text; the result is displayed, not stored.
preprocess_features(X_train)

Index
522     modular graph form ii iterate integral uuitp m...
513     submission jhep note ramondramond spinors bisp...
81      report coj detection molecular cloud extend ul...
1249    bound multigraded regularity juliette bruce la...
1036    radial solution fully non linear degenerate si...
                              ...                        
1130    qualitative analysis solution mixedorder posit...
1294    let relatively compact connect open subset smo...
860     current literature quantum key distribution qk...
1459    word measure unitary group improve bound small...
1126    aximal operator haar multiplier variable lebes...
Name: paper_text, Length: 1363, dtype: object

# 6. model

In [99]:
#from WorkingPaper.preprocessing import final_pre_pro
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, Conv1D, Dense, Flatten

In [None]:
def model_conv(X_train):
    '''
    Build and compile a small Conv1D binary classifier sized for `X_train`.

    A Tokenizer is fitted on `X_train` only to obtain the vocabulary size and
    the padded sequence length used to dimension the Embedding layer. The
    compiled (untrained) model is returned.

    NOTE(review): presumably `X_train` is an iterable of text strings - TODO
    confirm against the caller. The fitted tokenizer stays local and is not
    returned.
    '''
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)

    # Vectorize: texts -> integer sequences, padded at the end with 0.
    sequences = tokenizer.texts_to_sequences(X_train)
    X_train_vect = pad_sequences(sequences, padding="post", value=0.)

    embed_len = 50
    vocab_size = len(tokenizer.word_index)+1  # +1 because 0 is the padding value
    sequence_len = X_train_vect.shape[1]

    # Embedding -> Conv1D -> Flatten -> single sigmoid unit
    embedding = Embedding(
        input_dim=vocab_size,
        input_length=sequence_len,
        output_dim=embed_len,
        mask_zero=True,
    )
    cnn = Sequential([
        embedding,
        Conv1D(20, kernel_size=3),
        Flatten(),
        Dense(1, activation="sigmoid"),
    ])

    cnn.summary()
    cnn.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return cnn
    