In [29]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [75]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import gensim.downloader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras import layers, Sequential #, Dense, Flatten
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Early Experimenting in order to get to a BaseModel
#### with only minimal preprocessing

In [40]:
# Reading Data
data = pd.read_csv('../raw_data/small_dataset.csv')
data = data.drop(columns='Unnamed: 0')
data = data.rename(columns={'0': 'text'})
data.head()

Unnamed: 0,text,subtopic
0,2 2 0 2 g u A 6 2 ] E H . h p - o r t s a [ ...,Astrophysics
1,"Draft version August 29, 2022 Typeset using LA...",Astrophysics
2,Astronomy & Astrophysics manuscript no. 41891c...,Astrophysics
3,Astronomy & Astrophysics manuscript no. aa Aug...,Astrophysics
4,2 2 0 2 g u A 6 2 ] R S . h p - o r t s a [...,Astrophysics


In [41]:
# Lowercasing everything
data = data.apply(lambda x: x.astype(str).str.lower())

In [42]:
# Removing anything apart from lower case letters 
data['text'] = data['text'].apply(lambda x: re.sub(r'[^a-z]', ' ', x))

In [44]:
# Tokenizing
data['modified text'] = data['text'].apply(word_tokenize)

In [49]:
# Stopwords Removal and Removal of words consisting of single letters
stop_words = set(stopwords.words('english'))

data['modified text'] = data['modified text'].apply(lambda x: [word for word in x if not word in stop_words])
data['modified text'] = data['modified text'].apply(lambda x: [word for word in x if len(word)>1])

data['words per text'] = data['text'].apply(lambda x : len(x)) 
data['words per modified text'] = data['modified text'].apply(lambda x : len(x)) 
data

Unnamed: 0,text,subtopic,modified text,words per text,words per modified text
0,g u a e h h p o r t s a ...,astrophysics,"[mnras, preprint, august, compiled, using, mnr...",58690,4674
1,draft version august typeset using la...,astrophysics,"[draft, version, august, typeset, using, latex...",55930,5331
2,astronomy astrophysics manuscript no c...,astrophysics,"[astronomy, astrophysics, manuscript, corr, au...",46117,4103
3,astronomy astrophysics manuscript no aa aug...,astrophysics,"[astronomy, astrophysics, manuscript, aa, augu...",78395,7614
4,g u a r s h p o r t s a ...,astrophysics,"[mnras, preprint, august, compiled, using, mnr...",61921,5454
...,...,...,...,...,...
2030,g u a a f h t a m v ...,symplectic geometry,"[fusion, frame, homotopy, tightening, fusion, ...",70325,6085
2031,universidad complutense de madrid facultad de...,symplectic geometry,"[universidad, complutense, de, madrid, faculta...",79210,4816
2032,g u a g s h t a m v ...,symplectic geometry,"[barcode, pair, compact, exact, lagrangians, p...",46697,3556
2033,g u a g s h t a m v ...,symplectic geometry,"[locally, conformally, symplectic, deformation...",26625,2023


In [57]:
# Target Encoding, but turning subtopics into topics first
data['subtopic'].unique()

array(['astrophysics', 'condensed matter',
       'general relativity and quantum cosmology',
       'high energy physics - experiment',
       'high energy physics - lattice',
       'high energy physics - phenomenology',
       'high energy physics - theory', 'mathematical physics',
       'nonlinear sciences', 'nuclear experiment', 'nuclear theory',
       'quantum physics', 'algebraic geometry', 'algebraic topology',
       'analysis of pdes', 'category theory',
       'classical analysis and odes', 'combinatorics',
       'commutative algebra', 'complex variables',
       'differential geometry', 'dynamical systems',
       'functional analysis', 'general mathematics', 'general topology',
       'geometric topology', 'group theory', 'history and overview',
       'information theory', 'k-theory and homology', 'logic',
       'metric geometry', 'number theory', 'numerical analysis',
       'operator algebras', 'optimization and control', 'probability',
       'quantum algebra', 're

In [68]:
physics_list = ['astrophysics', 'condensed matter', 'general relativity and quantum cosmology', 'high energy physics - experiment', 
                'high energy physics - lattice', 'high energy physics - phenomenology', 'high energy physics - Tteory',
                'mathematical physics', 'nonlinear sciences', 'nuclear experiment', 'nuclear theory', 'physics', 'quantum physics']

In [76]:
# Physics as mapped with 0, Maths is mapped with 1
data['topic'] = data['subtopic'].apply(lambda x: 0 if x in physics_list else 1)

In [77]:
data['topic'].value_counts()

1    1196
0     839
Name: topic, dtype: int64

In [82]:
data[['modified text', 'topic']]

Unnamed: 0,modified text,topic
0,"[mnras, preprint, august, compiled, using, mnr...",0
1,"[draft, version, august, typeset, using, latex...",0
2,"[astronomy, astrophysics, manuscript, corr, au...",0
3,"[astronomy, astrophysics, manuscript, aa, augu...",0
4,"[mnras, preprint, august, compiled, using, mnr...",0
...,...,...
2030,"[fusion, frame, homotopy, tightening, fusion, ...",1
2031,"[universidad, complutense, de, madrid, faculta...",1
2032,"[barcode, pair, compact, exact, lagrangians, p...",1
2033,"[locally, conformally, symplectic, deformation...",1


In [79]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(data['modified text'], data['topic'], test_size=0.3)

In [8]:
# Downloading a pre-trained model, based on a 50 space vector representation from wikidata
model_wiki = gensim.downloader.load('glove-wiki-gigaword-50')

In [12]:
# Checing the shape of the vector
model_wiki['saturn'].shape

(50,)

In [None]:
glove_file = datapath('~/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz')
tmp_file = get_tmpfile("test_word2vec.txt")
#tmp_file = data['modified text']

_ = glove2word2vec(glove_file, tmp_file)

glove_vectors = KeyedVectors.load_word2vec_format(tmp_file)

In [None]:
# Add Padding
# data['modified text padded']
X_pad = pad_sequences(data['modified text'], dtype='str', padding='post', value=0)

### Neural network architechture with three components:

    An embedding layer that generates word embedding, and the parameters are shared across words.
    A hidden layer of one or more layers, which introduces non-linearity to the embeddings.
    A softmax function that produces probability distribution over all the words in the vocabulary. 



In [None]:
embedding_size = 50 # same as in the pretrained model

model = Sequential()

model.add(layers.LSTM(20))
model.add(layers.Dense(1, activation="sigmoid"))
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])