# Group Project: Arxiv Classifier
Authors: Amir Yunus, Brandon Gay, Lee Oon Teng

In [1]:
from utils import load_data, save_data
import utils.feature_engineering as engineer

# 03 - Feature Engineering

## 03.01 - Load the Data

In [2]:
filename = 'arxiv_lite'
df = load_data(f'../data/01_{filename}_processed.bin')

Unnamed: 0,title,summary,comment,authors,category,split
0,gammaray bursts death throes massive binary stars,proposed gammaray bursts created mergers doubl...,pages,ramesh narayan bohdan paczynski tsvi piran,physics,test
1,gravitational lensing variability g,four observables associated gravitational lens...,pages plus figures included,lawrence krauss martin white,physics,val
2,ptolemaic gammaray burst universe,batse experiment gro demonstrated isotropic ar...,pages replaced provide omitted line,j katz,physics,train
3,expanding photospheres type ii supernovae extr...,use expanding photosphere method determine dis...,pages,b p schmidt r p kirshner r g eastman,physics,train
4,radiation transfer gammaray bursts,calculated gammaray radiative transport region...,pages,b j carrigan j katz,physics,test


## 03.02 - Tokenize the Data

In [3]:
df = engineer.tokenize(df)
df.head()

Unnamed: 0,title,summary,comment,authors,category,split
0,"[gammaray, bursts, death, throes, massive, bin...","[proposed, gammaray, bursts, created, mergers,...",[pages],"[ramesh, narayan, bohdan, paczynski, tsvi, piran]",physics,test
1,"[gravitational, lensing, variability, g]","[four, observables, associated, gravitational,...","[pages, plus, figures, included]","[lawrence, krauss, martin, white]",physics,val
2,"[ptolemaic, gammaray, burst, universe]","[batse, experiment, gro, demonstrated, isotrop...","[pages, replaced, provide, omitted, line]","[j, katz]",physics,train
3,"[expanding, photospheres, type, ii, supernovae...","[use, expanding, photosphere, method, determin...",[pages],"[b, p, schmidt, r, p, kirshner, r, g, eastman]",physics,train
4,"[radiation, transfer, gammaray, bursts]","[calculated, gammaray, radiative, transport, r...",[pages],"[b, j, carrigan, j, katz]",physics,test


## 03.03 - Lemmatize the Data

In [4]:
df = engineer.lemmatize(df)
df.head()

Unnamed: 0,title,summary,comment,authors,category,split
0,"[gammaray, bursts, death, throes, massive, bin...","[proposed, gammaray, bursts, created, mergers,...",[pages],"[ramesh, narayan, bohdan, paczynski, tsvi, piran]",physics,test
1,"[gravitational, lensing, variability, g]","[four, observables, associated, gravitational,...","[pages, plus, figures, included]","[lawrence, krauss, martin, white]",physics,val
2,"[ptolemaic, gammaray, burst, universe]","[batse, experiment, gro, demonstrated, isotrop...","[pages, replaced, provide, omitted, line]","[j, katz]",physics,train
3,"[expanding, photospheres, type, ii, supernovae...","[use, expanding, photosphere, method, determin...",[pages],"[b, p, schmidt, r, p, kirshner, r, g, eastman]",physics,train
4,"[radiation, transfer, gammaray, bursts]","[calculated, gammaray, radiative, transport, r...",[pages],"[b, j, carrigan, j, katz]",physics,test


## 03.04 - Vectorize the Data

In [5]:
df = engineer.vectorize(df)
df.head()

Unnamed: 0,title,summary,comment,authors,category,split,title_emb_0,title_emb_1,title_emb_2,title_emb_3,...,authors_emb_758,authors_emb_759,authors_emb_760,authors_emb_761,authors_emb_762,authors_emb_763,authors_emb_764,authors_emb_765,authors_emb_766,authors_emb_767
0,"['gammaray', 'bursts', 'death', 'throes', 'mas...","['proposed', 'gammaray', 'bursts', 'created', ...",['pages'],"['ramesh', 'narayan', 'bohdan', 'paczynski', '...",physics,test,-0.336807,0.205785,0.174136,-0.176798,...,-0.230609,0.555423,-0.263518,-0.273223,0.082736,0.437486,-0.462336,-0.291127,0.521756,0.81472
1,"['gravitational', 'lensing', 'variability', 'g']","['four', 'observables', 'associated', 'gravita...","['pages', 'plus', 'figures', 'included']","['lawrence', 'krauss', 'martin', 'white']",physics,val,-0.193135,0.019145,0.063621,-0.222345,...,-0.010094,0.379848,-0.465717,-0.118567,-0.389373,0.484199,-0.709159,-0.143599,0.40596,1.147051
2,"['ptolemaic', 'gammaray', 'burst', 'universe']","['batse', 'experiment', 'gro', 'demonstrated',...","['pages', 'replaced', 'provide', 'omitted', 'l...","['j', 'katz']",physics,train,-0.307646,0.030398,-0.15352,-0.222931,...,0.068673,0.037468,-0.240701,-0.066213,-0.143102,0.041454,-0.192547,-0.28208,0.475934,0.917386
3,"['expanding', 'photospheres', 'type', 'ii', 's...","['use', 'expanding', 'photosphere', 'method', ...",['pages'],"['b', 'p', 'schmidt', 'r', 'p', 'kirshner', 'r...",physics,train,-0.488771,0.154944,0.239354,-0.140129,...,0.199931,0.206892,-0.26151,-0.330638,-0.219508,0.390227,-0.487357,-0.242951,0.297407,1.077118
4,"['radiation', 'transfer', 'gammaray', 'bursts']","['calculated', 'gammaray', 'radiative', 'trans...",['pages'],"['b', 'j', 'carrigan', 'j', 'katz']",physics,test,-0.167175,0.223677,-0.232622,-0.130705,...,0.254578,0.157032,-0.312766,-0.113976,-0.09607,0.141242,-0.404165,-0.09988,0.449213,0.994779


## 03.05 - Word Count

In [6]:
df = engineer.word_count(df)
df.head()

Unnamed: 0,title,summary,comment,authors,category,split,title_emb_0,title_emb_1,title_emb_2,title_emb_3,...,authors_emb_762,authors_emb_763,authors_emb_764,authors_emb_765,authors_emb_766,authors_emb_767,title_word_count,summary_word_count,comment_word_count,authors_word_count
0,"['gammaray', 'bursts', 'death', 'throes', 'mas...","['proposed', 'gammaray', 'bursts', 'created', ...",['pages'],"['ramesh', 'narayan', 'bohdan', 'paczynski', '...",physics,test,-0.336807,0.205785,0.174136,-0.176798,...,0.082736,0.437486,-0.462336,-0.291127,0.521756,0.81472,7,72,1,6
1,"['gravitational', 'lensing', 'variability', 'g']","['four', 'observables', 'associated', 'gravita...","['pages', 'plus', 'figures', 'included']","['lawrence', 'krauss', 'martin', 'white']",physics,val,-0.193135,0.019145,0.063621,-0.222345,...,-0.389373,0.484199,-0.709159,-0.143599,0.40596,1.147051,4,68,4,4
2,"['ptolemaic', 'gammaray', 'burst', 'universe']","['batse', 'experiment', 'gro', 'demonstrated',...","['pages', 'replaced', 'provide', 'omitted', 'l...","['j', 'katz']",physics,train,-0.307646,0.030398,-0.15352,-0.222931,...,-0.143102,0.041454,-0.192547,-0.28208,0.475934,0.917386,4,83,5,2
3,"['expanding', 'photospheres', 'type', 'ii', 's...","['use', 'expanding', 'photosphere', 'method', ...",['pages'],"['b', 'p', 'schmidt', 'r', 'p', 'kirshner', 'r...",physics,train,-0.488771,0.154944,0.239354,-0.140129,...,-0.219508,0.390227,-0.487357,-0.242951,0.297407,1.077118,8,157,1,9
4,"['radiation', 'transfer', 'gammaray', 'bursts']","['calculated', 'gammaray', 'radiative', 'trans...",['pages'],"['b', 'j', 'carrigan', 'j', 'katz']",physics,test,-0.167175,0.223677,-0.232622,-0.130705,...,-0.09607,0.141242,-0.404165,-0.09988,0.449213,0.994779,4,91,1,5


## 03.06 - Named Entity Recognition

In [7]:
df = engineer.named_entity_recognition(df)
df.head()

Unnamed: 0,title,summary,comment,authors,category,split,title_emb_0,title_emb_1,title_emb_2,title_emb_3,...,authors_ner_MONEY_count,authors_ner_NORP_count,authors_ner_ORDINAL_count,authors_ner_ORG_count,authors_ner_PERCENT_count,authors_ner_PERSON_count,authors_ner_PRODUCT_count,authors_ner_QUANTITY_count,authors_ner_TIME_count,authors_ner_WORK_OF_ART_count
0,"['gammaray', 'bursts', 'death', 'throes', 'mas...","['proposed', 'gammaray', 'bursts', 'created', ...",['pages'],"['ramesh', 'narayan', 'bohdan', 'paczynski', '...",physics,test,-0.336807,0.205785,0.174136,-0.176798,...,0,0,0,0,0,0,0,0,0,0
1,"['gravitational', 'lensing', 'variability', 'g']","['four', 'observables', 'associated', 'gravita...","['pages', 'plus', 'figures', 'included']","['lawrence', 'krauss', 'martin', 'white']",physics,val,-0.193135,0.019145,0.063621,-0.222345,...,0,0,0,0,0,0,0,0,0,0
2,"['ptolemaic', 'gammaray', 'burst', 'universe']","['batse', 'experiment', 'gro', 'demonstrated',...","['pages', 'replaced', 'provide', 'omitted', 'l...","['j', 'katz']",physics,train,-0.307646,0.030398,-0.15352,-0.222931,...,0,0,0,0,0,0,0,0,0,0
3,"['expanding', 'photospheres', 'type', 'ii', 's...","['use', 'expanding', 'photosphere', 'method', ...",['pages'],"['b', 'p', 'schmidt', 'r', 'p', 'kirshner', 'r...",physics,train,-0.488771,0.154944,0.239354,-0.140129,...,0,0,0,0,0,0,0,0,0,0
4,"['radiation', 'transfer', 'gammaray', 'bursts']","['calculated', 'gammaray', 'radiative', 'trans...",['pages'],"['b', 'j', 'carrigan', 'j', 'katz']",physics,test,-0.167175,0.223677,-0.232622,-0.130705,...,0,0,0,0,0,0,0,0,0,0


## 03.07 - Sentiment Analysis

In [8]:
df = engineer.sentiment_analysis(df)
df.head()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Unnamed: 0,title,summary,comment,authors,category,split,title_emb_0,title_emb_1,title_emb_2,title_emb_3,...,authors_ner_PERCENT_count,authors_ner_PERSON_count,authors_ner_PRODUCT_count,authors_ner_QUANTITY_count,authors_ner_TIME_count,authors_ner_WORK_OF_ART_count,title_sentiment,summary_sentiment,comment_sentiment,authors_sentiment
0,"['gammaray', 'bursts', 'death', 'throes', 'mas...","['proposed', 'gammaray', 'bursts', 'created', ...",['pages'],"['ramesh', 'narayan', 'bohdan', 'paczynski', '...",physics,test,-0.336807,0.205785,0.174136,-0.176798,...,0,0,0,0,0,0,0.0,-0.08545,0.0,0.0
1,"['gravitational', 'lensing', 'variability', 'g']","['four', 'observables', 'associated', 'gravita...","['pages', 'plus', 'figures', 'included']","['lawrence', 'krauss', 'martin', 'white']",physics,val,-0.193135,0.019145,0.063621,-0.222345,...,0,0,0,0,0,0,0.0,-0.055556,0.0,0.0
2,"['ptolemaic', 'gammaray', 'burst', 'universe']","['batse', 'experiment', 'gro', 'demonstrated',...","['pages', 'replaced', 'provide', 'omitted', 'l...","['j', 'katz']",physics,train,-0.307646,0.030398,-0.15352,-0.222931,...,0,0,0,0,0,0,0.0,0.026833,0.0,0.0
3,"['expanding', 'photospheres', 'type', 'ii', 's...","['use', 'expanding', 'photosphere', 'method', ...",['pages'],"['b', 'p', 'schmidt', 'r', 'p', 'kirshner', 'r...",physics,train,-0.488771,0.154944,0.239354,-0.140129,...,0,0,0,0,0,0,0.0,0.033333,0.0,0.0
4,"['radiation', 'transfer', 'gammaray', 'bursts']","['calculated', 'gammaray', 'radiative', 'trans...",['pages'],"['b', 'j', 'carrigan', 'j', 'katz']",physics,test,-0.167175,0.223677,-0.232622,-0.130705,...,0,0,0,0,0,0,0.0,0.109,0.0,0.0


## 03.08 - Text Complexity

In [9]:
df = engineer.text_complexity(df)
df.head()

Unnamed: 0,title,summary,comment,authors,category,split,title_emb_0,title_emb_1,title_emb_2,title_emb_3,...,authors_ner_TIME_count,authors_ner_WORK_OF_ART_count,title_sentiment,summary_sentiment,comment_sentiment,authors_sentiment,title_ari,summary_ari,comment_ari,authors_ari
0,"['gammaray', 'bursts', 'death', 'throes', 'mas...","['proposed', 'gammaray', 'bursts', 'created', ...",['pages'],"['ramesh', 'narayan', 'bohdan', 'paczynski', '...",physics,test,-0.336807,0.205785,0.174136,-0.176798,...,0,0,0.0,-0.08545,0.0,0.0,14,14,14.0,14.0
1,"['gravitational', 'lensing', 'variability', 'g']","['four', 'observables', 'associated', 'gravita...","['pages', 'plus', 'figures', 'included']","['lawrence', 'krauss', 'martin', 'white']",physics,val,-0.193135,0.019145,0.063621,-0.222345,...,0,0,0.0,-0.055556,0.0,0.0,14,14,14.0,14.0
2,"['ptolemaic', 'gammaray', 'burst', 'universe']","['batse', 'experiment', 'gro', 'demonstrated',...","['pages', 'replaced', 'provide', 'omitted', 'l...","['j', 'katz']",physics,train,-0.307646,0.030398,-0.15352,-0.222931,...,0,0,0.0,0.026833,0.0,0.0,14,14,14.0,7.83
3,"['expanding', 'photospheres', 'type', 'ii', 's...","['use', 'expanding', 'photosphere', 'method', ...",['pages'],"['b', 'p', 'schmidt', 'r', 'p', 'kirshner', 'r...",physics,train,-0.488771,0.154944,0.239354,-0.140129,...,0,0,0.0,0.033333,0.0,0.0,14,14,14.0,12.376667
4,"['radiation', 'transfer', 'gammaray', 'bursts']","['calculated', 'gammaray', 'radiative', 'trans...",['pages'],"['b', 'j', 'carrigan', 'j', 'katz']",physics,test,-0.167175,0.223677,-0.232622,-0.130705,...,0,0,0.0,0.109,0.0,0.0,14,14,14.0,10.272


## 03.09 - Prepare the Data for Modelling

In [10]:
df = engineer.prepare(df)
df.head()

Unnamed: 0,title_emb_0,title_emb_1,title_emb_2,title_emb_3,title_emb_4,title_emb_5,title_emb_6,title_emb_7,title_emb_8,title_emb_9,...,title_sentiment,summary_sentiment,comment_sentiment,authors_sentiment,title_ari,summary_ari,comment_ari,authors_ari,category,split
0,-0.336807,0.205785,0.174136,-0.176798,-0.305941,0.322368,0.129586,0.781946,-0.403417,-0.099831,...,0.0,-0.08545,0.0,0.0,14,14,14.0,14.0,physics,test
1,-0.193135,0.019145,0.063621,-0.222345,-0.535467,0.131508,0.126612,0.577781,-0.560718,-0.176797,...,0.0,-0.055556,0.0,0.0,14,14,14.0,14.0,physics,val
2,-0.307646,0.030398,-0.15352,-0.222931,-0.390395,0.065084,0.222042,0.624599,-0.591892,-0.031597,...,0.0,0.026833,0.0,0.0,14,14,14.0,7.83,physics,train
3,-0.488771,0.154944,0.239354,-0.140129,-0.166575,0.140046,0.091391,0.68305,-0.515521,-0.219456,...,0.0,0.033333,0.0,0.0,14,14,14.0,12.376667,physics,train
4,-0.167175,0.223677,-0.232622,-0.130705,-0.447192,0.078025,0.250789,0.606733,-0.562184,-0.103338,...,0.0,0.109,0.0,0.0,14,14,14.0,10.272,physics,test


## 03.10 - Save the Data

In [11]:
save_data(df, f'../data/03_{filename}_engineered.bin')

Data saved successfully to ../data/03_arxiv_lite_engineered.bin
