# Group Project: Arxiv Classifier
Authors: Amir Yunus, Brandon Gay, Lee Oon Teng

In [1]:
from utils import load_data, save_data
import utils.feature_engineering as engineer

# 03 - Feature Engineering

## 03.01 - Load the Data

In [2]:
# Base name shared by the input path below and the output paths written in the
# final save cell of this notebook.
filename = 'arxiv'

# Read the processed dataset (the "01_" artifact) into the working frame.
processed_path = f'../data/01_{filename}_processed.bin'
df = load_data(processed_path)

# Show (rows, columns) as a quick sanity check on what was loaded.
df.shape

Unnamed: 0,title,summary,comment,authors,category,split
0,gammaray bursts death throes massive binary stars,proposed gammaray bursts created mergers doubl...,pages,ramesh narayan bohdan paczynski tsvi piran,physics,train
1,gravitational lensing variability g,four observables associated gravitational lens...,pages plus figures included,lawrence krauss martin white,physics,test
2,ptolemaic gammaray burst universe,batse experiment gro demonstrated isotropic ar...,pages replaced provide omitted line,j katz,physics,train
3,expanding photospheres type ii supernovae extr...,use expanding photosphere method determine dis...,pages,b p schmidt r p kirshner r g eastman,physics,val
4,radiation transfer gammaray bursts,calculated gammaray radiative transport region...,pages,b j carrigan j katz,physics,train


(471879, 6)

## 03.02 - Tokenize and Lemmatize the Data

In [3]:
# Tokenize/lemmatize the text columns, processed in batches of `batch_size`.
# The recorded output shows four columns being processed (title, summary,
# comment, authors) and their values becoming token lists that include
# subword pieces such as "##ray".
df = engineer.lemmatize(
    df,
    batch_size=20_000,
)
df.head()

Using 8 GPUs!


Lemmatizing columns:   0%|          | 0/4 [00:00<?, ?it/s]
Processing text column:   0%|          | 0/24 [00:00<?, ?it/s][A
Processing text column:   4%|▍         | 1/24 [00:02<01:03,  2.77s/it][A
Processing text column:   8%|▊         | 2/24 [00:05<01:01,  2.79s/it][A
Processing text column:  12%|█▎        | 3/24 [00:08<01:00,  2.88s/it][A
Processing text column:  17%|█▋        | 4/24 [00:11<00:57,  2.89s/it][A
Processing text column:  21%|██        | 5/24 [00:14<00:53,  2.82s/it][A
Processing text column:  25%|██▌       | 6/24 [00:16<00:50,  2.79s/it][A
Processing text column:  29%|██▉       | 7/24 [00:19<00:47,  2.79s/it][A
Processing text column:  33%|███▎      | 8/24 [00:22<00:46,  2.90s/it][A
Processing text column:  38%|███▊      | 9/24 [00:25<00:42,  2.87s/it][A
Processing text column:  42%|████▏     | 10/24 [00:28<00:40,  2.88s/it][A
Processing text column:  46%|████▌     | 11/24 [00:31<00:37,  2.89s/it][A
Processing text column:  50%|█████     | 12/24 [00:33<00:32

Unnamed: 0,title,summary,comment,authors,category,split
0,"[gamma, ##ray, bursts, death, th, ##ro, ##es, ...","[proposed, gamma, ##ray, bursts, created, merg...",[pages],"[ram, ##esh, narayan, bo, ##hd, ##an, pac, ##z...",physics,train
1,"[gravitational, lens, ##ing, variability, g]","[four, ob, ##ser, ##vable, ##s, associated, gr...","[pages, plus, figures, included]","[lawrence, k, ##raus, ##s, martin, white]",physics,test
2,"[pt, ##ole, ##ma, ##ic, gamma, ##ray, burst, u...","[bats, ##e, experiment, gr, ##o, demonstrated,...","[pages, replaced, provide, omitted, line]","[j, katz]",physics,train
3,"[expanding, photos, ##pher, ##es, type, ii, su...","[use, expanding, photos, ##pher, ##e, method, ...",[pages],"[b, p, schmidt, r, p, ki, ##rs, ##hner, r, g, ...",physics,val
4,"[radiation, transfer, gamma, ##ray, bursts]","[calculated, gamma, ##ray, ra, ##dia, ##tive, ...",[pages],"[b, j, carr, ##igan, j, katz]",physics,train


## 03.03 - Vectorize the Data

In [4]:
# Embed the text columns, processed in batches of `batch_size`.
# The recorded output shows new columns named `<column>_emb_<i>` being added
# (title_emb_0 ... authors_emb_767).
# NOTE(review): in the recorded output the token columns are displayed as
# quoted strings after this step, whereas they were displayed as lists before
# it. Presumably vectorize stringifies them; confirm that later steps expect
# that representation.
df = engineer.vectorize(
    df,
    batch_size=10_000,
)
df.head()

Using 8 GPUs!


Vectorizing columns:   0%|          | 0/4 [00:00<?, ?it/s]
Vectorizing title:   0%|          | 0/48 [00:00<?, ?it/s][A
Vectorizing title:   2%|▏         | 1/48 [00:16<13:02, 16.64s/it][A
Vectorizing title:   4%|▍         | 2/48 [00:25<09:15, 12.08s/it][A
Vectorizing title:   6%|▋         | 3/48 [00:33<07:39, 10.20s/it][A
Vectorizing title:   8%|▊         | 4/48 [00:39<06:20,  8.65s/it][A
Vectorizing title:  10%|█         | 5/48 [00:47<06:03,  8.46s/it][A
Vectorizing title:  12%|█▎        | 6/48 [00:56<06:01,  8.61s/it][A
Vectorizing title:  15%|█▍        | 7/48 [01:03<05:32,  8.10s/it][A
Vectorizing title:  17%|█▋        | 8/48 [01:11<05:22,  8.06s/it][A
Vectorizing title:  19%|█▉        | 9/48 [01:18<04:59,  7.69s/it][A
Vectorizing title:  21%|██        | 10/48 [01:30<05:40,  8.96s/it][A
Vectorizing title:  23%|██▎       | 11/48 [01:37<05:09,  8.36s/it][A
Vectorizing title:  25%|██▌       | 12/48 [01:43<04:37,  7.70s/it][A
Vectorizing title:  27%|██▋       | 13/48 [01:50<

Unnamed: 0,title,summary,comment,authors,category,split,title_emb_0,title_emb_1,title_emb_2,title_emb_3,...,authors_emb_758,authors_emb_759,authors_emb_760,authors_emb_761,authors_emb_762,authors_emb_763,authors_emb_764,authors_emb_765,authors_emb_766,authors_emb_767
0,"['gamma', '##ray', 'bursts', 'death', 'th', '#...","['proposed', 'gamma', '##ray', 'bursts', 'crea...",['pages'],"['ram', '##esh', 'narayan', 'bo', '##hd', '##a...",physics,train,-0.340067,0.275288,0.076594,-0.272523,...,0.02576,0.614236,-0.493632,-0.30018,0.253533,0.20291,-0.438771,-0.174819,0.130876,1.128313
1,"['gravitational', 'lens', '##ing', 'variabilit...","['four', 'ob', '##ser', '##vable', '##s', 'ass...","['pages', 'plus', 'figures', 'included']","['lawrence', 'k', '##raus', '##s', 'martin', '...",physics,test,-0.115177,0.08105,0.103687,-0.238543,...,0.066363,0.448652,-0.584104,-0.105252,-0.058765,0.153032,-0.549548,-0.246364,0.394485,0.978156
2,"['pt', '##ole', '##ma', '##ic', 'gamma', '##ra...","['bats', '##e', 'experiment', 'gr', '##o', 'de...","['pages', 'replaced', 'provide', 'omitted', 'l...","['j', 'katz']",physics,train,-0.392568,0.203008,0.196116,-0.296451,...,0.068674,0.037468,-0.2407,-0.066213,-0.143102,0.041454,-0.192546,-0.282079,0.475934,0.917385
3,"['expanding', 'photos', '##pher', '##es', 'typ...","['use', 'expanding', 'photos', '##pher', '##e'...",['pages'],"['b', 'p', 'schmidt', 'r', 'p', 'ki', '##rs', ...",physics,val,-0.289894,0.23978,0.32229,-0.115017,...,0.274846,0.265728,-0.479003,-0.234115,-0.119185,0.3304,-0.390697,-0.215028,0.279082,1.166666
4,"['radiation', 'transfer', 'gamma', '##ray', 'b...","['calculated', 'gamma', '##ray', 'ra', '##dia'...",['pages'],"['b', 'j', 'carr', '##igan', 'j', 'katz']",physics,train,-0.020201,0.34504,-0.169287,-0.099256,...,0.069123,0.200748,-0.441566,-0.048565,-0.010595,0.094536,-0.413614,-0.156067,0.287172,0.968181


## 03.04 - Word Count

In [5]:
# Add one `<column>_word_count` feature per text column (the recorded output
# shows title/summary/comment/authors_word_count being appended).
# NOTE(review): the recorded counts appear to be subword-token counts rather
# than whitespace-separated word counts. For example, row 1's title has four
# words in the loaded data but five tokens, and its title_word_count is 5.
# Confirm this is the intended definition.
df = engineer.word_count(
    df,
    batch_size=20_000,
)
df.head()

Counting words:   0%|          | 0/4 [00:00<?, ?it/s]
Processing title:   0%|          | 0/24 [00:00<?, ?it/s][A
Processing title:  33%|███▎      | 8/24 [00:00<00:00, 77.77it/s][A
Processing title:  71%|███████   | 17/24 [00:00<00:00, 83.74it/s][A
Counting words:  25%|██▌       | 1/4 [00:00<00:01,  2.72it/s]    [A
Processing summary:   0%|          | 0/24 [00:00<?, ?it/s][A
Processing summary:   4%|▍         | 1/24 [00:00<00:03,  7.50it/s][A
Processing summary:   8%|▊         | 2/24 [00:00<00:02,  7.94it/s][A
Processing summary:  17%|█▋        | 4/24 [00:00<00:02,  9.60it/s][A
Processing summary:  25%|██▌       | 6/24 [00:00<00:01, 10.89it/s][A
Processing summary:  33%|███▎      | 8/24 [00:00<00:01, 10.65it/s][A
Processing summary:  42%|████▏     | 10/24 [00:00<00:01, 10.22it/s][A
Processing summary:  50%|█████     | 12/24 [00:01<00:01, 11.35it/s][A
Processing summary:  58%|█████▊    | 14/24 [00:01<00:00, 12.18it/s][A
Processing summary:  67%|██████▋   | 16/24 [00:01<00:00

Unnamed: 0,title,summary,comment,authors,category,split,title_emb_0,title_emb_1,title_emb_2,title_emb_3,...,authors_emb_762,authors_emb_763,authors_emb_764,authors_emb_765,authors_emb_766,authors_emb_767,title_word_count,summary_word_count,comment_word_count,authors_word_count
0,"['gamma', '##ray', 'bursts', 'death', 'th', '#...","['proposed', 'gamma', '##ray', 'bursts', 'crea...",['pages'],"['ram', '##esh', 'narayan', 'bo', '##hd', '##a...",physics,train,-0.340067,0.275288,0.076594,-0.272523,...,0.253533,0.20291,-0.438771,-0.174819,0.130876,1.128313,10,109,1,13
1,"['gravitational', 'lens', '##ing', 'variabilit...","['four', 'ob', '##ser', '##vable', '##s', 'ass...","['pages', 'plus', 'figures', 'included']","['lawrence', 'k', '##raus', '##s', 'martin', '...",physics,test,-0.115177,0.08105,0.103687,-0.238543,...,-0.058765,0.153032,-0.549548,-0.246364,0.394485,0.978156,5,91,4,6
2,"['pt', '##ole', '##ma', '##ic', 'gamma', '##ra...","['bats', '##e', 'experiment', 'gr', '##o', 'de...","['pages', 'replaced', 'provide', 'omitted', 'l...","['j', 'katz']",physics,train,-0.392568,0.203008,0.196116,-0.296451,...,-0.143102,0.041454,-0.192546,-0.282079,0.475934,0.917385,8,117,5,2
3,"['expanding', 'photos', '##pher', '##es', 'typ...","['use', 'expanding', 'photos', '##pher', '##e'...",['pages'],"['b', 'p', 'schmidt', 'r', 'p', 'ki', '##rs', ...",physics,val,-0.289894,0.23978,0.32229,-0.115017,...,-0.119185,0.3304,-0.390697,-0.215028,0.279082,1.166666,14,217,1,11
4,"['radiation', 'transfer', 'gamma', '##ray', 'b...","['calculated', 'gamma', '##ray', 'ra', '##dia'...",['pages'],"['b', 'j', 'carr', '##igan', 'j', 'katz']",physics,train,-0.020201,0.34504,-0.169287,-0.099256,...,-0.010595,0.094536,-0.413614,-0.156067,0.287172,0.968181,5,112,1,6


## 03.05 - Named Entity Recognition

In [None]:
# Run named entity recognition over the frame in batches of `batch_size`.
# This is the slow step: the recorded progress bar shows roughly 82-96 s per
# batch on the title column alone (24 batches per column, 4 columns), and the
# saved run has no execution count, so it does not appear to have finished.
df = engineer.named_entity_recognition(
    df,
    batch_size=20_000,
)
df.head()

Processing NER:   0%|          | 0/4 [00:00<?, ?it/s]
Processing title:   0%|          | 0/24 [00:00<?, ?it/s][A
Processing title:   4%|▍         | 1/24 [01:27<33:31, 87.48s/it][A
Processing title:   8%|▊         | 2/24 [02:59<32:59, 89.96s/it][A
Processing title:  12%|█▎        | 3/24 [04:39<33:12, 94.90s/it][A
Processing title:  17%|█▋        | 4/24 [06:17<32:02, 96.10s/it][A
Processing title:  21%|██        | 5/24 [07:42<29:07, 91.98s/it][A
Processing title:  25%|██▌       | 6/24 [09:10<27:12, 90.68s/it][A
Processing title:  29%|██▉       | 7/24 [10:42<25:46, 90.99s/it][A
Processing title:  33%|███▎      | 8/24 [12:23<25:05, 94.08s/it][A
Processing title:  38%|███▊      | 9/24 [13:45<22:37, 90.47s/it][A
Processing title:  42%|████▏     | 10/24 [15:19<21:20, 91.49s/it][A
Processing title:  46%|████▌     | 11/24 [16:41<19:10, 88.49s/it][A
Processing title:  50%|█████     | 12/24 [17:58<17:00, 85.07s/it][A
Processing title:  54%|█████▍    | 13/24 [19:15<15:08, 82.62s/it][

## 03.06 - Sentiment Analysis

In [None]:
# Run the sentiment-analysis step over the frame in batches of `batch_size`.
# NOTE(review): no output is recorded for this cell, so which columns it adds
# cannot be confirmed from the notebook itself. See
# utils.feature_engineering.sentiment_analysis.
df = engineer.sentiment_analysis(
    df,
    batch_size=20_000,
)
df.head()

## 03.07 - Text Complexity

In [None]:
# Run the text-complexity step over the frame in batches of `batch_size`.
# NOTE(review): no output is recorded for this cell, so which columns it adds
# cannot be confirmed from the notebook itself. See
# utils.feature_engineering.text_complexity.
df = engineer.text_complexity(
    df,
    batch_size=20_000,
)
df.head()

## 03.08 - Prepare the Data for Modelling

In [None]:
# Reshape the engineered frame into its modelling-ready form.
# NOTE(review): no output is recorded for this cell. What `prepare` drops,
# encodes, or reorders should be confirmed in utils.feature_engineering.
df = engineer.prepare(
    df,
)
df.head()

## 03.09 - Normalize the Data

In [None]:
# Normalize the frame. The helper returns a pair: the normalized frame and the
# scaler object, which is kept so it can be saved alongside the data.
# NOTE(review): `df` still carries train/val/test rows at load time (see the
# `split` column). Confirm that normalize_dataframe does not fit the scaler on
# val/test rows, which would leak information into evaluation.
normalized = engineer.normalize_dataframe(df)
df, scaler = normalized
df.head()

## 03.10 - Save the Data

In [None]:
# Persist both outputs of this notebook under the "03_" prefix: the engineered
# frame and the scaler returned by the normalization step.
engineered_path = f'../data/03_{filename}_engineered.bin'
scaler_path = f'../data/03_{filename}_scaler.bin'

save_data(df, engineered_path)
save_data(scaler, scaler_path)