# Group Project: Arxiv Classifier
Authors: Amir Yunus, Brandon Gay, Lee Oon Teng

In [1]:
import pickle

from utils import load_data, save_data
import utils.feature_engineering as engineer

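# 03 - Feature Engineering

This notebook turns the processed arXiv records into model-ready features: it tokenizes and lemmatizes the text columns, embeds them, adds word-count, named-entity, sentiment and text-complexity features, then prepares, normalizes and saves the result.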

## 03.01 - Load the Data

In [2]:
# Load the processed dataset (the `01_*_processed.bin` file).
# `filename` is reused further down to name the engineered outputs.
filename = 'arxiv_balanced'
processed_path = f'../data/01_{filename}_processed.bin'
df = load_data(processed_path)
df.shape

Unnamed: 0,title,summary,comment,authors,category,split
0,gammaray bursts death throes massive binary stars,proposed gammaray bursts created mergers doubl...,pages,ramesh narayan bohdan paczynski tsvi piran,physics,train
1,gravitational lensing variability g,four observables associated gravitational lens...,pages plus figures included,lawrence krauss martin white,physics,test
2,ptolemaic gammaray burst universe,batse experiment gro demonstrated isotropic ar...,pages replaced provide omitted line,j katz,physics,train
3,expanding photospheres type ii supernovae extr...,use expanding photosphere method determine dis...,pages,b p schmidt r p kirshner r g eastman,physics,val
4,radiation transfer gammaray bursts,calculated gammaray radiative transport region...,pages,b j carrigan j katz,physics,train


(40586, 6)

## 03.02 - Tokenize and Lemmatize the Data

In [3]:
# Tokenize and lemmatize the four text columns; the preview below shows them
# replaced by lists of subword tokens (e.g. `##ray`).
# With 40,586 rows this batch size yields the two batches per column seen in
# the progress output.
# NOTE(review): 20_560 is a multiple of the 8 GPUs reported below, which is
# presumably why it was picked - confirm.
LEMMATIZE_BATCH_SIZE = 20_560
df = engineer.lemmatize(df, batch_size=LEMMATIZE_BATCH_SIZE)
df.head()

Using 8 GPUs!


Lemmatizing columns:   0%|          | 0/4 [00:00<?, ?it/s]
Processing text column:   0%|          | 0/2 [00:00<?, ?it/s][A
Processing text column:  50%|█████     | 1/2 [00:02<00:02,  2.64s/it][A
Processing text column: 100%|██████████| 2/2 [00:05<00:00,  2.83s/it][A

Processing text column: 100%|██████████| 2/2 [00:00<00:00, 420.76it/s]
Lemmatizing columns:  25%|██▌       | 1/4 [00:05<00:17,  5.67s/it]
Processing text column:   0%|          | 0/2 [00:00<?, ?it/s][A
Processing text column:  50%|█████     | 1/2 [00:24<00:24, 24.01s/it][A
Processing text column: 100%|██████████| 2/2 [00:53<00:00, 26.51s/it][A

Processing text column: 100%|██████████| 2/2 [00:00<00:00, 380.75it/s]
Lemmatizing columns:  50%|█████     | 2/4 [00:58<01:07, 33.54s/it]
Processing text column:   0%|          | 0/2 [00:00<?, ?it/s][A
Processing text column:  50%|█████     | 1/2 [00:03<00:03,  3.13s/it][A
Processing text column: 100%|██████████| 2/2 [00:05<00:00,  2.60s/it][A

Processing text column: 100%|

Unnamed: 0,title,summary,comment,authors,category,split
0,"[gamma, ##ray, bursts, death, th, ##ro, ##es, ...","[proposed, gamma, ##ray, bursts, created, merg...",[pages],"[ram, ##esh, narayan, bo, ##hd, ##an, pac, ##z...",physics,train
1,"[gravitational, lens, ##ing, variability, g]","[four, ob, ##ser, ##vable, ##s, associated, gr...","[pages, plus, figures, included]","[lawrence, k, ##raus, ##s, martin, white]",physics,test
2,"[pt, ##ole, ##ma, ##ic, gamma, ##ray, burst, u...","[bats, ##e, experiment, gr, ##o, demonstrated,...","[pages, replaced, provide, omitted, line]","[j, katz]",physics,train
3,"[expanding, photos, ##pher, ##es, type, ii, su...","[use, expanding, photos, ##pher, ##e, method, ...",[pages],"[b, p, schmidt, r, p, ki, ##rs, ##hner, r, g, ...",physics,val
4,"[radiation, transfer, gamma, ##ray, bursts]","[calculated, gamma, ##ray, ra, ##dia, ##tive, ...",[pages],"[b, j, carr, ##igan, j, katz]",physics,train


## 03.03 - Vectorize the Data

In [4]:
# Embed each text column; the preview below gains `<column>_emb_<i>` features
# (indices up to 767 are visible). This batch size gives the five batches per
# column shown in the progress output.
# NOTE(review): after this step the token columns are displayed with quotes
# (e.g. "['gamma', '##ray', ...]"), which suggests they now hold strings
# rather than lists - confirm the later steps expect that.
VECTORIZE_BATCH_SIZE = 8_224
df = engineer.vectorize(df, batch_size=VECTORIZE_BATCH_SIZE)
df.head()

Using 8 GPUs!


Vectorizing columns:   0%|          | 0/4 [00:00<?, ?it/s]
Vectorizing title:   0%|          | 0/5 [00:00<?, ?it/s][A
Vectorizing title:  20%|██        | 1/5 [00:19<01:18, 19.74s/it][A
Vectorizing title:  40%|████      | 2/5 [00:27<00:38, 12.78s/it][A
Vectorizing title:  60%|██████    | 3/5 [00:34<00:20, 10.20s/it][A
Vectorizing title:  80%|████████  | 4/5 [00:41<00:08,  8.72s/it][A
Vectorizing title: 100%|██████████| 5/5 [00:51<00:00, 10.28s/it][A
Vectorizing columns:  25%|██▌       | 1/4 [00:58<02:56, 58.91s/it]
Vectorizing summary:   0%|          | 0/5 [00:00<?, ?it/s][A
Vectorizing summary:  20%|██        | 1/5 [00:32<02:11, 32.90s/it][A
Vectorizing summary:  40%|████      | 2/5 [00:59<01:27, 29.02s/it][A
Vectorizing summary:  60%|██████    | 3/5 [01:27<00:57, 28.63s/it][A
Vectorizing summary:  80%|████████  | 4/5 [01:57<00:29, 29.35s/it][A
Vectorizing summary: 100%|██████████| 5/5 [02:26<00:00, 29.38s/it][A
Vectorizing columns:  50%|█████     | 2/4 [03:33<03:50, 115.23

Unnamed: 0,title,summary,comment,authors,category,split,title_emb_0,title_emb_1,title_emb_2,title_emb_3,...,authors_emb_758,authors_emb_759,authors_emb_760,authors_emb_761,authors_emb_762,authors_emb_763,authors_emb_764,authors_emb_765,authors_emb_766,authors_emb_767
0,"['gamma', '##ray', 'bursts', 'death', 'th', '#...","['proposed', 'gamma', '##ray', 'bursts', 'crea...",['pages'],"['ram', '##esh', 'narayan', 'bo', '##hd', '##a...",physics,train,-0.340067,0.275288,0.076594,-0.272523,...,0.02576,0.614236,-0.493632,-0.30018,0.253533,0.20291,-0.438771,-0.174819,0.130876,1.128313
1,"['gravitational', 'lens', '##ing', 'variabilit...","['four', 'ob', '##ser', '##vable', '##s', 'ass...","['pages', 'plus', 'figures', 'included']","['lawrence', 'k', '##raus', '##s', 'martin', '...",physics,test,-0.115177,0.08105,0.103687,-0.238543,...,0.066363,0.448652,-0.584104,-0.105252,-0.058765,0.153032,-0.549548,-0.246364,0.394485,0.978156
2,"['pt', '##ole', '##ma', '##ic', 'gamma', '##ra...","['bats', '##e', 'experiment', 'gr', '##o', 'de...","['pages', 'replaced', 'provide', 'omitted', 'l...","['j', 'katz']",physics,train,-0.392568,0.203008,0.196116,-0.296451,...,0.068674,0.037468,-0.2407,-0.066213,-0.143102,0.041454,-0.192546,-0.282079,0.475934,0.917385
3,"['expanding', 'photos', '##pher', '##es', 'typ...","['use', 'expanding', 'photos', '##pher', '##e'...",['pages'],"['b', 'p', 'schmidt', 'r', 'p', 'ki', '##rs', ...",physics,val,-0.289894,0.23978,0.32229,-0.115017,...,0.274846,0.265728,-0.479003,-0.234115,-0.119185,0.3304,-0.390697,-0.215028,0.279082,1.166666
4,"['radiation', 'transfer', 'gamma', '##ray', 'b...","['calculated', 'gamma', '##ray', 'ra', '##dia'...",['pages'],"['b', 'j', 'carr', '##igan', 'j', 'katz']",physics,train,-0.020201,0.34504,-0.169287,-0.099256,...,0.069123,0.200748,-0.441566,-0.048565,-0.010595,0.094536,-0.413614,-0.156067,0.287172,0.968181


## 03.04 - Word Count

In [5]:
# Add one `<column>_word_count` feature per text column.
# NOTE(review): in the preview the counts match the number of subword tokens,
# not whitespace-separated words (row 1's title has 5 tokens and a count of 5
# for a 4-word title) - confirm that is the intended definition.
WORD_COUNT_BATCH_SIZE = 1_000
df = engineer.word_count(df, batch_size=WORD_COUNT_BATCH_SIZE)
df.head()

Counting words:   0%|          | 0/4 [00:00<?, ?it/s]
Processing title:   0%|          | 0/41 [00:00<?, ?it/s][A
                                                        [A
Processing summary:   0%|          | 0/41 [00:00<?, ?it/s][A
Processing summary:  49%|████▉     | 20/41 [00:00<00:00, 196.49it/s][A
Processing summary:  98%|█████████▊| 40/41 [00:00<00:00, 183.99it/s][A
Counting words:  50%|█████     | 2/4 [00:00<00:00,  7.60it/s]       [A
Processing comment:   0%|          | 0/41 [00:00<?, ?it/s][A
                                                          [A
Processing authors:   0%|          | 0/41 [00:00<?, ?it/s][A
Counting words: 100%|██████████| 4/4 [00:00<00:00, 12.10it/s]


Unnamed: 0,title,summary,comment,authors,category,split,title_emb_0,title_emb_1,title_emb_2,title_emb_3,...,authors_emb_762,authors_emb_763,authors_emb_764,authors_emb_765,authors_emb_766,authors_emb_767,title_word_count,summary_word_count,comment_word_count,authors_word_count
0,"['gamma', '##ray', 'bursts', 'death', 'th', '#...","['proposed', 'gamma', '##ray', 'bursts', 'crea...",['pages'],"['ram', '##esh', 'narayan', 'bo', '##hd', '##a...",physics,train,-0.340067,0.275288,0.076594,-0.272523,...,0.253533,0.20291,-0.438771,-0.174819,0.130876,1.128313,10,109,1,13
1,"['gravitational', 'lens', '##ing', 'variabilit...","['four', 'ob', '##ser', '##vable', '##s', 'ass...","['pages', 'plus', 'figures', 'included']","['lawrence', 'k', '##raus', '##s', 'martin', '...",physics,test,-0.115177,0.08105,0.103687,-0.238543,...,-0.058765,0.153032,-0.549548,-0.246364,0.394485,0.978156,5,91,4,6
2,"['pt', '##ole', '##ma', '##ic', 'gamma', '##ra...","['bats', '##e', 'experiment', 'gr', '##o', 'de...","['pages', 'replaced', 'provide', 'omitted', 'l...","['j', 'katz']",physics,train,-0.392568,0.203008,0.196116,-0.296451,...,-0.143102,0.041454,-0.192546,-0.282079,0.475934,0.917385,8,117,5,2
3,"['expanding', 'photos', '##pher', '##es', 'typ...","['use', 'expanding', 'photos', '##pher', '##e'...",['pages'],"['b', 'p', 'schmidt', 'r', 'p', 'ki', '##rs', ...",physics,val,-0.289894,0.23978,0.32229,-0.115017,...,-0.119185,0.3304,-0.390697,-0.215028,0.279082,1.166666,14,217,1,11
4,"['radiation', 'transfer', 'gamma', '##ray', 'b...","['calculated', 'gamma', '##ray', 'ra', '##dia'...",['pages'],"['b', 'j', 'carr', '##igan', 'j', 'katz']",physics,train,-0.020201,0.34504,-0.169287,-0.099256,...,-0.010595,0.094536,-0.413614,-0.156067,0.287172,0.968181,5,112,1,6


## 03.05 - Named Entity Recognition

In [6]:
# Add per-label entity counts (`<column>_ner_<LABEL>_count`) for the text
# columns. This is the slow step: the progress output shows roughly 4 s per
# 1,000-row batch on the titles.
# NOTE(review): in the preview several author lists get a non-zero
# `authors_ner_MONEY_count` while `authors_ner_PERSON_count` is mostly 0,
# which hints that the tagger is fed subword tokens - verify.
NER_BATCH_SIZE = 1_000
df = engineer.named_entity_recognition(df, batch_size=NER_BATCH_SIZE)
df.head()

Processing NER:   0%|          | 0/4 [00:00<?, ?it/s]
Processing title:   0%|          | 0/41 [00:00<?, ?it/s][A
Processing title:   2%|▏         | 1/41 [00:03<02:37,  3.94s/it][A
Processing title:   5%|▍         | 2/41 [00:07<02:31,  3.88s/it][A
Processing title:   7%|▋         | 3/41 [00:11<02:26,  3.87s/it][A
Processing title:  10%|▉         | 4/41 [00:15<02:25,  3.93s/it][A
Processing title:  12%|█▏        | 5/41 [00:19<02:20,  3.89s/it][A
Processing title:  15%|█▍        | 6/41 [00:23<02:22,  4.07s/it][A
Processing title:  17%|█▋        | 7/41 [00:28<02:21,  4.17s/it][A
Processing title:  20%|█▉        | 8/41 [00:32<02:19,  4.24s/it][A
Processing title:  22%|██▏       | 9/41 [00:36<02:14,  4.19s/it][A
Processing title:  24%|██▍       | 10/41 [00:40<02:08,  4.14s/it][A
Processing title:  27%|██▋       | 11/41 [00:44<02:02,  4.09s/it][A
Processing title:  29%|██▉       | 12/41 [00:48<01:59,  4.13s/it][A
Processing title:  32%|███▏      | 13/41 [00:52<01:52,  4.01s/it][

Unnamed: 0,title,summary,comment,authors,category,split,title_emb_0,title_emb_1,title_emb_2,title_emb_3,...,authors_ner_MONEY_count,authors_ner_NORP_count,authors_ner_ORDINAL_count,authors_ner_ORG_count,authors_ner_PERCENT_count,authors_ner_PERSON_count,authors_ner_PRODUCT_count,authors_ner_QUANTITY_count,authors_ner_TIME_count,authors_ner_WORK_OF_ART_count
0,"['gamma', '##ray', 'bursts', 'death', 'th', '#...","['proposed', 'gamma', '##ray', 'bursts', 'crea...",['pages'],"['ram', '##esh', 'narayan', 'bo', '##hd', '##a...",physics,train,-0.340067,0.275288,0.076594,-0.272523,...,1,0,0,0,0,0,0,0,0,0
1,"['gravitational', 'lens', '##ing', 'variabilit...","['four', 'ob', '##ser', '##vable', '##s', 'ass...","['pages', 'plus', 'figures', 'included']","['lawrence', 'k', '##raus', '##s', 'martin', '...",physics,test,-0.115177,0.08105,0.103687,-0.238543,...,0,0,0,0,0,0,0,0,0,0
2,"['pt', '##ole', '##ma', '##ic', 'gamma', '##ra...","['bats', '##e', 'experiment', 'gr', '##o', 'de...","['pages', 'replaced', 'provide', 'omitted', 'l...","['j', 'katz']",physics,train,-0.392568,0.203008,0.196116,-0.296451,...,0,0,0,0,0,0,0,0,0,0
3,"['expanding', 'photos', '##pher', '##es', 'typ...","['use', 'expanding', 'photos', '##pher', '##e'...",['pages'],"['b', 'p', 'schmidt', 'r', 'p', 'ki', '##rs', ...",physics,val,-0.289894,0.23978,0.32229,-0.115017,...,1,0,0,0,0,0,0,0,0,0
4,"['radiation', 'transfer', 'gamma', '##ray', 'b...","['calculated', 'gamma', '##ray', 'ra', '##dia'...",['pages'],"['b', 'j', 'carr', '##igan', 'j', 'katz']",physics,train,-0.020201,0.34504,-0.169287,-0.099256,...,1,0,0,0,0,1,0,0,0,0


## 03.06 - Sentiment Analysis

In [7]:
# Add one `<column>_sentiment` score per text column. In the preview only the
# title and summary columns carry non-zero scores; comment and authors are 0.0.
SENTIMENT_BATCH_SIZE = 1_000
df = engineer.sentiment_analysis(df, batch_size=SENTIMENT_BATCH_SIZE)
df.head()

Analyzing sentiment:   0%|          | 0/4 [00:00<?, ?it/s]
Processing title:   0%|          | 0/41 [00:00<?, ?it/s][A
Processing title:   2%|▏         | 1/41 [00:00<00:08,  4.74it/s][A
Processing title:   5%|▍         | 2/41 [00:00<00:07,  5.11it/s][A
Processing title:   7%|▋         | 3/41 [00:00<00:07,  5.23it/s][A
Processing title:  10%|▉         | 4/41 [00:00<00:07,  5.24it/s][A
Processing title:  12%|█▏        | 5/41 [00:00<00:06,  5.29it/s][A
Processing title:  15%|█▍        | 6/41 [00:01<00:06,  5.15it/s][A
Processing title:  17%|█▋        | 7/41 [00:01<00:06,  5.07it/s][A
Processing title:  20%|█▉        | 8/41 [00:01<00:06,  5.02it/s][A
Processing title:  22%|██▏       | 9/41 [00:01<00:06,  5.08it/s][A
Processing title:  24%|██▍       | 10/41 [00:01<00:06,  5.09it/s][A
Processing title:  27%|██▋       | 11/41 [00:02<00:05,  5.13it/s][A
Processing title:  29%|██▉       | 12/41 [00:02<00:05,  5.09it/s][A
Processing title:  32%|███▏      | 13/41 [00:02<00:05,  5.18it

Unnamed: 0,title,summary,comment,authors,category,split,title_emb_0,title_emb_1,title_emb_2,title_emb_3,...,authors_ner_PERCENT_count,authors_ner_PERSON_count,authors_ner_PRODUCT_count,authors_ner_QUANTITY_count,authors_ner_TIME_count,authors_ner_WORK_OF_ART_count,title_sentiment,summary_sentiment,comment_sentiment,authors_sentiment
0,"['gamma', '##ray', 'bursts', 'death', 'th', '#...","['proposed', 'gamma', '##ray', 'bursts', 'crea...",['pages'],"['ram', '##esh', 'narayan', 'bo', '##hd', '##a...",physics,train,-0.340067,0.275288,0.076594,-0.272523,...,0,0,0,0,0,0,0.0,-0.024459,0.0,0.0
1,"['gravitational', 'lens', '##ing', 'variabilit...","['four', 'ob', '##ser', '##vable', '##s', 'ass...","['pages', 'plus', 'figures', 'included']","['lawrence', 'k', '##raus', '##s', 'martin', '...",physics,test,-0.115177,0.08105,0.103687,-0.238543,...,0,0,0,0,0,0,0.0,0.020833,0.0,0.0
2,"['pt', '##ole', '##ma', '##ic', 'gamma', '##ra...","['bats', '##e', 'experiment', 'gr', '##o', 'de...","['pages', 'replaced', 'provide', 'omitted', 'l...","['j', 'katz']",physics,train,-0.392568,0.203008,0.196116,-0.296451,...,0,0,0,0,0,0,0.0,0.026833,0.0,0.0
3,"['expanding', 'photos', '##pher', '##es', 'typ...","['use', 'expanding', 'photos', '##pher', '##e'...",['pages'],"['b', 'p', 'schmidt', 'r', 'p', 'ki', '##rs', ...",physics,val,-0.289894,0.23978,0.32229,-0.115017,...,0,0,0,0,0,0,0.166667,0.093056,0.0,0.0
4,"['radiation', 'transfer', 'gamma', '##ray', 'b...","['calculated', 'gamma', '##ray', 'ra', '##dia'...",['pages'],"['b', 'j', 'carr', '##igan', 'j', 'katz']",physics,train,-0.020201,0.34504,-0.169287,-0.099256,...,0,1,0,0,0,0,0.0,0.109,0.0,0.0


## 03.07 - Text Complexity

In [8]:
# Add one `<column>_ari` text-complexity feature per text column.
# NOTE(review): the preview shows `summary_ari` as the integer 14 on every row
# and `title_ari` / `comment_ari` pinned at 14.0, which looks like a cap or
# fallback value rather than a real score - verify before relying on these.
TEXT_COMPLEXITY_BATCH_SIZE = 1_000
df = engineer.text_complexity(df, batch_size=TEXT_COMPLEXITY_BATCH_SIZE)
df.head()

Calculating text complexity:   0%|          | 0/4 [00:00<?, ?it/s]
Processing title:   0%|          | 0/41 [00:00<?, ?it/s][A
Processing title:  15%|█▍        | 6/41 [00:00<00:00, 59.83it/s][A
Processing title:  29%|██▉       | 12/41 [00:00<00:00, 56.32it/s][A
Processing title:  46%|████▋     | 19/41 [00:00<00:00, 58.42it/s][A
Processing title:  61%|██████    | 25/41 [00:00<00:00, 53.99it/s][A
Processing title:  76%|███████▌  | 31/41 [00:00<00:00, 53.27it/s][A
Processing title:  90%|█████████ | 37/41 [00:00<00:00, 51.86it/s][A
Calculating text complexity:  25%|██▌       | 1/4 [00:00<00:02,  1.29it/s]
Processing summary:   0%|          | 0/41 [00:00<?, ?it/s][A
Processing summary:   2%|▏         | 1/41 [00:00<00:07,  5.53it/s][A
Processing summary:   5%|▍         | 2/41 [00:00<00:07,  5.55it/s][A
Processing summary:   7%|▋         | 3/41 [00:00<00:06,  5.50it/s][A
Processing summary:  10%|▉         | 4/41 [00:00<00:06,  5.44it/s][A
Processing summary:  12%|█▏        | 5/41 [

Unnamed: 0,title,summary,comment,authors,category,split,title_emb_0,title_emb_1,title_emb_2,title_emb_3,...,authors_ner_TIME_count,authors_ner_WORK_OF_ART_count,title_sentiment,summary_sentiment,comment_sentiment,authors_sentiment,title_ari,summary_ari,comment_ari,authors_ari
0,"['gamma', '##ray', 'bursts', 'death', 'th', '#...","['proposed', 'gamma', '##ray', 'bursts', 'crea...",['pages'],"['ram', '##esh', 'narayan', 'bo', '##hd', '##a...",physics,train,-0.340067,0.275288,0.076594,-0.272523,...,0,0,0.0,-0.024459,0.0,0.0,14.0,14,14.0,14.0
1,"['gravitational', 'lens', '##ing', 'variabilit...","['four', 'ob', '##ser', '##vable', '##s', 'ass...","['pages', 'plus', 'figures', 'included']","['lawrence', 'k', '##raus', '##s', 'martin', '...",physics,test,-0.115177,0.08105,0.103687,-0.238543,...,0,0,0.0,0.020833,0.0,0.0,14.0,14,14.0,14.0
2,"['pt', '##ole', '##ma', '##ic', 'gamma', '##ra...","['bats', '##e', 'experiment', 'gr', '##o', 'de...","['pages', 'replaced', 'provide', 'omitted', 'l...","['j', 'katz']",physics,train,-0.392568,0.203008,0.196116,-0.296451,...,0,0,0.0,0.026833,0.0,0.0,14.0,14,14.0,7.83
3,"['expanding', 'photos', '##pher', '##es', 'typ...","['use', 'expanding', 'photos', '##pher', '##e'...",['pages'],"['b', 'p', 'schmidt', 'r', 'p', 'ki', '##rs', ...",physics,val,-0.289894,0.23978,0.32229,-0.115017,...,0,0,0.166667,0.093056,0.0,0.0,14.0,14,14.0,12.33
4,"['radiation', 'transfer', 'gamma', '##ray', 'b...","['calculated', 'gamma', '##ray', 'ra', '##dia'...",['pages'],"['b', 'j', 'carr', '##igan', 'j', 'katz']",physics,train,-0.020201,0.34504,-0.169287,-0.099256,...,0,0,0.0,0.109,0.0,0.0,14.0,14,14.0,9.83


## 03.08 - Prepare the Data for Modelling

In [9]:
# Reduce the frame to model inputs. In the preview the raw text columns are
# gone and `category` / `split` have moved to the end, after the features.
df = engineer.prepare(df)
df.head(n=5)

Unnamed: 0,title_emb_0,title_emb_1,title_emb_2,title_emb_3,title_emb_4,title_emb_5,title_emb_6,title_emb_7,title_emb_8,title_emb_9,...,title_sentiment,summary_sentiment,comment_sentiment,authors_sentiment,title_ari,summary_ari,comment_ari,authors_ari,category,split
0,-0.340067,0.275288,0.076594,-0.272523,-0.307306,0.349464,0.359581,0.88501,-0.355955,-0.056723,...,0.0,-0.024459,0.0,0.0,14.0,14,14.0,14.0,physics,train
1,-0.115177,0.08105,0.103687,-0.238543,-0.431923,0.210272,0.157446,0.632112,-0.503011,-0.071692,...,0.0,0.020833,0.0,0.0,14.0,14,14.0,14.0,physics,test
2,-0.392568,0.203008,0.196116,-0.296451,-0.194015,0.356909,0.407645,0.76641,-0.616156,0.025784,...,0.0,0.026833,0.0,0.0,14.0,14,14.0,7.83,physics,train
3,-0.289894,0.23978,0.32229,-0.115017,0.041579,0.528063,0.223974,0.722394,-0.309294,0.099875,...,0.166667,0.093056,0.0,0.0,14.0,14,14.0,12.33,physics,val
4,-0.020201,0.34504,-0.169287,-0.099256,-0.514406,0.056388,0.380709,0.66466,-0.537006,-0.179937,...,0.0,0.109,0.0,0.0,14.0,14,14.0,9.83,physics,train


## 03.09 - Normalize the Data

In [10]:
# Scale the engineered features and keep the returned scaler so it can be
# saved at the end of the notebook. In the preview the embedding columns are
# unchanged while the sentiment / ARI columns land in [0, 1].
# NOTE(review): `summary_ari` becomes 0.0 on every previewed row, as a
# constant column would. Also confirm whether the scaler is fitted on the
# train split only, since `df` still contains the val/test rows here.
df, scaler = engineer.normalize_dataframe(df)
df.head(n=5)

Unnamed: 0,title_emb_0,title_emb_1,title_emb_2,title_emb_3,title_emb_4,title_emb_5,title_emb_6,title_emb_7,title_emb_8,title_emb_9,...,title_sentiment,summary_sentiment,comment_sentiment,authors_sentiment,title_ari,summary_ari,comment_ari,authors_ari,category,split
0,-0.340067,0.275288,0.076594,-0.272523,-0.307306,0.349464,0.359581,0.88501,-0.355955,-0.056723,...,0.5,0.487771,0.444444,0.5,1.0,0.0,1.0,1.0,physics,train
1,-0.115177,0.08105,0.103687,-0.238543,-0.431923,0.210272,0.157446,0.632112,-0.503011,-0.071692,...,0.5,0.510417,0.444444,0.5,1.0,0.0,1.0,1.0,physics,test
2,-0.392568,0.203008,0.196116,-0.296451,-0.194015,0.356909,0.407645,0.76641,-0.616156,0.025784,...,0.5,0.513417,0.444444,0.5,1.0,0.0,1.0,0.483682,physics,train
3,-0.289894,0.23978,0.32229,-0.115017,0.041579,0.528063,0.223974,0.722394,-0.309294,0.099875,...,0.583333,0.546528,0.444444,0.5,1.0,0.0,1.0,0.860251,physics,val
4,-0.020201,0.34504,-0.169287,-0.099256,-0.514406,0.056388,0.380709,0.66466,-0.537006,-0.179937,...,0.5,0.5545,0.444444,0.5,1.0,0.0,1.0,0.651046,physics,train


## 03.10 - Save the Data

In [11]:
# Write the engineered, normalized frame under the step-03 name.
engineered_path = f'../data/03_{filename}_engineered.bin'
save_data(df, engineered_path)

Data saved successfully to ../data/03_arxiv_balanced_engineered.bin


In [12]:
# Persist the scaler returned by normalize_dataframe next to the engineered
# data. The context manager flushes and closes the file handle as soon as the
# dump finishes, instead of leaving that to garbage collection.
with open(f'../data/03_{filename}_scaler.bin', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)