# Appendix
## ADS 509: Final Project Code
## Team 4
### Zachariah Freitas and Brianne Bell
 

In [10]:
# Requires the `arxiv` package; install it once per environment with: %pip install arxiv


In [34]:
# Loading necessary libraries
import pandas as pd
import numpy as np
import os
import re
import random
import time
import datetime

import nltk
from collections import Counter, defaultdict
from nltk.corpus import stopwords
from string import punctuation
from tqdm import tqdm
import sqlite3

# from https://github.com/soumik12345/multi-label-text-classification/blob/master/arxiv_scrape.ipynb
import arxiv


In [37]:
# Search phrases used to query arXiv, one request per phrase.
# Every phrase is wrapped in literal double quotes before it is sent
# (presumably so the API treats it as an exact phrase -- confirm against the arXiv query syntax).
# NOTE(review): the recorded run returned 0 results for "semi-supervised learning";
# the hyphenated phrase may need a different spelling -- verify.
query_keywords = [
    '"{}"'.format(phrase)
    for phrase in (
        "representation learning",
        "image generation",
        "object detection",
        "transformers",
        "image segmentation",
        "natural language",
        "graph",
        "colorization",
        "depth estimation",
        "point cloud",
        "structured data",
        "reinforcement learning",
        "attention",
        "tabular",
        "unsupervised learning",
        "semi-supervised learning",
        "explainable",
        "time series",
        "molecule",
        "physics",
        "graphics",
    )
]

In [35]:
# Source: https://github.com/soumik12345/multi-label-text-classification/blob/master/arxiv_scrape.ipynb
# Module-level arXiv API client shared by every call to query_with_keywords();
# constructed with num_retries=20 and page_size=500.
client = arxiv.Client(num_retries=20, page_size=500)

def query_with_keywords(query, max_results=20000,
                        categories=("cs.CV", "stat.ML", "cs.LG")):
    """Run one arXiv search and collect the papers whose primary category is wanted.

    Parameters
    ----------
    query : str
        Search string handed to ``arxiv.Search``; also used as the progress-bar label.
    max_results : int, optional
        Upper bound on the number of results requested. Defaults to 20000,
        the value that used to be hard-coded.
    categories : str or iterable of str, optional
        Primary categories to keep. Defaults to ``("cs.CV", "stat.ML", "cs.LG")``,
        the filter that used to be hard-coded. A single string is treated as
        one category rather than as a sequence of characters.

    Returns
    -------
    tuple of (list, list, list)
        ``(terms, titles, abstracts)``: three parallel lists holding, for each
        kept result, its ``categories``, ``title`` and ``summary`` attributes.
    """
    # A bare string would otherwise be split into characters by frozenset().
    if isinstance(categories, str):
        categories = (categories,)
    wanted = frozenset(categories)

    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.LastUpdatedDate
    )

    terms, titles, abstracts = [], [], []
    # Uses the module-level `client`; results are consumed as they are yielded.
    for res in tqdm(client.results(search), desc=query):
        if res.primary_category in wanted:
            terms.append(res.categories)
            titles.append(res.title)
            abstracts.append(res.summary)
    return terms, titles, abstracts

In [23]:
# Ensure the output folder exists. exist_ok=True makes this safe to re-run and
# avoids the check-then-create race of a separate isdir()/mkdir() pair.
os.makedirs("arxiv_data", exist_ok=True)


In [38]:
# Accumulators for the combined results of every keyword query
all_titles, all_summaries, all_terms = [], [], []

# Wall-clock timestamp taken just before the download starts
start_time = datetime.datetime.now()

# Run one arXiv search per keyword and append its results to the accumulators
for query in query_keywords:
    terms, titles, abstracts = query_with_keywords(query)
    all_titles += titles
    all_summaries += abstracts
    all_terms += terms

# Report the elapsed time of the whole download
end_time = datetime.datetime.now()
print(end_time - start_time)

"representation learning": 6118it [01:56, 52.62it/s]
"image generation": 1978it [00:43, 45.54it/s]
"object detection": 6536it [02:37, 41.59it/s]
"transformers": 20000it [06:37, 50.38it/s]
"image segmentation": 2890it [00:45, 63.13it/s]
"natural language": 13021it [03:50, 56.38it/s]
"graph": 20000it [06:26, 51.79it/s]
"colorization": 20000it [06:09, 54.08it/s]
"depth estimation": 1218it [00:24, 49.11it/s]
"point cloud": 4308it [01:15, 56.82it/s]
"structured data": 1915it [00:31, 60.03it/s]
"reinforcement learning": 16211it [05:30, 49.04it/s]
"attention": 20000it [06:38, 50.21it/s]
"tabular": 1382it [00:23, 59.73it/s]
"unsupervised learning": 2763it [00:51, 53.37it/s]
"semi-supervised learning": 0it [00:03, ?it/s]
"explainable": 20000it [07:22, 45.20it/s]
"time series": 15302it [05:16, 48.32it/s]
"molecule": 20000it [06:08, 54.31it/s]
"physics": 20000it [08:20, 39.93it/s]
"graphics": 15861it [04:16, 61.95it/s]

1:16:10.736303





In [39]:
# Combine the three parallel result lists into one table with the columns
# titles / abstracts / terms.
raw_data = pd.DataFrame(
    dict(titles=all_titles, abstracts=all_summaries, terms=all_terms)
)

# Preview the first rows
raw_data.head()

Unnamed: 0,titles,abstracts,terms
0,Reinforcement Learning from Multiple Sensors v...,"In many scenarios, observations from more than...",[cs.LG]
1,Interventional Causal Representation Learning,Causal representation learning seeks to extrac...,"[stat.ML, cs.LG]"
2,Self-Supervised Node Representation Learning v...,Self-supervised node representation learning a...,[cs.LG]
3,Out-of-Distribution Representation Learning fo...,Time series classification is an important pro...,"[cs.LG, cs.AI]"
4,Trading Information between Latents in Hierarc...,Variational Autoencoders (VAEs) were originall...,"[stat.ML, cs.CV, cs.IT, cs.LG, math.IT]"


In [40]:
# Dimensions of the assembled frame as a (rows, columns) tuple
raw_data.shape

(64573, 3)

In [42]:
# Persist the scrape to CSV so the slow download does not have to be repeated.
# NOTE: the path has no directory component, so the file is written to the current
# working directory rather than into the "arxiv_data" folder.
raw_data.to_csv(path_or_buf='arxiv_data.csv', index=False)