# 0. Import

In [1]:
import yaml
import numpy as np
import pandas as pd
from tqdm import tqdm
from urllib import parse
from datetime import datetime
from typing import Dict, List, Optional
from collections import Counter

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader

# 1. Collecting the data

In [2]:
from tea_client.http import HttpClient
from tea_client.handler import handler

from paperswithcode import PapersWithCodeClient
from paperswithcode.config import config

## 1.1 Use API to collect

In [3]:
# Anonymous access: no API token and no explicit server override.
token = None
url = None

# Without an override, use the server URL from the paperswithcode config.
url = config.server_url if not url else url

# Low-level HTTP client pointed at the versioned API root; used further down
# for the raw, paginated `/tasks/<id>/papers/` requests.
http = HttpClient(
    url=f"{url}/api/v{config.api_version}",
    token="" if not token else token,
    authorization_method=HttpClient.Authorization.token,
)

# High-level client used for listing areas and the tasks inside each area.
client = PapersWithCodeClient()

The cells below fetch the data from the API. This takes about an hour, so don't run them unless you have lost the saved files.

### Retrieving title-only code

In [4]:
def get_id(x):
    """Return the ``id`` attribute of an API result object."""
    return x.id


# Identifiers of every research area reported by the client.
areas_id = [get_id(area_obj) for area_obj in client.area_list().results]

In [None]:
# Crawl paper titles per area: area id -> list of titles of every paper found
# under any task of that area (a title appears once per task it is listed in).
area_paper_dict = {area: [] for area in areas_id}
for area in areas_id:

    print(f"--- Working on {area.capitalize()} ---")
    try:
        task_id_lists = list(map(get_id, client.area_task_list(area).results))

    # `Exception` (not a bare except) so Ctrl-C can still abort this long crawl.
    except Exception:
        print(f"**Error occurred with area {area}.**")
        continue

    for task_id in task_id_lists:

        page_idx, papers = 1, []
        print(f"\tWorking on {task_id.capitalize()}")

        while True:

            try:
                results = http.get(f"/tasks/{task_id}/papers/?page={page_idx}")['results']
                titles = [p['title'] for p in results]

            except Exception:
                # A failed request is treated as "no more pages" for this task.
                # NOTE(review): this also hides genuine network/API errors —
                # narrow to the client's HTTP error type once confirmed.
                break

            if not titles:
                # An empty page also means the end; prevents an endless loop.
                break

            papers.extend(titles)
            page_idx += 1

        print(f"\tSuccessfully loaded task {task_id}, with papers {len(papers)}\n")

        # Add the task's titles exactly once, after pagination has finished.
        # (Doing this inside the page loop re-added the cumulative list on
        # every page and duplicated earlier titles.)
        area_paper_dict[area].extend(papers)

--- Working on Adversarial ---


### Retrieving all meta-information

In [5]:
# Crawl the full metadata record of every paper: paper title -> record dict as
# returned by the API.
# NOTE(review): keying by title means two papers with the same title overwrite
# each other — confirm this is acceptable.
paper_meta_dict = {}
for area in areas_id:

    print(f"--- Working on {area.capitalize()} ---")
    try:
        task_id_lists = list(map(get_id, client.area_task_list(area).results))

    # `Exception` (not a bare except) so Ctrl-C can still abort this long crawl.
    except Exception:
        print(f"**Error occurred with area {area}.**")
        continue

    for task_id in task_id_lists:

        page_idx = 1
        # Dict size before this task; used to report how many NEW titles it added.
        ref = len(paper_meta_dict)
        print(f"\tWorking on {task_id.capitalize()}")

        while True:

            try:
                results = http.get(f"/tasks/{task_id}/papers/?page={page_idx}")['results']
                for p in results:
                    paper_meta_dict[p['title']] = p

            except Exception:
                # A failed request is treated as "no more pages" for this task.
                # NOTE(review): this also hides genuine network/API errors —
                # narrow to the client's HTTP error type once confirmed.
                break

            if not results:
                # An empty page also means the end; prevents an endless loop.
                break

            page_idx += 1

        print(f"\tSuccessfully loaded task {task_id}, with papers {len(paper_meta_dict) - ref}\n")

--- Working on Adversarial ---
	Working on Website-fingerprinting-defense
	Successfully loaded task website-fingerprinting-defense, with papers 1

	Working on Adversarial-attack
	Successfully loaded task adversarial-attack, with papers 593

	Working on Adversarial-defense
	Successfully loaded task adversarial-defense, with papers 116

	Working on Real-world-adversarial-attack
	Successfully loaded task real-world-adversarial-attack, with papers 0

	Working on Provable-adversarial-defense
	Successfully loaded task provable-adversarial-defense, with papers 0

	Working on Adversarial-text
	Successfully loaded task adversarial-text, with papers 28

	Working on Data-poisoning
	Successfully loaded task data-poisoning, with papers 122

	Working on Website-fingerprinting-attacks
	Successfully loaded task website-fingerprinting-attacks, with papers 6

	Working on Inference-attack
	Successfully loaded task inference-attack, with papers 58

--- Working on Audio ---
	Working on Shooter-localization

	Successfully loaded task fine-grained-action-recognition, with papers 14

	Working on Camera-auto-calibration
	Successfully loaded task camera-auto-calibration, with papers 4

	Working on Saliency-detection
	Successfully loaded task saliency-detection, with papers 234

	Working on Video-story-qa
	Successfully loaded task video-story-qa, with papers 3

	Working on 3d-object-super-resolution
	Successfully loaded task 3d-object-super-resolution, with papers 2

	Working on Multi-object-discovery
	Successfully loaded task multi-object-discovery, with papers 1

	Working on Road-damage-detection
	Successfully loaded task road-damage-detection, with papers 12

	Working on Image-classification
	Successfully loaded task image-classification, with papers 3581

	Working on Visual-tracking
	Successfully loaded task visual-tracking, with papers 323

	Working on 3d-object-reconstruction
	Successfully loaded task 3d-object-reconstruction, with papers 61

	Working on Occluded-face-detection
	Successfu

	Successfully loaded task dynamic-link-prediction, with papers 0

	Working on Graph-regression
	Successfully loaded task graph-regression, with papers 9

	Working on Local-community-detection
	Successfully loaded task local-community-detection, with papers 0

	Working on Link-sign-prediction
	Successfully loaded task link-sign-prediction, with papers 1

	Working on Graph-question-answering
	Successfully loaded task graph-question-answering, with papers 3

	Working on Graph-mining
	Successfully loaded task graph-mining, with papers 38

	Working on Hypergraph-embedding
	Successfully loaded task hypergraph-embedding, with papers 1

	Working on Knowledge-graph-embedding
	Successfully loaded task knowledge-graph-embedding, with papers 0

--- Working on Knowledge-base ---
	Working on Temporal-knowledge-graph-completion
	Successfully loaded task temporal-knowledge-graph-completion, with papers 5

	Working on Open-knowledge-graph-embedding
	Successfully loaded task open-knowledge-graph-embeddi

	Successfully loaded task diabetic-retinopathy-grading, with papers 13

	Working on Colon-cancer-detection-in-confocal-laser
	Successfully loaded task colon-cancer-detection-in-confocal-laser, with papers 1

	Working on Sleep-micro-event-detection
	Successfully loaded task sleep-micro-event-detection, with papers 1

	Working on Readmission-prediction
	Successfully loaded task readmission-prediction, with papers 10

--- Working on Methodology ---
	Working on Probabilistic-programming
	Successfully loaded task probabilistic-programming, with papers 161

	Working on Efficient-exploration
	Successfully loaded task efficient-exploration, with papers 179

	Working on Continual-learning
	Successfully loaded task continual-learning, with papers 452

	Working on Network-embedding
	Successfully loaded task network-embedding, with papers 125

	Working on Point-processes
	Successfully loaded task point-processes, with papers 280

	Working on Model-selection
	Successfully loaded task model-selectio

	Successfully loaded task sequential-correlation-estimation, with papers 2

	Working on Seismic-interpretation
	Successfully loaded task seismic-interpretation, with papers 12

	Working on Oceanic-eddy-classification
	Successfully loaded task oceanic-eddy-classification, with papers 1

	Working on Mobile-security
	Successfully loaded task mobile-security, with papers 4

	Working on Multi-modal-person-identification
	Successfully loaded task multi-modal-person-identification, with papers 2

	Working on Self-organized-clustering
	Successfully loaded task self-organized-clustering, with papers 1

	Working on Recipe-generation
	Successfully loaded task recipe-generation, with papers 10

	Working on Traffic-classification
	Successfully loaded task traffic-classification, with papers 20

	Working on Md17
	Successfully loaded task md17, with papers 0

	Working on Gravitational-wave-detection
	Successfully loaded task gravitational-wave-detection, with papers 9

	Working on Multi-modal
	Succes

	Successfully loaded task named-entity-recognition-ner, with papers 886

	Working on Text-generation
	Successfully loaded task text-generation, with papers 1153

	Working on Turning-point-identification
	Successfully loaded task turning-point-identification, with papers 2

	Working on Chinese-word-segmentation
	Successfully loaded task chinese-word-segmentation, with papers 143

	Working on Task-oriented-dialogue-systems
	Successfully loaded task task-oriented-dialogue-systems, with papers 93

	Working on Machine-reading-comprehension
	Successfully loaded task machine-reading-comprehension, with papers 2

	Working on Definition-extraction
	Successfully loaded task definition-extraction, with papers 7

	Working on Extracting-covid-19-events-from-twitter
	Successfully loaded task extracting-covid-19-events-from-twitter, with papers 2

	Working on Polyphone-disambiguation
	Successfully loaded task polyphone-disambiguation, with papers 5

	Working on Taxonomy-learning
	Successfully loaded 

	Successfully loaded task vision-based-navigation-with-language-based, with papers 1

	Working on Voice-assistant
	Successfully loaded task voice-assistant, with papers 17

--- Working on Speech ---
	Working on Acoustic-echo-cancellation
	Successfully loaded task acoustic-echo-cancellation, with papers 17

	Working on Speech-separation
	Successfully loaded task speech-separation, with papers 102

	Working on Acoustic-unit-discovery
	Successfully loaded task acoustic-unit-discovery, with papers 8

	Working on Speaker-recognition
	Successfully loaded task speaker-recognition, with papers 163

	Working on Acoustic-question-answering
	Successfully loaded task acoustic-question-answering, with papers 1

	Working on End-to-end-speech-recognition
	Successfully loaded task end-to-end-speech-recognition, with papers 168

	Working on Speaker-diarization
	Successfully loaded task speaker-diarization, with papers 78

	Working on Spoken-dialogue-systems
	Successfully loaded task spoken-dialogue-sys

	Successfully loaded task eeg-decoding, with papers 0

	Working on Covid-19-modelling
	Successfully loaded task covid-19-modelling, with papers 0

	Working on Predictive-process-monitoring
	Successfully loaded task predictive-process-monitoring, with papers 9

	Working on Stock-price-prediction
	Successfully loaded task stock-price-prediction, with papers 37

	Working on Trajectory-modeling
	Successfully loaded task trajectory-modeling, with papers 5

	Working on Edge-computing
	Successfully loaded task edge-computing, with papers 216

	Working on Clustering-multivariate-time-series
	Successfully loaded task clustering-multivariate-time-series, with papers 1

	Working on Sequential-skip-prediction
	Successfully loaded task sequential-skip-prediction, with papers 3

	Working on Covid-19-tracking
	Successfully loaded task covid-19-tracking, with papers 0

	Working on Unsupervised-spatial-clustering
	Successfully loaded task unsupervised-spatial-clustering, with papers 3

	Working on Port

Save the collected data

In [13]:
# SAVE DICT TO YAML
# Persist the crawled metadata dictionary to a YAML file on disk.
with open('paperswithcode_meta.yml', mode='w') as meta_file:
    yaml.dump(paper_meta_dict, stream=meta_file)

In [14]:
# Spot-check one stored record by looking it up under its title key.
paper_meta_dict['"Brilliant AI Doctor" in Rural China: Tensions and Challenges in AI-Powered CDSS Deployment']

{'id': 'brilliant-ai-doctor-in-rural-china-tensions',
 'arxiv_id': '2101.01524',
 'nips_id': None,
 'url_abs': 'https://arxiv.org/abs/2101.01524v2',
 'url_pdf': 'https://arxiv.org/pdf/2101.01524v2.pdf',
 'title': '"Brilliant AI Doctor" in Rural China: Tensions and Challenges in AI-Powered CDSS Deployment',
 'abstract': 'Artificial intelligence (AI) technology has been increasingly used in the implementation of advanced Clinical Decision Support Systems (CDSS). Research demonstrated the potential usefulness of AI-powered CDSS (AI-CDSS) in clinical decision making scenarios. However, post-adoption user perception and experience remain understudied, especially in developing countries. Through observations and interviews with 22 clinicians from 6 rural clinics in China, this paper reports the various tensions between the design of an AI-CDSS system ("Brilliant Doctor") and the rural clinical context, such as the misalignment with local context and workflow, the technical limitations and us

In [7]:
# # SAVE DICT TO YAML
# with open('paperswithcode_title.yml', 'w') as y:
#     yaml.dump(area_paper_dict, y)

## 1.2 Convert to .csv

In [9]:
# with open('paperswithcode_title.yml', 'r') as y:
#     paper_dict = yaml.load(y)

  paper_dict = yaml.load(y)


In [64]:
# NOTE(review): `paper_dict` is not defined by any active cell above (its YAML
# load is commented out) — presumably the area -> titles mapping; confirm.
_area = [*paper_dict]

# All-zero uint8 template with one slot per area; copied once per paper.
_zeros = np.zeros(len(_area), dtype='uint8')

# title -> per-area count vector, filled in by the next cell.
paper_onehot_dict = {}

In [65]:
# Walk every area (its position `idx` is the column in the count vector) and
# bump that column once for each occurrence of a title in the area's list.
for idx, (area, papers) in enumerate(tqdm(paper_dict.items())):

    for p in papers:

        counts = paper_onehot_dict.get(p)

        if counts is None:
            # First time this title is seen: start from a fresh zero vector.
            counts = _zeros.copy()
            paper_onehot_dict[p] = counts

        counts[idx] += 1

100%|████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  7.14it/s]


In [66]:
# Build the frame with areas as rows and titles as columns, then transpose so
# each row is a paper and each column an area; write the result to CSV.
paper_area_df = pd.DataFrame(paper_onehot_dict, index=list(paper_dict)).transpose()
paper_area_df.to_csv('./paperswithtopic.csv')

# 2. Check the data

Load Data

In [123]:
# Read the paper x area table (first CSV column is the title index), collapse
# every count >= 1 to a 0/1 indicator, and store the indicators as int8.
paper_df = pd.read_csv('paperswithtopic.csv', index_col=0)
paper_df = paper_df.mask(paper_df >= 1, 1).astype('int8')

Make X

In [124]:
# NOTE: despite its name, `paper2idx` maps row position -> paper title.
paper2idx = dict(enumerate(paper_df.index))

# Model inputs: the lower-cased titles, in the same row order as `paper_df`.
X_raw = [str.lower(title) for title in paper2idx.values()]

Make y (labels)

In [146]:
# Label matrix: the per-area indicator columns with the title index dropped in
# favour of a positional 0..n-1 index, so rows line up with `X_raw`.
labels = paper_df.reset_index(drop=True)

In [160]:
# Hold out 10% of the papers for testing. A fixed `random_state` makes the
# split (and every metric computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X_raw, labels, test_size=.1, random_state=42)

## 2.1 Basic EDA

### 2.1.1 Unique number of papers

In [126]:
# Dataset size: rows are unique paper titles, columns are areas.
print(f"# Papers: {paper_df.shape[0]}\n# Areas : {paper_df.shape[1]}")

# Papers: 49980
# Areas : 16


### 2.1.2 Label number

In [127]:
# Number of areas each paper is tagged with (row-wise sum of the 0/1 indicators).
paper_sum = paper_df.sum(axis=1)

In [128]:
# Distribution of labels per paper: how many papers carry 1, 2, 3, ... areas.
Counter(paper_sum.values)

Counter({1: 40694, 2: 8297, 3: 932, 4: 53, 5: 3, 7: 1})

# 3. Generalized format for every model

In [129]:
from transformers import AutoTokenizer

In [130]:
# Name of the pretrained checkpoint whose tokenizer (vocabulary) is used to
# turn titles into token ids.
MODEL_NAME = 'bert-large-uncased'
# Load the tokenizer that matches MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [285]:
# Tokenize the titles into fixed-width id sequences (no special tokens added).
# padding='max_length' pads every title to exactly `max_length` tokens, so the
# train and test matrices are guaranteed to have the same number of columns.
# With padding=True each call pads only to its own longest title, so the two
# widths could differ and break `predict` on the test matrix.
X_train_tokenized = tokenizer(X_train, padding='max_length', add_special_tokens=False,
                              max_length=60, truncation=True)
X_test_tokenized = tokenizer(X_test, padding='max_length', add_special_tokens=False,
                             max_length=60, truncation=True)

In [275]:
class PaperDataset(Dataset):
    """Torch dataset pairing tokenized paper titles with their label rows.

    Args:
        tokenized_dataset: mapping from field name (e.g. ``'input_ids'``) to a
            per-sample sequence, as produced by the tokenizer call.
        labels: pandas object holding one label row per sample, in the same
            order as the tokenized samples. Its index is reset to 0..n-1 here,
            so a shuffled index (e.g. after a train/test split) is fine.
    """

    def __init__(self, tokenized_dataset, labels):
        self.tokenized_dataset = tokenized_dataset
        self.labels = labels.reset_index(drop=True)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx``.

        Contains one entry per tokenizer field plus ``'labels'``.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
        # Select the row positionally and hand torch a plain numpy array.
        # Passing the pandas Series itself appears to make torch index it with
        # integers, which on a string-labelled index is deprecated positional
        # fallback in recent pandas — NOTE(review): confirm.
        item['labels'] = torch.tensor(np.asarray(self.labels.iloc[idx]))
        return item

    def __len__(self):
        """Number of samples, i.e. the number of label rows."""
        return len(self.labels)

In [277]:
# Wrap the tokenized titles and their label rows for the train and test splits.
train_dataset = PaperDataset(X_train_tokenized, y_train)
test_dataset = PaperDataset(X_test_tokenized, y_test)

In [278]:
# Batches of 16 samples, in dataset order (no shuffle argument is passed).
train_dataloader = DataLoader(train_dataset, batch_size=16)

In [279]:
# Pull the first batch to check that the dataset and loader work together.
batch = next(iter(train_dataloader))

In [280]:
# Sanity check: shape of the token-id tensor in the first batch
# (presumably (batch_size, padded sequence length) — confirm against the tokenizer settings).
batch['input_ids'].shape

torch.Size([16, 65])

https://www.analyticsvidhya.com/blog/2017/08/introduction-to-multi-label-classification/

https://towardsdatascience.com/multi-label-text-classification-with-scikit-learn-30714b7819c5

In [281]:
# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Initialize a binary-relevance multi-label classifier
# with a Gaussian naive Bayes base classifier.
# NOTE(review): binary relevance presumably fits one independent copy of the
# base classifier per label column — confirm against the skmultilearn docs.
classifier = BinaryRelevance(GaussianNB())

In [286]:
# Turn the tokenizer's `input_ids` lists into dense float matrices
# (one row per title, one column per token position) for scikit-learn.
X_train2token_np, X_test2token_np = (
    np.array(encoding['input_ids'], dtype=float)
    for encoding in (X_train_tokenized, X_test_tokenized)
)

In [287]:
# Fit on the raw token-id matrix against the full multi-label indicator matrix.
classifier.fit(X_train2token_np, y_train.values)

BinaryRelevance(classifier=GaussianNB(), require_dense=[True, True])

In [288]:
# Predict the label indicators for the held-out titles.
predictions = classifier.predict(X_test2token_np)

In [289]:
# Both matrices must have the same number of columns for fit/predict to agree.
X_train2token_np.shape, X_test2token_np.shape

((44982, 60), (4998, 60))

In [290]:
from sklearn.metrics import accuracy_score
# Test-set accuracy. For multi-label indicator input, scikit-learn's
# accuracy_score is subset accuracy: a sample counts as correct only when
# every one of its labels matches.
accuracy_score(y_test.values, predictions)

0.00020008003201280514

In [291]:
# Same (subset) accuracy on the training data, to compare with the test score.
accuracy_score(y_train.values, classifier.predict(X_train2token_np))

6.669334400426837e-05

In [272]:
# Convert the predictions to a dense array to inspect them.
predictions.toarray()

array([[1, 1, 0, ..., 0, 1, 1],
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0]], dtype=int8)

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [292]:
# Single-label check: fit one Gaussian naive Bayes model on the same token-id
# features, using only the 'adversarial' label column as the target.
clf = GaussianNB()
clf.fit(X_train2token_np, y_train.loc[:, 'adversarial'].values)

GaussianNB()

In [293]:
# Predictions of the single-label model on its own training data.
train_pred = clf.predict(X_train2token_np)

In [294]:
# Training accuracy of the single-label model for the 'adversarial' column.
accuracy_score(y_train.loc[:, 'adversarial'].values, train_pred)

0.02914499132986528

In [296]:
# Sum of the predictions; with 0/1 labels this is the number of training
# samples predicted as 'adversarial'.
train_pred.sum()

44496

In [297]:
import re