In [3]:
%matplotlib notebook

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import glasbey

import time
import pickle
import memory_profiler

%load_ext memory_profiler

from pathlib import Path
import distro

%load_ext watermark

In [4]:
# Load the jupyter_black extension (formats notebook cells with black),
# configured with a line length of 79 characters.
import jupyter_black

jupyter_black.load(line_length=79)

In [5]:
# Project locations, written relative to the notebook's directory.
# They are joined onto `nb_path` and resolved to absolute paths further below.
variables_path = Path("../results/variables/iclr25v1")  # saved arrays/pickles
figures_path = Path("../results/figures")
data_path = Path("../data")

In [7]:
# Show the kernel's current working directory. Written as plain Python
# instead of the bare `pwd` automagic so the cell does not depend on IPython
# automagic being enabled or on `pwd` not being shadowed by a variable.
str(Path.cwd())

'/gpfs01/berens/user/rgonzalesmarquez'

In [8]:
# MANUAL FIX TO PATH ISSUE FROM VSCODE
# The kernel's working directory is not the notebook's directory, so the
# notebook-relative paths defined earlier are re-anchored on `nb_path`
# (the notebook directory, given relative to the working directory).
import text_embeddings_src  # project package; not referenced in the cells shown here

nb_path = Path("phd/iclr-dataset/scripts")
assert nb_path.exists(), "The path does not exist"

# `resolve(strict=True)` turns each path into an absolute one and raises
# FileNotFoundError if the target does not exist.
variables_path = (nb_path / variables_path).resolve(strict=True)
figures_path = (nb_path / figures_path).resolve(strict=True)
data_path = (nb_path / data_path).resolve(strict=True)

In [13]:
# Sanity check: show the resolved, absolute directory used for saved variables.
print(variables_path)

/gpfs01/berens/user/rgonzalesmarquez/phd/iclr-dataset/results/variables/iclr25v1


In [14]:
# Apply the matplotlib style sheet located in `nb_path`; resolving strictly
# makes a missing style file fail here rather than later.
style_sheet = nb_path / "matplotlib_style.txt"
plt.style.use(style_sheet.resolve(strict=True))

In [15]:
%watermark -a 'Rita González-Márquez' -t -d -tz -u -v -iv -w -m -h -p transformers,openTSNE

Author: Rita González-Márquez

Last updated: 2024-10-18 16:27:04CEST

Python implementation: CPython
Python version       : 3.11.5
IPython version      : 8.18.1

transformers: 4.43.4
openTSNE    : 1.0.0

Compiler    : GCC 11.2.0
OS          : Linux
Release     : 3.10.0-1160.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 64
Architecture: 64bit

Hostname: rgonzalesmarquez_GPU0-llm_gber7

pandas             : 2.1.3
memory_profiler    : 0.61.0
distro             : 1.8.0
text_embeddings_src: 0.0.0
glasbey            : 0.2.0
seaborn            : 0.13.0
jupyter_black      : 0.3.4
matplotlib         : 3.8.2
numpy              : 1.26.2

Watermark: 2.4.3



ICLR new data

# Import

In [16]:
%%time
# Load the ICLR papers dataframe from the data directory.
iclr = pd.read_parquet(
    data_path / "iclr25v1.parquet",
    engine="pyarrow",
)

CPU times: user 261 ms, sys: 109 ms, total: 371 ms
Wall time: 363 ms


In [17]:
# Turn every cell of the list-valued columns into a plain Python list
# (later cells compare keyword cells against `[]`).
iclr["keywords"] = iclr["keywords"].transform(lambda cell: list(cell))
iclr["scores"] = iclr["scores"].transform(lambda cell: list(cell))

In [18]:
# Preview the loaded dataframe. NOTE(review): the output already contains a
# `labels` column, so the file read above presumably stems from an earlier
# run of this notebook.
iclr.head()

Unnamed: 0,year,id,title,abstract,authors,decision,scores,keywords,labels
0,2017,B1-Hhnslg,Prototypical Networks for Few-shot Learning,A recent approach to few-shot classification c...,"Jake Snell, Kevin Swersky, Richard Zemel",Reject,"[6, 4, 5]","[deep learning, transfer learning]",transfer learning
1,2017,B1-q5Pqxl,Machine Comprehension Using Match-LSTM and Ans...,Machine comprehension of text is an important ...,"Shuohang Wang, Jing Jiang",Accept (Poster),"[6, 6, 7]","[natural language processing, deep learning]",language models
2,2017,B16Jem9xe,Learning in Implicit Generative Models,Generative adversarial networks (GANs) provide...,"Shakir Mohamed, Balaji Lakshminarayanan",Invite to Workshop Track,"[8, 7, 6]",[unsupervised learning],unlabeled
3,2017,B16dGcqlx,Third Person Imitation Learning,Reinforcement learning (RL) makes it possible ...,"Bradly C Stadie, Pieter Abbeel, Ilya Sutskever",Accept (Poster),"[6, 5, 6]",[],unlabeled
4,2017,B184E5qee,Improving Neural Language Models with a Contin...,We propose an extension to neural network lang...,"Edouard Grave, Armand Joulin, Nicolas Usunier",Accept (Poster),"[7, 9, 5]",[natural language processing],language models


# Assign keywords

## Lists of keywords and corresponding labels

In [19]:
# Keyword groups used for labelling. Each inner list is one group of keywords
# that receive the same label (see `dict_keyword_to_label`); each entry is a
# `(keyword, frequency)` tuple. The frequency is used to pick, per paper, the
# least frequent of its known keywords
# (presumably it is the number of papers using the keyword -- TODO confirm).
final_keywords_groups = [
    ###### ADVERSARIAL
    [
        ("adversarial", 60),
        ("adversarial attack", 121),
        ("adversarial attacks", 106),
        ("adversarial defense", 50),
        ("adversarial examples", 196),
        ("adversarial learning", 93),
        ("adversarial machine learning", 54),
        ("adversarial robustness", 241),
        ("adversarial training", 217),
    ],
    ###### TRANSFORMERS
    [
        ("attention", 183),
        ("attention mechanism", 53),
        ("transformer", 340),
        ("transformers", 261),
        ("self-attention", 73),
    ],
    ###### AUTOENCODERS
    [
        ("autoencoder", 63),
        ("autoencoders", 52),
        ("vae", 71),
        ("variational autoencoder", 93),
        ("variational autoencoders", 83),
    ],
    [("anomaly detection", 109)],
    [("causal discovery", 53), ("causal inference", 104), ("causality", 80)],
    [("clustering", 116)],
    [("compression", 121), ("model compression", 135)],
    ###### COMPUTER VISION
    [
        ("object detection", 125),
    ],
    ###### CL
    [("contrastive learning", 344)],
    ###### CNNs
    [
        ("convolutional neural network", 76),
        ("convolutional neural networks", 130),
        ("cnn", 88),
    ],
    ###### DIFFUSION MODELS
    [("diffusion", 69), ("diffusion model", 167), ("diffusion models", 280)],
    ###### EXPLAINABLE AI
    [("explainability", 131), ("explainable ai", 92)],
    [("interpretability", 356)],
    [("fairness", 182)],
    [("federated learning", 485)],
    ###### GANS
    [
        ("generative adversarial network", 70),
        ("generative adversarial networks", 190),
        ("gan", 168),
        ("gans", 91),
    ],
    ###### GRAPH
    [
        ("graph", 48),
        ("graph neural network", 230),
        ("graph neural networks", 563),
        ("graph representation learning", 85),
        ("gnn", 64),
    ],
    ###### LLMS
    [
        ("llm", 80),
        ("large language model", 210),
        ("large language models", 447),
        ("prompting", 48),
    ],
    [("knowledge distillation", 211)],
    [
        (
            "natural language processing",
            433,
        ),
        ("nlp", 166),
        ("language model", 105),
        ("language models", 151),
        ("language modeling", 85),
        ("machine translation", 91),
        ("question answering", 59),
        ("reasoning", 85),
    ],
    ###### META-LEARNING
    [("meta learning", 121), ("meta-learning", 301)],
    [("network pruning", 48), ("pruning", 140)],
    [("neural architecture search", 180)],
    [("optimal transport", 165)],
    ###### OPTIMIZATION
    [
        ("stochastic gradient descent", 77),
        ("stochastic optimization", 56),
        ("sgd", 86),
        ("optimization", 410),
        ("non-convex optimization", 66),
        ("convex optimization", 57),
        ("gradient descent", 86),
        ("combinatorial optimization", 69),
        ("bayesian optimization", 64),
    ],
    ###### OUT-OF-DISTRIBUTION
    [
        ("out-of-distribution", 53),
        ("out-of-distribution detection", 92),
        ("out-of-distribution generalization", 59),
        ("distribution shift", 96),
    ],
    ###### PRIVACY
    [("differential privacy", 154), ("privacy", 99)],
    ###### RNNs
    [
        ("rnn", 65),
        ("recurrent neural network", 48),
        ("recurrent neural networks", 114),
        ("lstm", 66),
    ],
    ###### RL
    [("reinforcement learning", 1608), ("deep reinforcement learning", 298)],
    [("active learning", 131)],
    [("model-based reinforcement learning", 111)],
    [("multi-agent reinforcement learning", 162)],
    [("multi-task learning", 141)],
    [("imitation learning", 171)],
    [("offline reinforcement learning", 150), ("offline rl", 55)],
    [("continual learning", 339), ("lifelong learning", 82)],
    ### NOT RL
    [
        ("in-context learning", 105),
    ],
    [("few-shot learning", 218)],
    [("robustness", 411)],
    [("self-supervised learning", 473)],
    [("semi-supervised learning", 253)],
    [("time series", 129), ("time series forecasting", 54)],
    ###### TRANSFER LEARNING
    [
        ("transfer learning", 388),
        ("domain adaptation", 176),
        ("domain generalization", 124),
    ],
    ###### VISION
    [("vision transformer", 98), ("vision transformers", 51)],
    [("vision-language models", 48), ("clip", 70)],
]

In [20]:
# Number of keyword groups, i.e. the number of distinct labels besides
# "unlabeled".
len(final_keywords_groups)

45

In [21]:
# Maps every keyword listed in `final_keywords_groups` to the label of its
# group; all keywords of one group share the same label. The `######` comments
# separate the groups.
dict_keyword_to_label = {
    ###### ADVERSARIAL
    "adversarial": "adversarial",
    "adversarial attack": "adversarial",
    "adversarial attacks": "adversarial",
    "adversarial defense": "adversarial",
    "adversarial examples": "adversarial",
    "adversarial learning": "adversarial",
    "adversarial machine learning": "adversarial",
    "adversarial robustness": "adversarial",
    "adversarial training": "adversarial",
    ###### TRANSFORMERS
    "attention": "transformers",
    "attention mechanism": "transformers",
    "transformer": "transformers",
    "transformers": "transformers",
    "self-attention": "transformers",
    ###### AUTOENCODERS
    "autoencoder": "autoencoders",
    "autoencoders": "autoencoders",
    "vae": "autoencoders",
    "variational autoencoder": "autoencoders",
    "variational autoencoders": "autoencoders",
    ######
    "anomaly detection": "anomaly detection",
    ###### CAUSALITY
    "causal discovery": "causality",
    "causal inference": "causality",
    "causality": "causality",
    ######
    "clustering": "clustering",
    ###### COMPRESSION
    "compression": "compression",
    "model compression": "compression",
    ######
    "object detection": "object detection",
    ######
    "contrastive learning": "contrastive learning",
    ###### CNNs
    "convolutional neural network": "CNNs",
    "convolutional neural networks": "CNNs",
    "cnn": "CNNs",
    ###### DIFFUSION MODELS
    "diffusion": "diffusion models",
    "diffusion model": "diffusion models",
    "diffusion models": "diffusion models",
    ###### EXPLAINABILITY
    "explainability": "explainability",
    "explainable ai": "explainability",
    ######
    "interpretability": "interpretability",
    ######
    "fairness": "fairness",
    ######
    "federated learning": "federated learning",
    ###### GANs
    "generative adversarial network": "GANs",
    "generative adversarial networks": "GANs",
    "gan": "GANs",
    "gans": "GANs",
    ###### GRAPHS
    "graph": "graphs",
    "graph neural network": "graphs",
    "graph neural networks": "graphs",
    "graph representation learning": "graphs",
    "gnn": "graphs",
    ###### LLMs
    "llm": "LLMs",
    "large language model": "LLMs",
    "large language models": "LLMs",
    "prompting": "LLMs",
    ######
    "knowledge distillation": "knowledge distillation",
    ###### LANGUAGE MODELS
    "natural language processing": "language models",
    "nlp": "language models",
    "language model": "language models",
    "language models": "language models",
    "language modeling": "language models",
    "machine translation": "language models",
    "question answering": "language models",
    "reasoning": "language models",
    ###### META LEARNING
    "meta learning": "meta learning",
    "meta-learning": "meta learning",
    ###### PRUNING
    "network pruning": "pruning",
    "pruning": "pruning",
    ######
    "neural architecture search": "neural architecture search",
    ######
    "optimal transport": "optimal transport",
    ###### OPTIMIZATION
    "stochastic gradient descent": "optimization",
    "stochastic optimization": "optimization",
    "sgd": "optimization",
    "optimization": "optimization",
    "non-convex optimization": "optimization",
    "convex optimization": "optimization",
    "gradient descent": "optimization",
    "combinatorial optimization": "optimization",
    "bayesian optimization": "optimization",
    ###### OUT-OF-DISTRIBUTION
    "out-of-distribution": "out-of-distribution",
    "out-of-distribution detection": "out-of-distribution",
    "out-of-distribution generalization": "out-of-distribution",
    "distribution shift": "out-of-distribution",
    ###### PRIVACY
    "differential privacy": "privacy",
    "privacy": "privacy",
    ###### RNNs
    "rnn": "RNNs",
    "recurrent neural network": "RNNs",
    "recurrent neural networks": "RNNs",
    "lstm": "RNNs",
    ###### REINFORCEMENT LEARNING
    "reinforcement learning": "RL",
    "deep reinforcement learning": "RL",
    ######
    "active learning": "active learning",
    ######
    "model-based reinforcement learning": "model-based RL",
    ######
    "multi-agent reinforcement learning": "multi-agent RL",
    ######
    "multi-task learning": "multi-task learning",
    ######
    "imitation learning": "imitation learning",
    ###### OFFLINE RL
    "offline reinforcement learning": "offline RL",
    "offline rl": "offline RL",
    ###### CONTINUAL LEARNING
    "continual learning": "continual learning",
    "lifelong learning": "continual learning",
    ######
    "in-context learning": "in-context learning",
    ######
    "few-shot learning": "few-shot learning",
    ######
    "robustness": "robustness",
    ######
    "self-supervised learning": "self-supervised learning",
    ######
    "semi-supervised learning": "semi-supervised learning",
    ###### TIME SERIES
    "time series": "time series",
    "time series forecasting": "time series",
    ###### TRANSFER LEARNING
    "transfer learning": "transfer learning",
    "domain adaptation": "transfer learning",
    "domain generalization": "transfer learning",
    ###### ViTs
    "vision transformer": "ViTs",
    "vision transformers": "ViTs",
    ###### VISION-LANGUAGE MODELS
    "vision-language models": "vision-language models",
    "clip": "vision-language models",
}

In [11]:
# Load the label -> color mapping stored with the iclr24v2 results.
# NOTE(review): absolute, user-specific path -- the notebook will not run on
# another machine/account without editing it.
# NOTE: `pickle.load` can execute arbitrary code; only load trusted files.
# The context manager closes the file handle (the original left it open).
with open(
    "/gpfs01/berens/user/rgonzalesmarquez/phd/iclr-dataset/results/variables/iclr24v2/dict_label_to_color.pkl",
    "rb",
) as pickle_in:
    dict_label_to_color = pickle.load(pickle_in)

## Assignment

In [27]:
import itertools


def assign_labels_and_colors(
    data, keywords_and_freqs, dict_keyword_to_label, dict_color_legend=None
):
    """Assign one label (and optionally one color) to every paper.

    For each paper, the keywords that do not appear in `keywords_and_freqs`
    are ignored; among the remaining ones the keyword with the LOWEST
    frequency is chosen (first one on ties) and mapped to its label. Papers
    with no keywords, or with no known keyword, get the label "unlabeled".

    Parameters
    ----------
    data: list of lists, len (n_samples)
        List with lists of keywords for every paper. An entry may be empty
        or None, both meaning "no keywords".
    keywords_and_freqs: list of lists, len (n_labels)
        List of keywords groups. Each group is a list of
        (keyword, frequency) tuples.
    dict_keyword_to_label: dict
        Dictionary assigning to each keyword its label (e.g. to all keywords
        in same subgroup same label). It is not modified.
    dict_color_legend: dict, len (n_labels), optional
        Dictionary assigning to each label (including "unlabeled") a color.
        If None, no colors are computed.

    Returns
    -------
    labels: array, shape (n_samples,)
        Label for each paper.
    colors: array, shape (n_samples,), or None
        Color for each paper; None if `dict_color_legend` is None.

    Raises
    ------
    KeyError
        If a keyword of `keywords_and_freqs` has no entry in
        `dict_keyword_to_label`.
    """

    # Flatten the groups into a single keyword -> frequency lookup.
    dict_freqs = dict(itertools.chain.from_iterable(keywords_and_freqs))
    # Sentinel far above any real frequency: "unlabeled" can only win the
    # minimum below when a paper has no known keyword at all.
    dict_freqs["unlabeled"] = 1e9

    # Extend a copy so that the caller's dictionary is left untouched.
    keyword_to_label = dict(dict_keyword_to_label)
    keyword_to_label["unlabeled"] = "unlabeled"

    # Fail early instead of silently producing None labels.
    missing = [kw for kw in dict_freqs if kw not in keyword_to_label]
    if missing:
        raise KeyError(
            f"Keywords without a label in dict_keyword_to_label: {missing}"
        )

    # Choose the rarest known keyword of each paper.
    chosen_keywords = []
    for list_keywords in data:
        if list_keywords is None:
            list_keywords = ()
        candidates = [
            kw if kw in dict_freqs else "unlabeled" for kw in list_keywords
        ] or ["unlabeled"]  # empty keyword list -> unlabeled
        # `min` returns the first minimal element, like `np.argmin`.
        chosen_keywords.append(min(candidates, key=dict_freqs.__getitem__))

    # Map chosen keywords to labels.
    if chosen_keywords:
        labels = np.array([keyword_to_label[kw] for kw in chosen_keywords])
    else:
        labels = np.array([], dtype=str)

    # Colors are optional.
    if dict_color_legend is None:
        return labels, None
    if labels.size == 0:
        # np.vectorize cannot be called on empty input without `otypes`.
        return labels, np.array([], dtype=str)
    colors = np.vectorize(dict_color_legend.get)(labels)

    return labels, colors

In [28]:
%%time
# Derive one label and one color per paper from its keyword list.
labels_iclr, colors_iclr = assign_labels_and_colors(
    data=iclr["keywords"].tolist(),
    keywords_and_freqs=final_keywords_groups,
    dict_keyword_to_label=dict_keyword_to_label,
    dict_color_legend=dict_label_to_color,
)

CPU times: user 1e+03 ms, sys: 389 ms, total: 1.39 s
Wall time: 1.05 s


In [30]:
# save
np.save(variables_path / "labels_iclr", labels_iclr)
np.save(variables_path / "colors_iclr", colors_iclr)

f = open(variables_path / "dict_label_to_color.pkl", "wb")
pickle.dump(dict_label_to_color, f)
f.close()

In [29]:
# How many papers ended up without a label, as a share and as a count.
is_unlabeled = labels_iclr == "unlabeled"
n_unlabeled = np.sum(is_unlabeled)
print(
    "Percentage of unlabeled papers: ",
    n_unlabeled / len(labels_iclr) * 100,
)
print("Number of unlabeled papers: ", n_unlabeled)

Percentage of unlabeled papers:  47.309118294519756
Number of unlabeled papers:  16333


In [30]:
# Share (in percent) of papers whose keyword list is empty.
n_without_keywords = np.sum([elem == [] for elem in iclr.keywords])
print(
    "Papers without any keywords: ",
    n_without_keywords / len(labels_iclr) * 100,
)

Papers without any keywords:  6.172517668868034


In [31]:
# Examples of papers with keywords that have not been assigned a label.
# We can see that they contain very general or very specific keywords that
# were filtered out in our selection (or no keywords at all).
iclr.keywords.to_numpy()[labels_iclr == "unlabeled"][:50]

array([list(['unsupervised learning']), list([]),
       list(['unsupervised learning', 'deep learning']), list([]),
       list(['deep learning', 'supervised learning']),
       list(['deep learning', 'unsupervised learning']), list([]),
       list(['theory', 'deep learning']), list([]),
       list(['deep learning', 'multi-modal learning', 'structured prediction']),
       list(['deep learning']), list([]),
       list(['deep learning', 'theory']),
       list(['deep learning', 'unsupervised learning', 'applications']),
       list(['deep learning']), list(['theory', 'deep learning']),
       list([]), list(['deep learning', 'applications']), list([]),
       list([]), list(['deep learning', 'unsupervised learning']),
       list(['deep learning']), list([]),
       list(['deep learning', 'supervised learning']),
       list(['deep learning', 'unsupervised learning']),
       list(['computer vision', 'deep learning']),
       list(['deep learning', 'computer vision']),
       list([

## Add column to dataframe and resave

In [34]:
# Dataframe before (re)assigning the labels. NOTE(review): a `labels` column
# is already shown here, so it presumably was stored in the parquet file by an
# earlier run.
iclr.head()

Unnamed: 0,year,id,title,abstract,authors,decision,scores,keywords,labels
0,2017,B1-Hhnslg,Prototypical Networks for Few-shot Learning,A recent approach to few-shot classification c...,"Jake Snell, Kevin Swersky, Richard Zemel",Reject,"[6, 4, 5]","[deep learning, transfer learning]",transfer learning
1,2017,B1-q5Pqxl,Machine Comprehension Using Match-LSTM and Ans...,Machine comprehension of text is an important ...,"Shuohang Wang, Jing Jiang",Accept (Poster),"[6, 6, 7]","[natural language processing, deep learning]",language models
2,2017,B16Jem9xe,Learning in Implicit Generative Models,Generative adversarial networks (GANs) provide...,"Shakir Mohamed, Balaji Lakshminarayanan",Invite to Workshop Track,"[8, 7, 6]",[unsupervised learning],unlabeled
3,2017,B16dGcqlx,Third Person Imitation Learning,Reinforcement learning (RL) makes it possible ...,"Bradly C Stadie, Pieter Abbeel, Ilya Sutskever",Accept (Poster),"[6, 5, 6]",[],unlabeled
4,2017,B184E5qee,Improving Neural Language Models with a Contin...,We propose an extension to neural network lang...,"Edouard Grave, Armand Joulin, Nicolas Usunier",Accept (Poster),"[7, 9, 5]",[natural language processing],language models


In [35]:
# Store the per-paper labels as a column (replaces an existing `labels` column).
iclr["labels"] = labels_iclr

In [36]:
# Display the dataframe again after assigning the `labels` column.
iclr.head()

Unnamed: 0,year,id,title,abstract,authors,decision,scores,keywords,labels
0,2017,B1-Hhnslg,Prototypical Networks for Few-shot Learning,A recent approach to few-shot classification c...,"Jake Snell, Kevin Swersky, Richard Zemel",Reject,"[6, 4, 5]","[deep learning, transfer learning]",transfer learning
1,2017,B1-q5Pqxl,Machine Comprehension Using Match-LSTM and Ans...,Machine comprehension of text is an important ...,"Shuohang Wang, Jing Jiang",Accept (Poster),"[6, 6, 7]","[natural language processing, deep learning]",language models
2,2017,B16Jem9xe,Learning in Implicit Generative Models,Generative adversarial networks (GANs) provide...,"Shakir Mohamed, Balaji Lakshminarayanan",Invite to Workshop Track,"[8, 7, 6]",[unsupervised learning],unlabeled
3,2017,B16dGcqlx,Third Person Imitation Learning,Reinforcement learning (RL) makes it possible ...,"Bradly C Stadie, Pieter Abbeel, Ilya Sutskever",Accept (Poster),"[6, 5, 6]",[],unlabeled
4,2017,B184E5qee,Improving Neural Language Models with a Contin...,We propose an extension to neural network lang...,"Edouard Grave, Armand Joulin, Nicolas Usunier",Accept (Poster),"[7, 9, 5]",[natural language processing],language models


In [37]:
# save
iclr.to_parquet(
    data_path / "iclr25v1.parquet",
    index=False,
    engine="pyarrow",
)