# Exploratory Data Analysis of the OGBN-ARXIV Dataset

This notebook provides a simple EDA of the `ogbn-arxiv` dataset from the Open Graph Benchmark (OGB).

In [14]:
from ogb.nodeproppred import NodePropPredDataset
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
import torch

warnings.filterwarnings("ignore", message=".*weights_only=False.*")

## Load Dataset

In [16]:
dataset = NodePropPredDataset(name="ogbn-arxiv", root="../../data/ogbn")
graph, labels = dataset[0]

RecursionError: maximum recursion depth exceeded

In [3]:
node_feat = pd.DataFrame(data=graph['node_feat'])
edge_feat = pd.DataFrame(data=graph['edge_feat'])
edge_index = pd.DataFrame(data=graph['edge_index'])
node_year = pd.DataFrame(data=graph['node_year'])
node_feat['label'] = labels.flatten()

In [4]:
node_feat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,label
0,-0.057943,-0.05253,-0.072603,-0.026555,0.130435,-0.241386,-0.449242,-0.018443,-0.087218,0.11232,...,-0.226118,-0.185603,0.05323,0.332873,0.104175,0.007408,0.173364,-0.172796,-0.140059,4
1,-0.1245,-0.070665,-0.325202,0.007779,-0.001559,0.074189,-0.191013,0.049689,0.026369,0.099364,...,0.052926,-0.258378,0.021567,0.281503,-0.173423,0.202082,0.068524,-0.372111,-0.301036,5
2,-0.080242,-0.023328,-0.183787,-0.180707,0.075765,-0.125818,-0.394573,-0.219078,-0.108931,0.056966,...,-0.070291,-0.177562,-0.214012,0.182186,-0.121589,-0.073642,0.109919,0.117589,-0.139883,28
3,-0.145044,0.054915,-0.126666,0.039971,-0.055909,-0.101278,-0.339202,-0.115801,-0.080058,-0.001633,...,0.042735,0.066338,-0.226921,0.188418,-0.017295,0.063449,0.017816,0.085364,-0.081804,8
4,-0.071154,0.070766,-0.281432,-0.161892,-0.165246,-0.029116,-0.338593,-0.138727,0.100015,0.132794,...,0.047475,-0.263795,0.026462,0.376349,-0.253772,0.084472,0.098033,-0.075347,-0.111687,27


In [5]:
edge_feat.head()

In [6]:
edge_index.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1166233,1166234,1166235,1166236,1166237,1166238,1166239,1166240,1166241,1166242
0,104447,15858,107156,107156,107156,107156,141536,141536,141536,141536,...,45118,45118,45118,45118,45118,45118,45118,45118,45118,45118
1,13091,47283,69161,136440,107366,158460,90124,121740,122427,161023,...,144525,49499,29457,144665,135414,79124,147994,162473,162537,72717


In [7]:
node_year.head()

Unnamed: 0,0
0,2013
1,2015
2,2014
3,2014
4,2014


In [None]:
edges = pd.read_csv()

## Summary Statistics and View Data

In [None]:
node_feat.info()

In [None]:
node_feat.head()

## Feature stats

In [None]:
node_feat.describe()

## Missing values

In [None]:
node_feat.isna().sum().sum()

## Target distribution

In [None]:
node_feat['label'].value_counts().sort_index().plot(kind='bar')
plt.title("Target distribution")
plt.xlabel("Class")
plt.ylabel("Count")
plt.show()

## Embedding Visualization

In [None]:
sample_size = 5000

X = node_feat.drop('label', axis=1).values
y = node_feat['label'].values

idx = np.random.choice(X.shape[0], sample_size, replace=False)
X_sample = X[idx]
y_sample = y[idx]

tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_2d = tsne.fit_transform(X_sample)

plt.scatter(X_2d[:,0], X_2d[:,1], c=y_sample, cmap="tab20", s=5, alpha=0.7)
plt.title("t-SNE of node features (sampled)")
plt.show()