## Imports

In [1]:
import numpy as np
import plotly.express as px
from scipy.sparse import issparse
from sklearn.datasets import fetch_rcv1

## EDA

In [3]:
# Load the RCV1 dataset with scikit-learn's fetch_rcv1 loader (default arguments).
rcv1 = fetch_rcv1()

In [4]:
# Inspect the type of the container object returned by fetch_rcv1().
type(rcv1)

### General information

In [6]:
# Shape of the feature matrix: (number of samples, number of features).
rcv1.data.shape

(804414, 47236)

In [7]:
# Shape of the label matrix: (number of samples, number of categories).
rcv1.target.shape

(804414, 103)

In [8]:
# Category codes; used further down to label the columns of rcv1.target.
rcv1.target_names

array(['C11', 'C12', 'C13', 'C14', 'C15', 'C151', 'C1511', 'C152', 'C16',
       'C17', 'C171', 'C172', 'C173', 'C174', 'C18', 'C181', 'C182',
       'C183', 'C21', 'C22', 'C23', 'C24', 'C31', 'C311', 'C312', 'C313',
       'C32', 'C33', 'C331', 'C34', 'C41', 'C411', 'C42', 'CCAT', 'E11',
       'E12', 'E121', 'E13', 'E131', 'E132', 'E14', 'E141', 'E142',
       'E143', 'E21', 'E211', 'E212', 'E31', 'E311', 'E312', 'E313',
       'E41', 'E411', 'E51', 'E511', 'E512', 'E513', 'E61', 'E71', 'ECAT',
       'G15', 'G151', 'G152', 'G153', 'G154', 'G155', 'G156', 'G157',
       'G158', 'G159', 'GCAT', 'GCRIM', 'GDEF', 'GDIP', 'GDIS', 'GENT',
       'GENV', 'GFAS', 'GHEA', 'GJOB', 'GMIL', 'GOBIT', 'GODD', 'GPOL',
       'GPRO', 'GREL', 'GSCI', 'GSPO', 'GTOUR', 'GVIO', 'GVOTE', 'GWEA',
       'GWELF', 'M11', 'M12', 'M13', 'M131', 'M132', 'M14', 'M141',
       'M142', 'M143', 'MCAT'], dtype=object)

### Target distributions

In [9]:
# Number of samples carrying each category: column sums of the label matrix,
# converted to a flat 1-D NumPy array.
target_counts = np.asarray(rcv1.target.sum(axis=0)).flatten()

# How many categories to show at each end of the frequency ranking.
top_n = 20

# Sort the category indices once (ascending by count) and slice both ends from
# the same ordering instead of sorting the array twice.
ascending_order = target_counts.argsort()
top_indices = ascending_order[::-1][:top_n]  # most frequent first
low_indices = ascending_order[:top_n]        # least frequent first

In [14]:
# Bar chart of the `top_n` most frequent categories: category code on the
# x-axis, number of samples carrying it on the y-axis.
top_labels = rcv1.target_names[top_indices]
top_counts = target_counts[top_indices]

fig = px.bar(x=top_labels, y=top_counts)
fig.update_layout(
    title=f"Top {top_n} categories",
    xaxis_title="Labels",
    yaxis_title="Count",
    width=600,
)
fig.show()

In [15]:
# Bar chart of the `top_n` least frequent categories: category code on the
# x-axis, number of samples carrying it on the y-axis.
rare_labels = rcv1.target_names[low_indices]
rare_counts = target_counts[low_indices]

fig = px.bar(x=rare_labels, y=rare_counts)
fig.update_layout(
    title=f"Least {top_n} categories",
    xaxis_title="Labels",
    yaxis_title="Count",
    width=600,
)
fig.show()

### Label distribution per document

In [17]:
# Number of labels attached to each document: row sums of the label matrix,
# flattened into a 1-D NumPy array.
per_doc_sums = rcv1.target.sum(axis=1)
labels_per_doc = np.asarray(per_doc_sums).flatten()

In [29]:
# Histogram of how many labels each document carries.
# NOTE(review): the values are integer counts; with nbins=10 adjacent counts
# may be merged into one bar -- confirm this granularity is intended.
fig = px.histogram(labels_per_doc, nbins=10)
fig.update_layout(
    title="Distribution per document",
    xaxis_title="Number of labels",
    # Set the y-axis title explicitly, as the two bar charts above do.
    yaxis_title="Number of documents",
    width=600,
    showlegend=False,
)
# Outline the bars so neighbouring bins stay visually distinct.
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()

### Insights

In [31]:
# The computation below calls count_nonzero() as a method of the feature
# matrix, so fail early with a clear message if it is not a SciPy sparse matrix.
assert issparse(rcv1.data), "rcv1.data is expected to be a SciPy sparse matrix"

# Sparsity = share of entries that are zero, reported as a percentage.
n_rows, n_cols = rcv1.data.shape
nonzero_fraction = rcv1.data.count_nonzero() / (n_rows * n_cols)

print(f"Matrix type: {type(rcv1.data)}")
print(f"Sparsity: {100 * (1.0 - nonzero_fraction):.2f}%")

Matrix type: <class 'scipy.sparse._csr.csr_matrix'>
Sparesness: 99.84%


In [32]:
# Print the first row of the feature matrix; the printed form lists the stored
# (row, column) coordinates together with their values.
print(rcv1.data[0])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 138 stored elements and shape (1, 47236)>
  Coords	Values
  (0, 863)	0.0497399253756197
  (0, 1522)	0.044664135988103
  (0, 1680)	0.0673871572152868
  (0, 2292)	0.0718104827746566
  (0, 2844)	0.0657133637266077
  (0, 2866)	0.0653401708076665
  (0, 3239)	0.0795167845321379
  (0, 4124)	0.0423215276156812
  (0, 4270)	0.0691368598826452
  (0, 4664)	0.0500863047167235
  (0, 5215)	0.252185352537681
  (0, 5572)	0.0672561839956375
  (0, 5698)	0.0594998147298331
  (0, 5793)	0.0737821454910533
  (0, 6221)	0.12450060912141
  (0, 6591)	0.101431159576997
  (0, 7226)	0.194090655513477
  (0, 7974)	0.0766400848671463
  (0, 8144)	0.0295331356836656
  (0, 8758)	0.0595662280181838
  (0, 8770)	0.130789753977649
  (0, 8900)	0.052116236521377
  (0, 8926)	0.0367838394252549
  (0, 8939)	0.0479419428634425
  (0, 9106)	0.0533192746608269
  :	:
  (0, 37134)	0.130451416094855
  (0, 37663)	0.026372919176391
  (0, 37939)	0.0442871530087983
  (0, 39032)	0

## Summary



*   The dataset consists of 804414 samples, each represented as a TF-IDF vector with 47236 features
*   Each target is a multi-hot (binary indicator) vector over the 103 categories, with `1` at every category assigned to the sample
*   Class imbalance is visible in the category-count plots above
*   The majority of documents have 2-3 class labels, which is also roughly the per-document average

