In [11]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz, csr_matrix
from joblib import dump
from sklearn.model_selection import train_test_split

In [12]:
# Version suffix interpolated into the input and output file names used in this notebook.
data_version = 0

In [13]:
# Resolve the versioned input paths first so the load calls stay short.
vectorized_path = f'./data/gp_comments/data_vectorized_{data_version}.npz'
word_proba_path = f'./data/gp_comments/word_probabilities_{data_version}.csv'

# Sparse matrix from the .npz file, and the word table from the CSV (first column as index).
data = load_npz(vectorized_path)
word_proba = pd.read_csv(word_proba_path, index_col=0)

# Show both shapes side by side for a quick visual consistency check.
print(data.shape, word_proba.shape)

(18344243, 2511) (2511, 2)


In [14]:
# Target table: the first CSV column becomes the index.
targets_path = './data/gp_comments/targets.csv'
targets = pd.read_csv(targets_path, index_col=0)

# Preview the first rows.
targets.head()

Unnamed: 0,score,labels
0,5,1
6,5,1
8,5,1
9,5,1
10,5,1


In [31]:
# Draw a 100k-row sample stratified by score.
# Row positions are split instead of the matrix itself, so the large remainder is
# never materialised as a second sparse matrix; only the sampled rows are sliced
# out of `data`. The split arguments (size, seed, stratification) are unchanged.
_rest_idx, sample_idx = train_test_split(
    np.arange(data.shape[0]),
    test_size=100_000,
    random_state=42,
    stratify=targets['score'],
)
del _rest_idx  # positions of the non-sampled rows are not needed

# `sample_idx` is kept so other per-row tables can be aligned with the sample.
data_sample = data[sample_idx]

In [32]:
# Sanity check: display the shape of the sampled matrix.
data_sample.shape

(100000, 2511)

In [33]:
def reveal_sparse(sparse_1d):
    """Expand a single-row sparse matrix into one row per stored entry.

    Each stored value of the input row becomes its own row in the result,
    placed in the same column it occupied in the input.

    Parameters
    ----------
    sparse_1d : scipy sparse matrix of shape (1, n_cols)
        Any format that supports ``.tocsr()``.

    Returns
    -------
    csr_matrix of shape (n_stored, n_cols)
        Row ``i`` holds only the ``i``-th stored value of the input.

    Raises
    ------
    ValueError
        If the input does not have exactly one row.
    """
    if sparse_1d.shape[0] != 1:
        raise ValueError(f'1d matrix required, got matrix with shape: {sparse_1d.shape}')

    # `.indices` are column indices only in CSR layout (in CSC they are row
    # indices, and COO has no `.indices` at all), so normalise the format first.
    row = sparse_1d.tocsr()

    values = row.data
    rows_count, cols_count = len(values), row.shape[1]
    col_ind = row.indices
    row_ind = np.arange(rows_count)  # one output row per stored value

    return csr_matrix((values, (row_ind, col_ind)), shape=(rows_count, cols_count))

In [34]:
# Walk the sample row by row and expand each row into its revealed form.
data_revealed = [reveal_sparse(row) for row in data_sample]

In [35]:
# Probability column of the word table; `get_labels` multiplies each revealed matrix by it.
# NOTE(review): assumes the rows of `word_proba` are in the same order as the columns of `data` — confirm.
probas = word_proba['probability']
def get_labels(sparse_revealed, word_probas=None):
    """Mark the row of a revealed matrix with the highest probability score.

    Every row is scored by its dot product with the probability vector; the
    row with the largest score (the first one on ties) is labelled 1 and all
    other rows 0.

    Parameters
    ----------
    sparse_revealed : sparse matrix of shape (n_rows, n_cols)
    word_probas : array-like of length n_cols, optional
        Probability vector to score against. Defaults to the notebook-level
        ``probas``.

    Returns
    -------
    list of int
        One 0/1 label per row; an empty list when the matrix has no rows.
    """
    if word_probas is None:
        word_probas = probas

    rows_count = sparse_revealed.shape[0]
    labels = [0] * rows_count
    # argmax raises on an empty array, so a matrix with no rows gets no labels.
    if rows_count == 0:
        return labels

    row_proba = sparse_revealed.dot(word_probas)
    labels[np.argmax(row_proba)] = 1

    return labels

In [36]:
# One label list per revealed matrix, in the same order as `data_revealed`.
labels_revealed = [get_labels(revealed) for revealed in data_revealed]

# Peek at the labels of the first sampled row.
labels_revealed[0]

[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]

In [37]:
# Persist the revealed matrices with joblib under the versioned file name.
revealed_path = f'./data/gp_comments/data_vectorized_revealed_100k_{data_version}.sav'
dump(data_revealed, revealed_path)

['./data/gp_comments/data_vectorized_revealed_100k_0.sav']

In [38]:
# Persist the label lists with joblib under the versioned file name.
labels_path = f'./data/gp_comments/labels_revealed_100k_{data_version}.sav'
dump(labels_revealed, labels_path)

['./data/gp_comments/labels_revealed_100k_0.sav']