In [1]:
import os
import pickle
import collections
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import dok_matrix, csr_matrix

In [2]:
# Names of the datasets to process; each name is used as a sub-directory of
# dataset_dir and as the prefix of the files written to out_dir.
datasets_names = ("LSHTC1", "DMOZ", "WIKI_Small", "WIKI_50K", "WIKI_100K")
# Root directory holding one sub-directory per dataset.
dataset_dir = "../data"
# Directory that receives the pickled matrices / label arrays and converted files.
out_dir = "../data/parsed"

In [2]:
%%time

# Write features

def _parse_tf_split(path):
    """Parse one ``*.tf`` file into raw CSR components.

    Every line is split on whitespace; the first field is skipped and each
    remaining field must look like ``"<feature_id>:<count>"`` (two integers).
    A line without any feature fields becomes an empty row.

    Returns:
        (data, indices, indptr, width): ``data``, ``indices`` and ``indptr``
        are the arrays accepted by ``csr_matrix((data, indices, indptr))``,
        with one row per line of the file; ``width`` is the largest feature
        id seen plus one (0 when the file holds no features at all).

    Raises:
        ValueError: on a malformed field or a negative feature id.
    """
    data_chunks, indices_chunks, indptr = [], [], [0]
    width = 0
    with open(path) as fin:
        for line_no, doc in enumerate(fin, 1):
            row = {}
            for token_cnt_str in doc.split()[1:]:
                j, cnt = map(int, token_cnt_str.split(":"))
                row[j] = cnt  # a repeated feature id keeps its last count
            if row:
                if min(row) < 0:
                    raise ValueError("%s, line %d: negative feature id" % (path, line_no))
                width = max(width, max(row) + 1)
            # Zero counts are not stored; columns are kept sorted within a row.
            cols = sorted(j for j, cnt in row.items() if cnt != 0)
            indices_chunks.append(np.array(cols, dtype=np.int64))
            data_chunks.append(np.array([row[j] for j in cols], dtype=np.int32))
            indptr.append(indptr[-1] + len(cols))
    if data_chunks:
        data = np.concatenate(data_chunks)
        indices = np.concatenate(indices_chunks)
    else:
        # Empty file: np.concatenate rejects an empty list.
        data = np.empty(0, dtype=np.int32)
        indices = np.empty(0, dtype=np.int64)
    return data, indices, np.array(indptr, dtype=np.int64), width


os.makedirs(out_dir, exist_ok=True)

for dataset_name in tqdm(datasets_names):
    dataset_path = os.path.join(dataset_dir, dataset_name)
    print("Dataset %s" % dataset_name)

    # Parse every split exactly once. The number of columns is only known
    # after all splits have been read, so the matrices are assembled afterwards.
    parsed = {}
    for data_name in ("train", "test", "heldout"):
        data_path = os.path.join(dataset_path, "bikash_%s_remapped.tf" % data_name)
        parsed[data_name] = _parse_tf_split(data_path)
    words_cnt = max(split[3] for split in parsed.values())

    for data_name in ("train", "test", "heldout"):
        docs_cnt = len(parsed[data_name][2]) - 1
        print("\t%s size: (%d, %d)" % (data_name, docs_cnt, words_cnt))

    # Create sparse matrices; all splits of a dataset share the same width.
    for data_name in ("train", "test", "heldout"):
        # pop() lets the raw arrays of a split be freed once it is written.
        data, indices, indptr, _ = parsed.pop(data_name)
        matrix = csr_matrix((data, indices, indptr),
                            shape=(len(indptr) - 1, words_cnt), dtype=np.int32)
        out_path = os.path.join(out_dir, "%s_%s.dump" % (dataset_name, data_name))
        with open(out_path, "wb") as fout:
            pickle.dump(matrix, fout)

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset WIKI_Small
	train size: (796617, 380078)
	test size: (199155, 380078)
	heldout size: (5000, 380078)


 33%|███▎      | 1/3 [08:44<17:29, 524.92s/it]

Dataset WIKI_50K
	train size: (1102754, 951558)
	test size: (276939, 951558)
	heldout size: (5000, 951558)


 67%|██████▋   | 2/3 [21:49<10:54, 654.71s/it]

Dataset WIKI_100K
	train size: (2195530, 1271710)
	test size: (550133, 1271710)
	heldout size: (5000, 1271710)


100%|██████████| 3/3 [50:56<00:00, 1018.79s/it]

CPU times: user 43min 24s, sys: 4min 45s, total: 48min 10s
Wall time: 50min 56s





In [4]:
# Overrides the dataset list from the configuration cell: every cell below
# that iterates over datasets_names now only sees DMOZ.
datasets_names = ("DMOZ",)

In [17]:
%%time

# Write labels

os.makedirs(out_dir, exist_ok=True)

for dataset_name in tqdm(datasets_names):
    dataset_path = os.path.join(dataset_dir, dataset_name)
    print("Dataset %s" % dataset_name)

    # Maps the raw integer in the first field of a line to a dense class id.
    # Ids are handed out in first-seen order while reading train, then test,
    # then heldout, so one mapping is shared by all three splits.
    classes = {}
    labels_by_split = {}
    for data_name in ("train", "test", "heldout"):
        data_path = os.path.join(dataset_path, "bikash_%s_remapped.tf" % data_name)
        labels = []
        with open(data_path) as fin:
            for doc in fin:
                some_id = int(doc.split()[0])
                # An id is fixed the first time a raw label is seen, so the
                # labels can be encoded in the same pass that builds the mapping.
                labels.append(classes.setdefault(some_id, len(classes)))
        labels_by_split[data_name] = labels
    print("\tclasses count: %d" % len(classes))

    # Write one label array per split, row-aligned with the lines of its file.
    for data_name in ("train", "test", "heldout"):
        out_path = os.path.join(out_dir, "%s_%s_out.dump" % (dataset_name, data_name))
        with open(out_path, "wb") as fout:
            pickle.dump(np.array(labels_by_split[data_name]), fout)

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset DMOZ
	classes count: 27875


100%|██████████| 1/1 [00:13<00:00, 13.42s/it]

CPU times: user 12.3 s, sys: 653 ms, total: 13 s
Wall time: 13.4 s





In [19]:
# NOTE(review): `allabels` is not assigned in any cell visible here, so this
# would raise NameError after Restart & Run All unless it is defined elsewhere.
# TODO confirm where it comes from, or drop this cell.
len(allabels)

27875

In [71]:
# Presumably class pair (33, 212) with per-class sample counts — TODO confirm:
#   train: (849, 824)
#   test:  (220, 198)

In [None]:
# Presumably class pair (1927, 2226) with per-class sample counts — TODO confirm:
#   train: (903, 903)
#   test:  (222, 221)

Conversion to pickle protocol=2:

In [74]:
%%time

# Convert pickle protocol=3 to pickle protocol=2 for python2.

# (source pattern, python2 target pattern) pairs, both filled with
# (dataset_name, data_name): first the feature dump, then the "_out" dump.
name_patterns = (
    ("%s_%s.dump", "%s_%s_py2.dump"),
    ("%s_%s_out.dump", "%s_%s_py2_out.dump"),
)

# Only the first entry of datasets_names is converted.
for dataset_name in datasets_names[:1]:
    for data_name in ("train", "test", "heldout"):
        for src_pattern, dst_pattern in name_patterns:
            src_path = os.path.join(out_dir, src_pattern % (dataset_name, data_name))
            dst_path = os.path.join(out_dir, dst_pattern % (dataset_name, data_name))
            # Re-serialise the loaded object unchanged, just with protocol 2.
            with open(src_path, "rb") as fin, open(dst_path, "wb") as fout:
                pickle.dump(pickle.load(fin), fout, protocol=2)

CPU times: user 754 ms, sys: 996 ms, total: 1.75 s
Wall time: 2.07 s


Conversion to SVM-light format:

In [121]:
%%time

# Convert pickle protocol=3 to SVM-light format.

pos_label = 1927
neg_label = 2226
# False -> one-vs-rest: every row is written; rows labelled pos_label get "+1",
#          all other rows get "-1".
# True  -> one-vs-one: only rows labelled pos_label or neg_label are written.
one_vs_one = False


def _svmlight_lines(X, y, pos_label):
    """Yield one text line per (row of X, entry of y) pair.

    Args:
        X: CSR matrix whose column indices are already sorted within each row.
        y: sequence of labels, paired with the rows of X in order; pairing
           stops at the shorter of the two.
        pos_label: label written as "+1"; every other label is written as "-1".

    Yields:
        ``"<+1|-1> <col>:<value> <col>:<value> ...\\n"`` using the stored
        column indices as they are (no index shift is applied).
    """
    # Slice the CSR arrays directly: iterating over the matrix itself would
    # build a new one-row sparse matrix for every row.
    indptr, indices, values = X.indptr, X.indices, X.data
    for row, label in zip(range(X.shape[0]), y):
        start, stop = indptr[row], indptr[row + 1]
        pairs = zip(indices[start:stop].tolist(), values[start:stop].tolist())
        obj_str = " ".join("%d:%d" % p for p in pairs)
        label_str = "+1" if label == pos_label else "-1"
        yield "%s %s\n" % (label_str, obj_str)


# Only the last entry of datasets_names is converted.
for dataset_name in datasets_names[-1:]:
    for data_name in ("train", "test", "heldout"):
        data_path = os.path.join(out_dir, "%s_%s.dump" % (dataset_name, data_name))
        data_out_path = os.path.join(out_dir, "%s_%s_out.dump" % (dataset_name, data_name))
        svmlight_path = os.path.join(out_dir, "%s_%s_svmlight.txt" % (dataset_name, data_name))
        # pickle.load can execute arbitrary code: only load dumps you wrote yourself.
        with open(data_path, "rb") as fin:
            X = pickle.load(fin)
        with open(data_out_path, "rb") as fin:
            y = pickle.load(fin)
        if one_vs_one:
            keep = (y == pos_label) | (y == neg_label)
            X, y = X[keep], y[keep]
        X.sort_indices()
        with open(svmlight_path, "w") as fout:
            fout.writelines(_svmlight_lines(X, y, pos_label))

CPU times: user 3min 39s, sys: 4.06 s, total: 3min 44s
Wall time: 3min 52s


---