In [1]:
import os
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.svm import SVC, LinearSVC

In [2]:
# Bag-of-words vocabulary: terms found in fewer than 3 documents, or in more
# than half of all documents, are left out.
vectorizer = CountVectorizer(
    max_df=0.5,
    min_df=3,
)

In [3]:
# Load the train and test portions of 20 Newsgroups as one corpus
# (scikit-learn downloads the data first if it is not available locally).
# NOTE(review): `remove` is left at its default, so headers, footers and
# quoted replies stay in the documents — confirm this is intended.
newsgroups = fetch_20newsgroups(
    subset="all",
    random_state=0,
)

In [4]:
%%time

# Targets come straight from the dataset object; the document-term count
# matrix is built by learning the vocabulary and counting in a single call.
y = newsgroups.target
X = vectorizer.fit_transform(newsgroups.data)

CPU times: user 4.77 s, sys: 97.1 ms, total: 4.87 s
Wall time: 4.9 s


In [5]:
# First split: 70% of the documents go to training, the other 30% form a
# pool ("pretest") that is divided into held-out and test parts afterwards.
X_train, X_pretest, y_train, y_pretest = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [6]:
# Second split: two thirds of the pretest pool become the test set, the
# remaining third becomes the held-out set.
X_heldout, X_test, y_heldout, y_test = train_test_split(
    X_pretest,
    y_pretest,
    test_size=2 / 3,
    random_state=0,
)

In [7]:
# Persist every split (feature matrix and label array) as a pickle file
# under `out_dir`, so later cells or notebooks can reload them.
out_dir = "../data/parsed"

# Create the target directory up front: on a fresh checkout it does not
# exist yet and the first open(..., "wb") would raise FileNotFoundError.
os.makedirs(out_dir, exist_ok=True)

# File-name suffix -> object to serialise. "<part>" is the feature matrix of
# that split, "<part>_out" the matching label array.
splits_to_dump = {
    "train": X_train,
    "train_out": y_train,
    "heldout": X_heldout,
    "heldout_out": y_heldout,
    "test": X_test,
    "test_out": y_test,
}

for suffix, payload in splits_to_dump.items():
    with open(os.path.join(out_dir, "20newsgroups_%s.dump" % suffix), "wb") as fout:
        pickle.dump(payload, fout)

In [None]:
%%time

# Read the dataset: reload the six pickled objects (features and labels for
# the train / heldout / test splits) of the selected dataset from `out_dir`.
#
# NOTE(review): this rebinds X_train, y_train, X_heldout, y_heldout, X_test
# and y_test. With a dataset other than "20newsgroups" selected, every later
# cell runs on that dataset instead of the split built earlier in this
# notebook — confirm the selection before a Restart & Run All.
# SECURITY: pickle.load can execute arbitrary code contained in the file;
# only load dumps that you produced yourself.

# dataset_name = "WIKI_Small"
# dataset_name = "DMOZ"
dataset_name = "LSHTC1"
# dataset_name = "20newsgroups"


def _load_dump(part):
    """Unpickle ``<out_dir>/<dataset_name>_<part>.dump`` and return the stored object."""
    with open(os.path.join(out_dir, "%s_%s.dump" % (dataset_name, part)), "rb") as fin:
        return pickle.load(fin)


X_train, y_train = _load_dump("train"), _load_dump("train_out")
X_heldout, y_heldout = _load_dump("heldout"), _load_dump("heldout_out")
X_test, y_test = _load_dump("test"), _load_dump("test_out")

# n_classes is the largest label seen in any split, plus one. The label
# arrays loaded above are reused instead of reading the three *_out.dump
# files from disk a second time.
n_classes = 0
for labels in (y_train, y_heldout, y_test):
    n_classes = max(n_classes, max(labels) + 1)

In [8]:
%%time

# Fit the TF-IDF transformer on the training split only, then apply it to
# all three splits. Each name is rebound to its transformed matrix.
# NOTE(review): because the inputs are overwritten, running this cell a
# second time applies the transform to already-transformed matrices.
tfidf = TfidfTransformer()
tfidf.fit(X_train)
X_train, X_heldout, X_test = (
    tfidf.transform(counts, copy=False)
    for counts in (X_train, X_heldout, X_test)
)

CPU times: user 103 ms, sys: 21.6 ms, total: 124 ms
Wall time: 124 ms


In [9]:
# Show the repr of the training matrix after the TF-IDF cell.
X_train

<13192x52577 sparse matrix of type '<class 'numpy.float64'>'
	with 1717649 stored elements in Compressed Sparse Row format>

---

In [None]:
%%time

# Support vector classifier with a linear kernel, fitted on the training split.
# NOTE(review): the LinearSVC cell that follows rebinds `clf`, so when both
# cells run, the model trained here is discarded.
clf = SVC(
    kernel="linear",
    C=1.0,
    random_state=0,
)
clf.fit(X_train, y_train)

In [10]:
%%time

# Linear SVM using the Crammer-Singer multi-class formulation, fitted on the
# training split.
# Per the scikit-learn documentation, `loss`, `penalty` and `dual` are
# ignored when multi_class="crammer_singer"; they are kept here unchanged.
# NOTE(review): max_iter=1 limits the solver to a single iteration —
# confirm this is intentional.
clf = LinearSVC(
    multi_class="crammer_singer",
    C=1.0,
    max_iter=1,
    random_state=0,
    penalty="l2",
    loss="squared_hinge",
    dual=False,
)
clf.fit(X_train, y_train)

CPU times: user 40 s, sys: 252 ms, total: 40.3 s
Wall time: 42.1 s


---

Train quality:

In [11]:
# Predict labels for the training split with the fitted classifier (timed).
%time y_pred_train = clf.predict(X_train)

CPU times: user 35.2 ms, sys: 2.45 ms, total: 37.7 ms
Wall time: 36.2 ms


In [12]:
# Macro-averaged F1 on the training split.
f1_score(y_true=y_train, y_pred=y_pred_train, average="macro")

0.9973810780978599

In [13]:
# Micro-averaged F1 on the training split.
f1_score(y_true=y_train, y_pred=y_pred_train, average="micro")

0.9974226804123711

Test quality:

In [14]:
# Predict labels for the test split with the fitted classifier (timed).
%time y_pred_test = clf.predict(X_test)

CPU times: user 18.5 ms, sys: 2.36 ms, total: 20.8 ms
Wall time: 19.2 ms


In [15]:
# Macro-averaged F1 on the test split.
f1_score(y_true=y_test, y_pred=y_pred_test, average="macro")

0.9206901909395956

In [16]:
# Micro-averaged F1 on the test split.
f1_score(y_true=y_test, y_pred=y_pred_test, average="micro")

0.9222811671087533

Heldout quality:

In [17]:
# Predict labels for the held-out split with the fitted classifier (timed).
%time y_pred_heldout = clf.predict(X_heldout)

CPU times: user 11.8 ms, sys: 1.81 ms, total: 13.6 ms
Wall time: 14.5 ms


In [18]:
# Macro-averaged F1 on the held-out split.
f1_score(y_true=y_heldout, y_pred=y_pred_heldout, average="macro")

0.9192936738490263

In [19]:
# Micro-averaged F1 on the held-out split.
f1_score(y_true=y_heldout, y_pred=y_pred_heldout, average="micro")

0.9209129511677282

---