In [1]:
import gzip, json
from shared import dataset_local_path, TODO
from dataclasses import dataclass
from typing import Dict, List

In [11]:
@dataclass
class JustWikiPage:
    """A single page record as loaded from the pages JSON-lines file."""

    # Page title.
    title: str
    # Page identifier; JustWikiLabel records carry a field of the same name.
    wiki_id: str
    # Text of the page.
    body: str


# Load every page record from the gzipped JSON-lines file into this pages list.
pages: List[JustWikiPage] = []
# Decode explicitly as UTF-8: the default text encoding is platform-dependent,
# and the page text contains non-ASCII characters (accented titles/bodies).
with gzip.open(
    dataset_local_path("tiny-wiki.jsonl.gz"), "rt", encoding="utf-8"
) as fp:
    for line in fp:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        entry = json.loads(line)
        # The JSON object's keys are passed straight through as field names.
        pages.append(JustWikiPage(**entry))

In [38]:
@dataclass
class JustWikiLabel:
    """A truth label attached to one page identifier."""

    # Identifier of the page this label belongs to.
    wiki_id: str
    # The judgment for that page (read from the "truth_value" entry when loaded).
    is_literary: bool


# Load our judgments/labels/truths/ys into this labels list:
labels: List[JustWikiLabel] = []
# Read as UTF-8 explicitly so decoding does not depend on the platform default.
with open(dataset_local_path("tiny-wiki-labels.jsonl"), encoding="utf-8") as fp:
    for line in fp:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        entry = json.loads(line)
        # Only two entries are used: the page id and its truth value.
        labels.append(
            JustWikiLabel(wiki_id=entry["wiki_id"], is_literary=entry["truth_value"])
        )


@dataclass
class JoinedWikiData:
    """A page's title and body combined with its is_literary label."""

    wiki_id: str
    is_literary: bool
    title: str
    body: str

    def __str__(self) -> str:
        """Return the four fields as ``name: value`` lines, one field per line.

        The text is built and returned rather than printed, so ``str(obj)``
        has no side effects and ``print(obj)`` shows the record.
        """
        return "\n".join(
            [
                f"wiki_id: {self.wiki_id}",
                f"is_literary: {self.is_literary}",
                f"title: {self.title}",
                f"body: {self.body}",
            ]
        )

In [39]:
# Join pages with their labels on wiki_id.
#
# Keys of joined_data are integer positions (0, 1, 2, ...) of the pages when
# ordered by wiki_id, so positional lookups such as joined_data[1] keep working.
joined_data: Dict[int, JoinedWikiData] = {}

# Both lists are still sorted in place so their ordering afterwards is the same
# as before; the join itself no longer depends on the two orderings lining up.
pages.sort(key=lambda x: x.wiki_id)
labels.sort(key=lambda x: x.wiki_id)

# Look labels up by id instead of pairing by position: positional pairing
# silently mislabels pages whenever an id is missing, extra, or out of step.
label_by_id = {label.wiki_id: label.is_literary for label in labels}

unlabeled_ids = [page.wiki_id for page in pages if page.wiki_id not in label_by_id]
if unlabeled_ids:
    raise ValueError(
        "{} page(s) have no label, e.g. {}".format(
            len(unlabeled_ids), unlabeled_ids[:5]
        )
    )

for i, page in enumerate(pages):
    joined_data[i] = JoinedWikiData(
        wiki_id=page.wiki_id,
        is_literary=label_by_id[page.wiki_id],
        title=page.title,
        body=page.body,
    )

In [40]:
# Spot check: show the joined record stored under key 1.
print(joined_data[1])

wiki_id: enwiki:%C3%81ngeles%20sin%20para%C3%ADso
is_literary: False
title: Ángeles sin paraíso
body: Ángeles sin paraíso (English title: Angels without paradise) is a Mexican Children's telenovela produced by Pedro Damián for Televisa in 1992. This telenovela is remembered by the public as one of the first children's Mexican telenovela.

Anahí and Felipe Colombo starred as child protagonists, while Patricia Bernal starred as main antagonist.

Category:1992 telenovelasCategory:Mexican telenovelasCategory:1992 Mexican television series debutsCategory:1993 Mexican television series endingsCategory:Spanish-language telenovelasCategory:Television shows set in MexicoCategory:Televisa telenovelasCategory:Children's telenovelas



In [25]:

# Sanity checks on the join: one joined record per page and per label.
assert len(pages) == len(joined_data)
assert len(labels) == len(joined_data)
# At least one record must be labelled literary...
assert any(record.is_literary for record in joined_data.values())
# ...and at least one must be labelled non-literary.
assert any(not record.is_literary for record in joined_data.values())

# Build the ML problem: page bodies are the inputs, is_literary flags the targets.
# Both lists follow the iteration order of joined_data, so they stay aligned.
ys = [record.is_literary for record in joined_data.values()]
examples = [record.body for record in joined_data.values()]

## Split the raw text first; features are computed only after splitting.
from sklearn.model_selection import train_test_split
import numpy as np

RANDOM_SEED = 1234

## Step 1: hold out the test set; the remaining 75% is train+validate (tv).
ex_tv, ex_test, y_tv, y_test = train_test_split(
    examples, ys, train_size=0.75, shuffle=True, random_state=RANDOM_SEED
)
## Step 2: divide the tv portion into training (66% of it) and validation.
ex_train, ex_vali, y_train, y_vali = train_test_split(
    ex_tv,
    y_tv,
    train_size=0.66,
    shuffle=True,
    random_state=RANDOM_SEED,
)

## Turn text into TF-IDF feature matrices (TFIDF will be explained eventually.)
from sklearn.feature_extraction.text import TfidfVectorizer

# The vocabulary (columns) is learned from the training texts only, to be fair:
# the validation and test texts must not influence which words get a column.
word_to_column = TfidfVectorizer(
    strip_accents="unicode",
    lowercase=True,
    stop_words="english",
    max_df=0.5,
).fit(ex_train)

# Apply the same fitted vectorizer to each split.
X_train, X_vali, X_test = (
    word_to_column.transform(texts) for texts in (ex_train, ex_vali, ex_test)
)


print("Ready to Learn!")
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# Every model is seeded with RANDOM_SEED so that a fresh run of the notebook
# reproduces the same validation numbers; unseeded, these estimators can give
# different results from one run to the next.
models = {
    "SGDClassifier": SGDClassifier(random_state=RANDOM_SEED),
    "Perceptron": Perceptron(random_state=RANDOM_SEED),
    "LogisticRegression": LogisticRegression(random_state=RANDOM_SEED),
    "DTree": DecisionTreeClassifier(random_state=RANDOM_SEED),
}

# Fit each model on the training split and report accuracy and AUC on validation.
for name, m in models.items():
    m.fit(X_train, y_train)
    print("{}:".format(name))
    print("\tVali-Acc: {:.3}".format(m.score(X_vali, y_vali)))
    # AUC needs a continuous score per example: use decision_function when the
    # model has one, otherwise the probability of the positive class.
    if hasattr(m, "decision_function"):
        scores = m.decision_function(X_vali)
    else:
        scores = m.predict_proba(X_vali)[:, 1]
    print("\tVali-AUC: {:.3}".format(roc_auc_score(y_score=scores, y_true=y_vali)))

Ready to Learn!
SGDClassifier:
	Vali-Acc: 0.858
	Vali-AUC: 0.898
Perceptron:
	Vali-Acc: 0.827
	Vali-AUC: 0.877
LogisticRegression:
	Vali-Acc: 0.802
	Vali-AUC: 0.9
DTree:
	Vali-Acc: 0.747
	Vali-AUC: 0.719
