In [7]:
import json
from dataclasses import dataclass
from typing import Any, Dict, List

import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from shared import dataset_local_path, TODO

In [38]:
# Parallel lists: examples[i] is the feature dictionary of record i and
# ys[i] is that record's label.
examples = []
ys = []

# Read the JSON-lines dataset (one JSON object per line). The encoding is
# given explicitly so decoding does not depend on the platform's locale.
with open(dataset_local_path("poetry_id.jsonl"), encoding="utf-8") as fp:
    for line in fp:
        # Skip whitespace-only lines; json.loads would raise on them.
        if not line.strip():
            continue
        info = json.loads(line)
        # Note: the data contains a whole bunch of extra stuff; we just want
        # the entries under "features" for now.
        keep = info["features"]
        # Whether or not it's poetry is our label.
        ys.append(info["poetry"])
        # Hold onto this single dictionary.
        examples.append(keep)

In [40]:
# Learn the feature-name -> column-index mapping (names sorted) and build the
# feature matrix in a single call; this is equivalent to calling
# feature_numbering.fit(examples) followed by feature_numbering.transform(examples).
feature_numbering = DictVectorizer(sort=True)
X = feature_numbering.fit_transform(examples)

# Report the (rows, columns) shape of the resulting matrix.
print("Features as {} matrix.".format(X.shape))

Features as (2772, 38) matrix.


In [71]:
# Base seed for every randomized step in this notebook (data splits and,
# with a per-model offset, the decision trees) so that runs are repeatable.
RANDOM_SEED = 999_999

In [67]:
# Labels as a numpy array so they can be split alongside the feature matrix.
y = np.array(ys)

# Stage 1: hold out the test set. 75% of the rows go to the combined
# train+validation pool (X_tv / y_tv); the remainder becomes the test set.
X_tv, X_test, y_tv, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    shuffle=True,
    random_state=RANDOM_SEED,
)

# Stage 2: carve the pool into training (66%) and validation (the rest).
# The same seed is reused so both stages are reproducible.
X_train, X_vali, y_train, y_vali = train_test_split(
    X_tv,
    y_tv,
    train_size=0.66,
    shuffle=True,
    random_state=RANDOM_SEED,
)

In [69]:
# Keyword arguments shared by every DecisionTreeClassifier built below;
# they are unpacked into the constructor with **params.
params = dict(
    criterion="gini",
    splitter="best",
    max_depth=5,
)

In [70]:
# Number of models to train, each one with a different random seed.
N_MODELS = 100
# Number of samplings performed for a single model.
# NOTE(review): presumably bootstrap resamples (sklearn.utils.resample is
# imported); the code that uses this constant is not shown here -- confirm.
N_SAMPLES = 100

In [78]:
# Measure how much validation accuracy varies with the tree's random seed
# alone: the training data and hyperparameters stay fixed, and only
# random_state changes from one model to the next.
seed_based_accuracies = []
for randomness in range(N_MODELS):
    # ** unpacks a dictionary into the input of a function call
    # Each model gets its own seed, offset from the shared base seed.
    f_seed = DecisionTreeClassifier(random_state = RANDOM_SEED + randomness, **params)
    f_seed.fit(X_train, y_train)
    # score() returns the mean accuracy on the validation split.
    seed_based_accuracies.append(f_seed.score(X_vali, y_vali))