In [1]:

import random
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import typing as T
import re
import numpy as np
from dataclasses import dataclass

from shared import bootstrap_accuracy, bootstrap_auc, dataset_local_path, simple_boxplot

# Seed Python's built-in `random` module so runs that use it are repeatable.
# (Only `random` is seeded in this cell.)
RAND = 123456
random.seed(RAND)

# Read the JSON-lines dataset (one record per line) into a pandas DataFrame.
df: pd.DataFrame = pd.read_json(dataset_local_path("lit-wiki-2020.jsonl.gz"), lines=True)

In [29]:
# Regular expressions to grab parts of the text:
WORDS = re.compile(r"(\w+)")  # runs of word characters
NUMBERS = re.compile(r"(\d+)")  # runs of digits


def extract_features(row) -> T.Dict[str, T.Any]:
    """
    Given the title and body of a Wikipedia article,
    extract features that might be of use to the 'is literary' task.

    Parameters
    ----------
    row
        Record indexable by the keys "title", "body" and "page_rank"
        (for example a DataFrame row or a plain dict).

    Returns
    -------
    dict
        Named features:

        - "disambig": True if the lower-cased title contains "disambiguation".
        - "page_rank": the row's "page_rank" value, passed through unchanged.
        - "length": number of WORDS matches in the body.
        - "list_of": True if the lower-cased title starts with "list of".
        - "mean_n" / "std_n": mean and (population) standard deviation of the
          integers matched by NUMBERS in the body. These two keys are only
          present when the body contains at least one number.
    """
    title = row["title"].lower()
    body = row["body"]

    words = WORDS.findall(body)
    numbers = [int(x) for x in NUMBERS.findall(body)]

    # Build the dictionary once; the features that are always available go here.
    new_features: T.Dict[str, T.Any] = {
        "disambig": "disambiguation" in title,
        "page_rank": row["page_rank"],
        "length": len(words),
        # "18xx": sum(1 for x in numbers if 1800 < x <= 1900),
        "list_of": title.startswith('list of'),
    }
    # Statistics over an empty list are undefined, so the numeric summaries
    # are omitted entirely when no digits appear in the body.
    if numbers:
        new_features["mean_n"] = np.mean(numbers)
        new_features["std_n"] = np.std(numbers)

    return new_features


# Applying extract_features row by row yields one dictionary per article;
# json_normalize flattens those dictionaries into one column per feature name.
designed_f = pd.json_normalize(df.apply(extract_features, axis=1))

In [30]:
# Spot-check the engineered features for the row labelled 11.
print(designed_f.loc[11])

disambig          False
page_rank           0.0
length             2848
list_of            True
mean_n        51.912088
std_n        292.915494
Name: 11, dtype: object


In [12]:
# Print the position of every article whose title contains "List of"
# (case-sensitive substring match).
for position, title in enumerate(df['title']):
    if 'List of' in title:
        print(position)

11
65
255
290
316
325
345
353
515
550
628
647
665
692
790
897
901
1111
1116
1138
1180
1257
1324
1379
1442
1478
1525
1543
1558
1567
1595
1677
1708
1743
1761
1806
1857
