In [15]:
f = open("data/philosophy_data.csv", "r").read()

In [16]:
lines = f.split("\n")

In [45]:
import re
from collections import defaultdict
from tqdm import tqdm

PATTERN = r',(?=(?:[^"]*"[^"]*")*[^"]*$)'

attr_handler: dict[str, callable] = {
    "title": lambda x: str(x).lower(),
    "author": lambda x: str(x).lower(),
    "school": lambda x: str(x).lower(),
    "sentence_spacy": lambda x: str(x.replace('"', "").strip()),
    "sentence_str":  lambda x: str(x.replace('"', "").strip()),
    "original_publication_date": int,
    "corpus_edition_date": int,
    "sentence_length": int,
    "sentence_lowered": lambda x: str(x.replace('"', "").strip()),
    "tokenized_txt": lambda x: list(x[2:-2].replace("\'", "").split(", ")),
    "lemmatized_str": lambda x: str(x[1:-1])
}

def split_lines(lines):
    headers = lines[0].split(",")
    jsonl = []
    for line in tqdm(lines[1:]):
        split_line = re.split(PATTERN, line)
        if len(headers) != len(split_line):
            continue
        di = {header: attr_handler[header](split_line[i]) for i, header in enumerate(headers)}
        jsonl.append({
            "title": di["title"],
            "author": di["author"],
            "school": di["school"],
            "sentence": di["sentence_lowered"]
        })
    return jsonl

In [46]:
di_lines = split_lines(lines)

100%|██████████| 396428/396428 [00:50<00:00, 7906.68it/s] 


In [47]:
di_lines[0]

{'title': 'plato - complete works',
 'author': 'plato',
 'school': 'plato',
 'sentence': "what's new, socrates, to make you leave your usual haunts in the lyceum and spend your time here by the king archon's court?"}

In [48]:
import jsonlines
with jsonlines.open("./data/index.jsonl", mode="w") as writer:
    writer.write(di_lines)

In [64]:
import jsonlines
examples = None
with jsonlines.open("./data/index.jsonl", mode="r") as reader:
    examples = [li for li in reader][0]

In [50]:
from collections import defaultdict

def partition(examples, key):
    part = defaultdict(list)
    for example in examples:
        if key not in example:
            continue
        part[example[key]].append(example)
    return part

In [7]:
schools = partition(examples, "school")

In [18]:
def group(part, k):
    groups = []
    for g in part:
        print(g)
        for i in range(len(part[g]) - k):
            groups.append(part[g][i:i+k])
    return groups

In [51]:
from collections import defaultdict

di = defaultdict(set)
for ex in examples:
    di[ex["author"]].add(ex["school"])

In [53]:
sentences = [ex["sentence"] for ex in examples]

In [19]:
schools_len_k = group(schools, 5)

plato
aristotle
empiricism
rationalism
analytic
continental
phenomenology
german_idealism
communism
capitalism
stoicism
nietzsche
feminism
scholasticism
Kierkegaard
Hobbes
Existentialism
Daoism


In [21]:
".".join([s["sentence_lowered"] for s in schools_len_k[1]])

'urely you are not prosecuting anyone before the king archon as i am.the athenians do not call this a prosecution but an indictment, euthyphro..hat is this you say.someone must have indicted you, for you are not going to tell me that you have indicted someone else..ut someone else has indicted you'

In [66]:
import os
import jsonlines

def load_bo():
    fps = os.listdir("bo")
    fps = [fp for fp in fps if fp.split(".")[-1] == "jsonl" and "sentence" in fp.split("_")[-1]]
    text = []
    for fp in fps:
        with jsonlines.open(f"./bo/{fp}", "r") as reader:
            li = [ex for ex in reader]
            text.append(li)
    return text

In [80]:
def reformat_bo(books):
    rbks = []
    for book in books:
        rex = []
        title = str(book[0]).split(":")[-1].strip().lower()
        author = str(book[1]).split(":")[-1].strip().lower()
        school = list(di.get(author, ""))
        for b in book[2:]:
            if b["sentence"] in sentences: continue
            rex.append({
                "title": title,
                "author": author,
                "school": school[0] if school else "",
                "sentence": b["sentence"].lower()
            })
        rbks.append(rex)
    return rbks

In [81]:
text = load_bo()
rbks = reformat_bo(text)

In [83]:
rbks[1]

[{'title': 'the logic of hegel',
  'author': 'hegel',
  'school': 'german_idealism',
  'sentence': 'now, although this relation does undoubtedly belong to necessity, it forms only one aspect in the process of that category.'},
 {'title': 'the logic of hegel',
  'author': 'hegel',
  'school': 'german_idealism',
  'sentence': 'to determine which of these aspects is more essential than another, again, requires a further syllogism of this kind, which fixing on the single quality can with equal ease discover in it some aspect or consideration by which it can make good its claims to be considered necessary and important.'},
 {'title': 'the logic of hegel',
  'author': 'hegel',
  'school': 'german_idealism',
  'sentence': 'in this lies its formalism.'},
 {'title': 'the logic of hegel',
  'author': 'hegel',
  'school': 'german_idealism',
  'sentence': 'in this category of general agreement there was latent the deep-rooted perception, which does not escape even the least cultivated mind, that t

In [84]:
for rbk in rbks:
    examples.extend(rbk)

In [89]:
with jsonlines.open("./data/mixed.jsonl", mode="w") as writer:
    writer.write(examples)

In [88]:
examples[-1]

{'title': 'the birth of tragedy; or, hellenism and pessimism',
 'author': 'nietzsche',
 'school': 'nietzsche',
 'sentence': 'naught that is, is to be deducted, naught is dispensable; the phases of existence rejected by the christians and other nihilists are even of an infinitely higher order in the hierarchy of values than that which the instinct of decadence sanctions, yea durst _sanction._ to comprehend this _courage_ is needed, and, as a condition thereof, a surplus of _strength_: for precisely in degree as courage _dares_ to thrust forward, precisely according to the measure of strength, does one approach truth.'}