In [22]:
import pandas as pd
from pycirclize import Circos
from pycirclize.parser import Matrix

import json
from pathlib import Path

import sys

sys.path.append("../scripts")

from shared import IVY_COLORS
from lcc import LCC_CLASSIFICATION, LCC_TO_AREA, LCC_TOP_CLASSIFICATION

In [23]:
# Read the lookup keyed by yarn color name from the shared resources folder.
yarn_colors_path = Path("../resources/yarn_colors.json")
with yarn_colors_path.open() as yarn_file:
    YARN_COLORS = json.load(yarn_file)

In [24]:
# Category name -> yarn color name. The color names are used as keys into
# YARN_COLORS when HEX_COLORS is built.
CATEGORY_COLORS = {
    "Archaeology, Genealogy, and Biography": "limeade heather",
    "Philosophy, Psychology, Religion": "tarragon",
    "World History": "clover",
    "Language and Literature": "ivy",
    "Pharmacology": "green tea heather",
    "General Care": "edamame",
    "Specialty Care": "grass",
    "Medicine and Public Health": "aurora heather",
    "Applied Arts": "celadon heather",
    "Performing Arts": "spearmint",
    "Architecture": "verdant heather",
    "Fine Arts": "douglas fir",
    "Life Sciences": "safflower",
    "Physical Sciences": "lichen",
    "Applied Sciences and Engineering": "caper",
    "General Science and Mathematics": "forest heather",
    "Information Science and Education": "alfalfa",
    "Geography, Anthropology, Recreation": "macaw",
    "Government and Law": "marina",
    "Business, Economics and Sociology": "shire heather",
}

# Resolve every category's yarn color name through YARN_COLORS, then layer the
# institution colors from IVY_COLORS on top (later entries win on key clashes).
HEX_COLORS = {
    **{name: YARN_COLORS[yarn] for name, yarn in CATEGORY_COLORS.items()},
    **IVY_COLORS,
}

In [None]:
# Load the combined records table and preview its first rows.
records_csv = Path("../data/derived/all_records.csv")
df = pd.read_csv(records_csv)
df.head()

In [26]:
# One (from, to, color) triple per IVY_COLORS entry, always anchored on Dartmouth.
link_cmap = [
    ("Dartmouth College", partner, color) for partner, color in IVY_COLORS.items()
]

In [27]:
# Institution names; "Dartmouth College" itself is not in the list.
# NOTE(review): no other cell shown here reads `institutions` — confirm whether
# it is still needed.
institutions = [
    "Brown University",
    "Columbia University Libraries",
    "Cornell University",
    "Duke University",
    "Harvard University",
    "Johns Hopkins University",
    "Massachusetts Institute of Technology",
    "Princeton University",
    "Stanford University",
    "University of Pennsylvania",
    "University of Chicago",
    "Yale University",
]

In [28]:
def make_chord(df: pd.DataFrame, order, cmap=None):
    """Render a chord diagram for a from/to table and return the figure.

    Parameters
    ----------
    df : pd.DataFrame
        From/to table handed to ``Matrix.parse_fromto_table``.
    order
        Sector ordering forwarded to ``Circos.initialize_from_matrix``.
    cmap : optional
        Sector colors forwarded as ``cmap``; falls back to ``HEX_COLORS``
        when omitted.

    Returns
    -------
    The figure produced by ``circos.plotfig()``.

    Notes
    -----
    Link colors always come from the notebook-level ``link_cmap``.
    """
    sector_colors = HEX_COLORS if cmap is None else cmap
    label_style = {
        "size": 8,
        "color": "black",
        "adjust_rotation": True,
        "orientation": "vertical",
    }
    link_style = {"ec": "black", "lw": 0.5, "direction": 1}

    circos = Circos.initialize_from_matrix(
        Matrix.parse_fromto_table(df),
        start=-265,
        end=95,
        space=5,
        r_lim=(93, 100),
        cmap=sector_colors,
        link_cmap=link_cmap,
        order=order,
        label_kws=label_style,
        link_kws=link_style,
    )
    return circos.plotfig()

## Clean call numbers


In [29]:
# Drop records without a call number, then keep only those whose first two
# characters satisfy ``str.isupper()``.
df = df.dropna(subset="CallNumber")

starts_upper = df["CallNumber"].map(lambda call_number: call_number[:2].isupper())
df = df[starts_upper]

In [30]:
def get_class(lcc: str) -> "str | None":
    """Look up the top-level class for a call number.

    The first character of ``lcc`` is used as the key into
    ``LCC_TOP_CLASSIFICATION``.

    Returns
    -------
    str or None
        The mapped value, or ``None`` when the key is not in the mapping.
    """
    # Slicing (rather than indexing) yields "" for an empty call number, so the
    # lookup falls through to ``.get``'s default instead of raising IndexError.
    return LCC_TOP_CLASSIFICATION.get(lcc[:1])


def get_area(lcc: str) -> "str | None":
    """Look up the broad area for a call number.

    The first character of ``lcc`` is used as the key into ``LCC_TO_AREA``.

    Returns
    -------
    str or None
        The mapped value, or ``None`` when the key is not in the mapping.
    """
    # Slicing (rather than indexing) yields "" for an empty call number, so the
    # lookup falls through to ``.get``'s default instead of raising IndexError.
    return LCC_TO_AREA.get(lcc[:1])


def get_subclass(lcc: str) -> "str | None":
    """Look up the subclass for a call number.

    If the first two characters are both alphabetic they form the key into
    ``LCC_CLASSIFICATION``; otherwise, if the first character is alphabetic, it
    alone is the key. There is no fallback from a two-letter key to the
    one-letter key when the two-letter key is missing from the mapping.

    Returns
    -------
    str or None
        The mapped value, or ``None`` when the key is unknown, the call number
        does not start with a letter, or the call number is empty.
    """
    prefix = lcc[:2]
    if prefix.isalpha():
        return LCC_CLASSIFICATION.get(prefix)
    # Slice instead of index so an empty string yields "" (not IndexError);
    # "".isalpha() is False, which routes it to the final ``None``.
    initial = lcc[:1]
    if initial.isalpha():
        return LCC_CLASSIFICATION.get(initial)
    return None


# Derive Area, Class and Subclass from each record's call number, then discard
# rows without a Subclass and everything in the "Reference Works" area.
call_numbers = df["CallNumber"]
df = df.assign(
    Area=call_numbers.apply(get_area),
    Class=call_numbers.apply(get_class),
    Subclass=call_numbers.apply(get_subclass),
)
df = df.dropna(subset="Subclass")
df = df[df["Area"] != "Reference Works"]

In [31]:
# Build categories
# Areas whose records are folded into another area: old Area -> new Area.
areas_to_merge = {
    "Language and Literature": "Humanities",
}
# Regrouping rules: Area -> new Class name -> list of rules.
# Each rule matches rows whose column `type` ("Class" or "Subclass") equals
# `name`; matching rows inside that Area are given the new Class name.
categories = {
    "Medicine": {
        "Medicine and Public Health": [
            {"name": "Public aspects of medicine", "type": "Subclass"},
            {"name": "Medicine (General)", "type": "Subclass"},
        ],
        "Pharmacology": [
            {"name": "Therapeutics. Pharmacology", "type": "Subclass"},
            {"name": "Pharmacy and materia medica", "type": "Subclass"},
        ],
        "Specialty Care": [
            {"name": "Dermatology", "type": "Subclass"},
            {"name": "Gynecology and obstetrics", "type": "Subclass"},
            {"name": "Ophthalmology", "type": "Subclass"},
            {"name": "Otorhinolaryngology", "type": "Subclass"},
            {"name": "Pathology", "type": "Subclass"},
            {"name": "Surgery", "type": "Subclass"},
        ],
        "General Care": [
            {"name": "Dentistry", "type": "Subclass"},
            {"name": "Internal medicine", "type": "Subclass"},
            {"name": "Nursing", "type": "Subclass"},
            {"name": "Pediatrics", "type": "Subclass"},
        ],
    },
    "Music and Arts": {
        "Fine Arts": [
            {"name": "Fine Arts", "type": "Subclass"},
            {"name": "Painting", "type": "Subclass"},
            {"name": "Sculpture", "type": "Subclass"},
            {"name": "Drawing, Design", "type": "Subclass"},
            {"name": "Print Media", "type": "Subclass"},
        ],
        "Architecture": [
            {"name": "Architecture", "type": "Subclass"},
        ],
        "Performing Arts": [
            {"name": "Music and Books on Music", "type": "Class"},
        ],
        "Applied Arts": [
            {"name": "Decorative Arts", "type": "Subclass"},
            {"name": "Arts in general", "type": "Subclass"},
        ],
    },
    "STEM": {
        "Physical Sciences": [
            {"name": "Astronomy", "type": "Subclass"},
            {"name": "Chemistry", "type": "Subclass"},
            {"name": "Geology", "type": "Subclass"},
            {"name": "Physics", "type": "Subclass"},
        ],
        "Life Sciences": [
            {"name": "Botany", "type": "Subclass"},
            {"name": "Human anatomy", "type": "Subclass"},
            {"name": "Natural history. Biology", "type": "Subclass"},
            {"name": "Physiology", "type": "Subclass"},
            {"name": "Microbiology", "type": "Subclass"},
            {"name": "Zoology", "type": "Subclass"},
        ],
        "Applied Sciences and Engineering": [
            {"name": "Technology", "type": "Class"},
            {"name": "Agriculture", "type": "Class"},
            {"name": "Naval Science", "type": "Class"},
            {"name": "Military Science", "type": "Class"},
        ],
        "General Science and Mathematics": [
            {"name": "Science (General)", "type": "Subclass"},
            {"name": "Mathematics", "type": "Subclass"},
        ],
    },
    "Social Sciences": {
        "Information Science and Education": [
            {
                "name": "Bibliography. Library Science. Information Resources (General)",
                "type": "Class",
            },
            {"name": "Education", "type": "Class"},
        ],
        "Government and Law": [
            {"name": "Law", "type": "Class"},
            {"name": "Political Science", "type": "Class"},
        ],
        "Business, Economics and Sociology": [
            {"name": "Social Sciences", "type": "Class"},
        ],
        "Geography, Anthropology, Recreation": [
            {"name": "Geography, Anthropology, Recreation", "type": "Subclass"},
        ],
    },
    "Humanities": {
        "Language and Literature": [
            {"name": "Language and Literature", "type": "Class"},
        ],
        "World History": [
            {"name": "World History", "type": "Class"},
            {
                "name": "History of the Americas",
                "type": "Class",
            },
        ],
        # No rules here: nothing is rewritten to this Class name, so rows keep
        # whatever Class they already carry.
        "Philosophy, Psychology, Religion": [],
        "Archaeology, Genealogy, and Biography": [
            {"name": "Auxiliary Sciences of History", "type": "Class"},
        ],
    },
}


# Fold every merged area into its target area.
for source_area, target_area in areas_to_merge.items():
    in_source_area = df["Area"] == source_area
    df.loc[in_source_area, "Area"] = target_area

# Rewrite Class rule by rule. Class is updated in place, so a rule of type
# "Class" is evaluated against values already rewritten by earlier rules.
for area, mapping in categories.items():
    for new_class, rules in mapping.items():
        for rule in rules:
            in_area = df["Area"] == area
            matches_rule = df[rule["type"]] == rule["name"]
            df.loc[in_area & matches_rule, "Class"] = new_class

In [32]:
# Break down some very broad classes into subclasses.
# Maps Area -> list of Class names whose rows should be grouped by Subclass
# rather than by Class. Every list is currently empty, so GroupingVar ends up
# identical to Class.
USE_SUBCLASS = {
    "Humanities": [],
    "Language and Literature": [],
    "Medicine": [],
    "Music and Arts": [],
    # Key must match the Area value exactly; it previously carried a stray
    # trailing colon and so could never match any row.
    "Reference Works": [],
    "STEM": [],
    "Social Sciences": [],
}

df["GroupingVar"] = df["Class"]
for area, classes in USE_SUBCLASS.items():
    for class_ in classes:
        # Build the row mask once and reuse it on both sides of the assignment.
        use_subclass = (df.Area == area) & (df.Class == class_)
        df.loc[use_subclass, "GroupingVar"] = df.loc[use_subclass, "Subclass"]

In [None]:
# Count the records whose "To" is Dartmouth, per (From, Area, GroupingVar).
# GroupingVar is then renamed to "To" so it becomes the receiving endpoint.
borrowed = (
    df.loc[df["To"] == "Dartmouth College"]
    .groupby(["From", "Area", "GroupingVar"])
    .size()
    .reset_index(name="N")
    .rename(columns={"GroupingVar": "To"})
)

borrowed

In [None]:
# 25th / 50th / 75th percentile of the borrowed counts, as a plain list.
quartiles = borrowed["N"].quantile(q=[0.25, 0.5, 0.75]).tolist()

print(f"{quartiles = }")


def get_quartile_idx(N, cutoffs=None):
    """Return the 1-based bucket that ``N`` falls into.

    Parameters
    ----------
    N : number
        Value to classify.
    cutoffs : sequence of numbers, optional
        Non-empty, ascending, inclusive upper bounds of buckets
        ``1 .. len(cutoffs)``. When omitted, the notebook-level ``quartiles``
        list is looked up at call time, so the result depends on whichever
        cell assigned ``quartiles`` last.

    Returns
    -------
    int or None
        The position (starting at 1) of the first cut-off with
        ``N <= cutoff``; ``len(cutoffs) + 1`` when ``N`` is greater than the
        last cut-off; ``None`` when no comparison holds (e.g. ``N`` is NaN).
    """
    if cutoffs is None:
        cutoffs = quartiles
    for bucket, upper_bound in enumerate(cutoffs, start=1):
        if N <= upper_bound:
            return bucket
    if N > cutoffs[-1]:
        return len(cutoffs) + 1
    # Reached only when every comparison above is False (NaN input).
    return None


# Bucket every count into its quartile index.
borrowed["Quartile"] = borrowed["N"].map(get_quartile_idx)

In [None]:
# Rank counts within each (From, Area) group, largest count first.
# rank() defaults to method="average", which gives tied counts a fractional
# rank that astype("int") would silently truncate (three rows tied for first
# would all become rank 2). method="min" gives every tie the best whole rank.
borrowed["Rank"] = (
    borrowed.groupby(["From", "Area"])["N"]
    .rank(method="min", ascending=False)
    .astype("int")
)
borrowed

In [None]:
# Fix the column order, then sort ascending by institution, area and count.
borrowed_columns = ["From", "Area", "To", "N", "Quartile", "Rank"]
borrowed = borrowed.loc[:, borrowed_columns].sort_values(["From", "Area", "N"])
borrowed

In [None]:
# Make sure the output directory exists before anything is written into it.
Path("out").mkdir(parents=True, exist_ok=True)

for area, records in borrowed.groupby("Area"):
    # Sector order: "From" values in reverse alphabetical order, then "To" values.
    order = sorted(records.From.unique(), reverse=True) + sorted(records.To.unique())
    # Dropping Area, N and Rank leaves From, To and Quartile for the chord table.
    fig = make_chord(records.drop(columns=["Area", "N", "Rank"]), order=order)
    fig.savefig(f"out/chord-borrowed-{area}.png", dpi=300)
    records.to_csv(f"out/borrowed-{area}.csv", index=False)
    # NOTE(review): the title is set after savefig, so the saved PNG has no
    # title — confirm this is intended.
    fig.suptitle(area)

In [None]:
# Count the records whose "From" is Dartmouth, per (To, Area, GroupingVar).
# GroupingVar is then renamed to "From" so it becomes the sending endpoint.
lent = (
    df.loc[df["From"] == "Dartmouth College"]
    .groupby(["To", "Area", "GroupingVar"])
    .size()
    .reset_index(name="N")
    .rename(columns={"GroupingVar": "From"})
    .loc[:, ["From", "Area", "To", "N"]]
)
lent

In [None]:
# 25th / 50th / 75th percentile of the lent counts, as a plain list.
quartiles = lent["N"].quantile(q=[0.25, 0.5, 0.75]).tolist()

print(f"{quartiles = }")


def get_quartile_idx(N, cutoffs=None):
    """Return the 1-based bucket that ``N`` falls into.

    Parameters
    ----------
    N : number
        Value to classify.
    cutoffs : sequence of numbers, optional
        Non-empty, ascending, inclusive upper bounds of buckets
        ``1 .. len(cutoffs)``. When omitted, the notebook-level ``quartiles``
        list is looked up at call time, so the result depends on whichever
        cell assigned ``quartiles`` last.

    Returns
    -------
    int or None
        The position (starting at 1) of the first cut-off with
        ``N <= cutoff``; ``len(cutoffs) + 1`` when ``N`` is greater than the
        last cut-off; ``None`` when no comparison holds (e.g. ``N`` is NaN).
    """
    if cutoffs is None:
        cutoffs = quartiles
    for bucket, upper_bound in enumerate(cutoffs, start=1):
        if N <= upper_bound:
            return bucket
    if N > cutoffs[-1]:
        return len(cutoffs) + 1
    # Reached only when every comparison above is False (NaN input).
    return None


# Bucket every count into its quartile index.
lent["Quartile"] = lent["N"].map(get_quartile_idx)

In [None]:
# Rank counts within each (To, Area) group, largest count first.
# rank() defaults to method="average", which gives tied counts a fractional
# rank that astype("int") would silently truncate (three rows tied for first
# would all become rank 2). method="min" gives every tie the best whole rank.
lent["Rank"] = (
    lent.groupby(["To", "Area"])["N"]
    .rank(method="min", ascending=False)
    .astype("int")
)
lent

In [None]:
# Fix the column order, then sort ascending by institution, area and count.
lent_columns = ["From", "Area", "To", "N", "Quartile", "Rank"]
lent = lent.loc[:, lent_columns].sort_values(["To", "Area", "N"])
lent

In [None]:
# Make sure the output directory exists before anything is written into it.
Path("out").mkdir(parents=True, exist_ok=True)

for area, records in lent.groupby("Area"):
    # Sector order: "From" values alphabetically, then "To" values alphabetically.
    order = sorted(records.From.unique()) + sorted(records.To.unique())
    # Dropping Area, N and Rank leaves From, To and Quartile for the chord table.
    fig = make_chord(
        records.drop(columns=["Area", "N", "Rank"]),
        order=order,
    )
    records.to_csv(f"out/lent-{area}.csv", index=False)
    fig.savefig(f"out/chord-lent-{area}.png", dpi=300)
    # NOTE(review): the title is set after savefig, so the saved PNG has no
    # title — confirm this is intended.
    fig.suptitle(area)