Importing and merging Seshat variables.

# Prep

In [1]:
# Fetch the seshat_api client source into ./seshat_api; the import cell below loads it from there.
# NOTE(review): presumably git refuses to clone when ./seshat_api already exists, so a re-run is a no-op — confirm.
!git clone https://github.com/Seshat-Global-History-Databank/seshat_api

Cloning into 'seshat_api'...
remote: Enumerating objects: 900, done.[K
remote: Counting objects: 100% (195/195), done.[K
remote: Compressing objects: 100% (130/130), done.[K
remote: Total 900 (delta 118), reused 120 (delta 64), pack-reused 705 (from 1)[K
Receiving objects: 100% (900/900), 1.47 MiB | 132.00 KiB/s, done.
Resolving deltas: 100% (560/560), done.
Updating files: 100% (48/48), done.


In [7]:
import ipykernel
import pandas as pd
import matplotlib
import scipy
import ollama
from seshat_api import SeshatAPI
from seshat_api.sc import PolityTerritories, PolityPopulations, SettlementHierarchies, AdministrativeLevels, ReligiousLevels, MilitaryLevels, FullTimeBureaucrats, MeritPromotions, SpecializedGovernmentBuildings, IrrigationSystems, DrinkingWaterSupplies, Markets, FoodStorageSites, Roads
from seshat_api.general import PolityDurations, PolityPeakYears, PolityDegreeOfCentralizations
from seshat_api.wf import *
from functools import reduce
import json

In [4]:
# Single API client shared by every variable download in this notebook.
SESHAT_API_URL = "https://seshat-db.com/api"
client = SeshatAPI(base_url=SESHAT_API_URL)

# Import Variables

In [10]:
# Short data-frame name -> Seshat API variable class.
# create_df() turns every key into a "<key>_df" DataFrame.
concentration_dict = dict(
    territory=PolityTerritories,
    pop=PolityPopulations,
    settlementlvl=SettlementHierarchies,
    adminlvl=AdministrativeLevels,
    religionlvl=ReligiousLevels,
    miltlvl=MilitaryLevels,
    ftburc=FullTimeBureaucrats,
    meritpromotion=MeritPromotions,
    govbuilding=SpecializedGovernmentBuildings,
    govirrigation=IrrigationSystems,
    govwater=DrinkingWaterSupplies,
    govmarket=Markets,
    govfood=FoodStorageSites,
    govroad=Roads,
    polityduration=PolityDurations,
    politypeak=PolityPeakYears,
    politycntr=PolityDegreeOfCentralizations,
)

In [13]:
def create_df(var_dict, client):
    """Download each Seshat variable and publish it as a module-level DataFrame.

    Parameters
    ----------
    var_dict : dict
        Maps a short name to a variable class. Each class is called with
        ``client`` and the resulting object's ``get_all()`` supplies the rows.
    client : object
        API client handed to every variable class.

    Returns
    -------
    None
        Frames are not returned. Each one is stored in this module's globals
        under ``"<short name>_df"`` and that name is printed.
    """
    namespace = globals()
    for short_name in var_dict:
        frame_name = f"{short_name}_df"
        records = var_dict[short_name](client).get_all()
        namespace[frame_name] = pd.DataFrame(records)
        print(frame_name)

In [16]:
# Download every variable in concentration_dict; the created "<name>_df" globals are echoed below.
create_df(var_dict=concentration_dict, client=client)

territory_df
pop_df
settlementlvl_df
adminlvl_df
religionlvl_df
miltlvl_df
ftburc_df
meritpromotion_df
govbuilding_df
govirrigation_df
govwater_df
govmarket_df
govfood_df
govroad_df
polityduration_df
politypeak_df
politycntr_df


# Merge Variables

In [22]:
# need to add prefixes since many columns (e.g., description) have the same name
# Column prefix -> raw frame created by create_df().
_unprefixed_frames = {
    "tr": territory_df,
    "pop": pop_df,
    "stl": settlementlvl_df,
    "adm": adminlvl_df,
    "rlg": religionlvl_df,
    "milt": miltlvl_df,
    "bur": ftburc_df,
    "mrt": meritpromotion_df,
    "bldg": govbuilding_df,
    "irrig": govirrigation_df,
    "wtr": govwater_df,
    "mrkt": govmarket_df,
    "food": govfood_df,
    "road": govroad_df,
    "dur": polityduration_df,
    "peak": politypeak_df,
    "cntr": politycntr_df
}

# Build renamed copies instead of overwriting `.columns` on the raw frames.
# Re-running this cell therefore always starts from the unprefixed columns
# and cannot produce doubled prefixes such as "tr_tr_id".
# "polity" is left unprefixed because it is the merge key.
concentration_prefixed = {
    prefix: frame.rename(
        columns=lambda col, prefix=prefix: col if col == "polity" else f"{prefix}_{col}"
    )
    for prefix, frame in _unprefixed_frames.items()
}

In [25]:
# Snapshot each prefixed frame as "<prefix>_df.csv" in the working directory.
for prefix in concentration_prefixed:
    csv_name = prefix + "_df.csv"
    concentration_prefixed[prefix].to_csv(csv_name, index=False)

In [28]:
# polity column is json encoded, so it is frozen to a string before merging
def _freeze_polity(record):
    """Serialise one polity record to a JSON string with sorted keys."""
    return json.dumps(record, sort_keys=True)  # sort them just in case


for frame in concentration_prefixed.values():
    frame["polity"] = frame["polity"].apply(_freeze_polity)

In [34]:
# outer join. in some dfs there are multiple values for polity (for example, territory has multiple values depending on the year)
def _merge_on_polity(left, right):
    """Outer-join two frames on the frozen polity key."""
    return pd.merge(left, right, on=["polity"], how="outer")


concentration_df = reduce(_merge_on_polity, concentration_prefixed.values())
print(concentration_df.columns.tolist())

# reminder: cross-check some rows with original dfs to verify merge was okay, especially for polity territory

['tr_id', 'polity', 'tr_year_from', 'tr_year_to', 'tr_tag', 'tr_is_disputed', 'tr_is_uncertain', 'tr_name', 'tr_polity_territory_from', 'tr_polity_territory_to', 'tr_comment', 'tr_description', 'pop_id', 'pop_year_from', 'pop_year_to', 'pop_tag', 'pop_is_disputed', 'pop_is_uncertain', 'pop_name', 'pop_polity_population_from', 'pop_polity_population_to', 'pop_comment', 'pop_description', 'stl_id', 'stl_year_from', 'stl_year_to', 'stl_tag', 'stl_is_disputed', 'stl_is_uncertain', 'stl_name', 'stl_settlement_hierarchy_from', 'stl_settlement_hierarchy_to', 'stl_comment', 'stl_description', 'adm_id', 'adm_year_from', 'adm_year_to', 'adm_tag', 'adm_is_disputed', 'adm_is_uncertain', 'adm_name', 'adm_administrative_level_from', 'adm_administrative_level_to', 'adm_comment', 'adm_description', 'rlg_id', 'rlg_year_from', 'rlg_year_to', 'rlg_tag', 'rlg_is_disputed', 'rlg_is_uncertain', 'rlg_name', 'rlg_religious_level_from', 'rlg_religious_level_to', 'rlg_comment', 'rlg_description', 'milt_id', 'mi

In [37]:
# convert polity back to json
# Only strings are decoded, so re-running this cell after the column has
# already been converted is a no-op instead of a TypeError from json.loads.
concentration_df["polity"] = concentration_df["polity"].apply(
    lambda value: json.loads(value) if isinstance(value, str) else value
)

In [55]:
# polity json needs to be extracted into individual columns
# One column per polity field, row-aligned with concentration_df and joined on the index.
polity_fields = pd.json_normalize(concentration_df["polity"])
concenteration_expanded = concentration_df.join(polity_fields)
concenteration_expanded

Unnamed: 0,tr_id,polity,tr_year_from,tr_year_to,tr_tag,tr_is_disputed,tr_is_uncertain,tr_name,tr_polity_territory_from,tr_polity_territory_to,...,cntr_is_uncertain,cntr_name,cntr_degree_of_centralization,cntr_comment,cntr_description,end_year,id,long_name,name,start_year
0,,,,,,,,,,,...,,,,,,,,,,
1,711.0,"{'end_year': -10, 'id': 126, 'long_name': 'Ind...",,,TRS,False,False,Polity_territory,105226.0,105226.0,...,False,Polity_degree_of_centralization,nominal,,The two and a half centuries between Diodotus...,-10.0,126.0,Indo-Greek Kingdom,pk_indo_greek_k,-180.0
2,682.0,"{'end_year': -100, 'id': 526, 'long_name': 'Mo...",,,TRS,False,False,Polity_territory,5000.0,5000.0,...,False,Polity_degree_of_centralization,unitary state,,The establishment of a confederation (althoug...,-100.0,526.0,Monte Alban Late I,mx_monte_alban_1_late,-300.0
3,682.0,"{'end_year': -100, 'id': 526, 'long_name': 'Mo...",,,TRS,False,False,Polity_territory,5000.0,5000.0,...,False,Polity_degree_of_centralization,confederated state,,The establishment of a confederation (althoug...,-100.0,526.0,Monte Alban Late I,mx_monte_alban_1_late,-300.0
4,,"{'end_year': -1000, 'id': 448, 'long_name': 'A...",,,,,,,,,...,False,Polity_degree_of_centralization,quasi-polity,,,-1000.0,448.0,Atlantic Complex,fr_atlantic_complex,-2200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2519,775.0,"{'end_year': 999, 'id': 287, 'long_name': 'Sam...",900.0,900.0,TRS,False,False,Polity_territory,600000.0,600000.0,...,False,Polity_degree_of_centralization,unitary state,,"""Compared to the Tahirids, the Samanids were a...",999.0,287.0,Samanid Empire,uz_samanid_emp,819.0
2520,775.0,"{'end_year': 999, 'id': 287, 'long_name': 'Sam...",900.0,900.0,TRS,False,False,Polity_territory,600000.0,600000.0,...,False,Polity_degree_of_centralization,unitary state,,"""Compared to the Tahirids, the Samanids were a...",999.0,287.0,Samanid Empire,uz_samanid_emp,819.0
2521,776.0,"{'end_year': 999, 'id': 287, 'long_name': 'Sam...",930.0,930.0,TRS,False,False,Polity_territory,2500000.0,2500000.0,...,False,Polity_degree_of_centralization,unitary state,,"""Compared to the Tahirids, the Samanids were a...",999.0,287.0,Samanid Empire,uz_samanid_emp,819.0
2522,776.0,"{'end_year': 999, 'id': 287, 'long_name': 'Sam...",930.0,930.0,TRS,False,False,Polity_territory,2500000.0,2500000.0,...,False,Polity_degree_of_centralization,unitary state,,"""Compared to the Tahirids, the Samanids were a...",999.0,287.0,Samanid Empire,uz_samanid_emp,819.0


In [58]:
# Save the merged table to the working directory (the row index is written too).
concenteration_expanded.to_csv(path_or_buf="concentration.csv")

# Add Regions

In [199]:
# polities with region
# seshat polities with normalized region jsons (gcolab notebook)
polity_normal_path = "/work/ThesisP1/data/polity_normal.csv"
polity_normal = pd.read_csv(polity_normal_path)

In [None]:
# Remove the stray index column (presumably left by an earlier to_csv round trip), if present.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
polity_normal.drop(columns=["Unnamed: 0"], errors="ignore", inplace=True)

In [208]:
# Prefix ambiguous column names with the entity they describe (polity_, nga_, seshat_).
polity_column_renames = {
    "general_description": "polity_general_description",
    "shapefile_name": "polity_shapefile_name",
    "unreliable_instability_events": "polity_unreliable_instability_events",
    "id_nga": "nga_id",
    "name_nga": "nga_name",
    "subregion": "nga_subregion",
    "longitude": "nga_longitude",
    "latitude": "nga_latitude",
    "capital_city": "nga_capital_city",
    "fao_country": "nga_fao_country",
    "world_region": "nga_world_region",
    "id_region": "seshat_region_id",
    "name_region": "seshat_region_name",
    "subregions_list": "seshat_subregions_list",
    "mac_region": "seshat_mac_region",
}
polity_normal.rename(columns=polity_column_renames, inplace=True)

In [22]:
# Display polity_normal for a visual check of the renamed columns.
polity_normal

Unnamed: 0,id,name,start_year,end_year,long_name,polity_tag,polity_general_description,polity_shapefile_name,polity_unreliable_instability_events,nga_id,...,nga_longitude,nga_latitude,nga_capital_city,nga_code,nga_fao_country,nga_world_region,seshat_region_id,seshat_region_name,seshat_subregions_list,seshat_mac_region
0,132,iq_abbasid_cal_1,750,946,Abbasid Caliphate I,LEGACY,"In 750 CE, following a revolt, Abbasid rulers ...",,False,8.0,...,44.420000,32.470000,Babylon (Hillah),IQ,Iraq,Southwest Asia,62,Mesopotamia,"Iraq, Kuwait",11
1,484,iq_abbasid_cal_2,1191,1258,Abbasid Caliphate II,LEGACY,The Second Abbasid Period (1191-1258 CE) was m...,,False,8.0,...,44.420000,32.470000,Babylon (Hillah),IQ,Iraq,Southwest Asia,62,Mesopotamia,"Iraq, Kuwait",11
2,107,ir_achaemenid_emp,-550,-331,Achaemenid Empire,LEGACY,The Achaemenid Empire was established by Cyrus...,,False,9.0,...,48.235564,32.382851,Susa (Shush),IR,Iran,Southwest Asia,45,Iran,Iran,11
3,637,so_adal_sultanate,1375,1543,Adal Sultanate,POL_AFR_EAST,The Adal Sultanate was one of the earliest Isl...,,False,,...,,,,,,,2,East Africa,"Tanzania, Burundi, Uganda, So Sudan, Somalia, ...",2
4,872,tn_aghlabid_dyn,800,908,Aghlabid Dynasty,OTHER_TAG,,Aghlabid Dynasty,False,,...,,,,,,,3,Maghreb,From Morocco to Libya,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,227,et_zagwe,1137,1269,Zagwe,LEGACY,,,False,,...,,,,,,,2,East Africa,"Tanzania, Burundi, Uganda, So Sudan, Somalia, ...",2
858,231,dz_zayyanid_dyn,1235,1509,Zayyanid Berber Kingdom,LEGACY,,,False,,...,,,,,,,5,Sahel,"Mauritania, Mali, Burkina Faso, Niger, Chad (A...",2
859,222,tn_zirid_dyn,973,1148,Zirids,LEGACY,,,False,,...,,,,,,,3,Maghreb,From Morocco to Libya,2
860,444,mn_zungharian_emp,1670,1757,Zungharian Empire,LEGACY,"The Zungharian polity was, according to Atwood...",,False,24.0,...,102.845486,47.200757,Karakorum,MN,Mongolia,Central Eurasia,9,Mongolia,"Mongolia, Inner Mongolia, the steppe part of M...",3


In [25]:
# Persist the table back to the same file it was loaded from.
# The row index is written as well (no index=False), so it comes back as an
# "Unnamed: 0" column the next time the file is read.
polity_normal.to_csv(path_or_buf="/work/ThesisP1/data/polity_normal.csv")

In [121]:
# merge target vars with polity-region data
# Outer join on the polity identity columns present in both frames.
polity_key_cols = ["name", "start_year", "end_year", "long_name", "id"]
power_raw = polity_normal.merge(concenteration_expanded, how="outer", on=polity_key_cols)

In [124]:
# Drop the raw polity dict and any stray CSV index column.
# errors="ignore" keeps the cell runnable when "Unnamed: 0" was already removed
# from polity_normal before the merge, or when this cell is executed a second time.
power_raw.drop(columns=["polity", "Unnamed: 0"], errors="ignore", inplace=True)

# Move every column whose name ends in "_name" to the right-hand side of the table.
# NOTE(review): the suffix test also matches "long_name", "nga_name" and
# "seshat_region_name" — confirm that moving those is intended.
source_cols = [col for col in power_raw.columns if col.endswith('_name')]
other_cols = [col for col in power_raw.columns if not col.endswith('_name')]
power_raw = power_raw[other_cols + source_cols]

# reminder: some further cleanup needed (merging _name cols?)
# reminder: add geometry column
# reminder: there are 236 rows that have missing values across all target_vars
# reminder: clean variable names

In [15]:
# Checkpoint the merged polity/variable table (without the row index).
power_raw.to_csv(path_or_buf="/work/ThesisP1/data/power_raw.csv", index=False)

In [18]:
# Reload the checkpoint written above so later cells can start from disk.
power_raw = pd.read_csv(filepath_or_buffer="/work/ThesisP1/data/power_raw.csv")

# Temporal Intervals - West Asia

In [202]:
# west asia polities: rows whose seshat_mac_region equals 11
is_west_asia = polity_normal["seshat_mac_region"] == 11
polity_normal_wa = polity_normal[is_west_asia]

# Temporal Intervals

In [208]:
# this block includes modified chatgpt-generated code

# Step 1: Prepare the data
polstart = polity_normal_wa[["start_year", "seshat_region_name"]].dropna().sort_values("start_year")
polstart_periods = [-9600, -4200, -2300, 200, 2000]

# Step 2: Bin start_year into custom periods
# Bins are right-closed, e.g. (-9600, -4200]. A start_year of exactly -9600, or
# later than 2000, falls outside every bin, gets a NaN period and is therefore
# left out of the groupby below.
categories = pd.cut(polstart["start_year"], bins=polstart_periods, right=True)
polstart["period"] = categories

# Step 3: Per (region, period), count polities and average the gap between
# consecutive start years.
interval_sum = {}

# observed=True: "period" is categorical, so ask explicitly for only the
# (region, period) pairs that contain polities. This avoids depending on the
# pandas default for `observed` and guarantees `years` is never empty.
for (region, period), group in polstart.groupby(["seshat_region_name", "period"], observed=True):
    years = group["start_year"].sort_values()
    intervals = years.diff().dropna()
    polity_count = len(years)

    if len(intervals) > 0:
        mean_interval = round(intervals.mean())
        note = f"{polity_count} ({mean_interval})"
    else:
        # A single polity has no interval; reset explicitly so a value from a
        # previous iteration can never leak into this row.
        mean_interval = None
        only_year = int(years.iloc[0])
        note = f"One polity in {only_year}"

    interval_sum[(region, period)] = {
        'mean': mean_interval,
        'polity_count': polity_count,
        'Polity Count (Average Interval)': note
    }

# Create summary DataFrame
interval_sum_df = pd.DataFrame.from_dict(interval_sum, orient='index')
interval_sum_df.index.names = ['Seshat Region', 'Period']
interval_sum_df = interval_sum_df.reset_index()

interval_sum_df

Unnamed: 0,Seshat Region,Period,mean,polity_count,Polity Count (Average Interval)
0,Anatolia-Caucasus,"(-9600, -4200]",500.0,4,4 (500)
1,Anatolia-Caucasus,"(-4200, -2300]",,1,One polity in -3000
2,Anatolia-Caucasus,"(-2300, 200]",151.0,13,13 (151)
3,Anatolia-Caucasus,"(200, 2000]",78.0,22,22 (78)
4,Arabia,"(-4200, -2300]",,1,One polity in -3500
5,Arabia,"(-2300, 200]",363.0,4,4 (363)
6,Arabia,"(200, 2000]",114.0,14,14 (114)
7,Iran,"(-9600, -4200]",500.0,8,8 (500)
8,Iran,"(-4200, -2300]",562.0,3,3 (562)
9,Iran,"(-2300, 200]",129.0,17,17 (129)


In [217]:
# Wide table: one row per region, one column per period; each cell holds the
# "count (mean interval)" note, or 'No polity' where a region has no entry.
interval_clean = interval_sum_df.pivot(index='Seshat Region', columns='Period', values='Polity Count (Average Interval)')
interval_clean = interval_clean.fillna('No polity')

# Human-readable period labels. The brackets mirror the right-closed bins:
# every period excludes its older bound and includes its younger one.
period_names = {
    "(-9600, -4200]": "(9600 BCE, 4200 BCE]",
    "(-4200, -2300]": "(4200 BCE, 2300 BCE]",
    "(-2300, 200]": "(2300 BCE, 200 CE]",
    "(200, 2000]": "(200 CE, 2000 CE]"
}
# The pivoted column labels are interval objects, not strings, so a plain
# rename(columns=period_names) never matches its string keys. Look labels up
# by their string form; unknown labels fall back to that string.
interval_clean.columns = [period_names.get(str(label), str(label)) for label in interval_clean.columns]

# Keep the region names: turn the 'Seshat Region' index into the first column
# instead of discarding it, so the exported table says which row is which.
interval_clean = interval_clean.reset_index()

In [220]:
# Export the presentation table to the working directory (without the row index).
interval_clean.to_csv(path_or_buf="interval_clean.csv", index=False)

In [None]:
# Reload the exported table so the styling cell can start from the CSV.
interval_clean = pd.read_csv(filepath_or_buffer="interval_clean.csv")

In [301]:
# this block includes modified chatgpt-generated code

# Typography shared by the caption, header cells and body cells.
serif_font = ('font-family', '"Times New Roman", Times, serif')
base_size = ('font-size', '16px')
# A non-zero CSS length needs a unit; a bare '8' is not a valid padding value.
cell_padding = ('padding', '8px')

table_styles = [
    {'selector': 'table', 'props': [
        ('border', 'none'),
        ('border-collapse', 'collapse')
    ]},
    {'selector': 'caption', 'props': [
        serif_font,
        base_size,
        ('font-style', 'italic'),
        ('text-align', 'left'),
        ('padding', '8px 0')
    ]},
    {'selector': 'thead th', 'props': [
        serif_font,
        base_size,
        ('font-weight', 'bold'),
        ('text-align', 'center'),
        ('border-bottom', '1px solid lightgray'),
        ('background-color', 'white'),
        cell_padding
    ]},
    # First header cell / first body cell are left-aligned; the rest are centred.
    {'selector': 'thead th:first-child', 'props': [
        ('text-align', 'left')
    ]},
    {'selector': 'tbody td', 'props': [
        serif_font,
        base_size,
        cell_padding,
        ('text-align', 'center'),
        ('border', 'none'),
        ('background-color', 'white')
    ]},
    {'selector': 'tbody td:first-child', 'props': [
        ('text-align', 'left')
    ]}
]

# Hide the numeric row index, apply the styles and attach the caption.
styled_table = (
    interval_clean.style
    .hide(axis='index')
    .set_table_styles(table_styles, overwrite=True)
    .set_caption("Polity Count and Average Interval (in years) between Polities, per Period")
)

styled_table


Seshat Region,"[9600 BCE, 4200 BCE)","(4200 BCE, 2300 BCE]","(2300 BCE, 200 CE]","(200 CE, 2000 CE]"
Anatolia-Caucasus,500 (4),One polity in -3000,151 (13),78 (22)
Arabia,No polity,One polity in -3500,363 (4),114 (14)
Iran,500 (8),562 (3),129 (17),172 (11)
Levant,No polity,No polity,372 (6),231 (4)
Mesopotamia,3500 (2),1100 (2),164 (11),604 (3)


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=5d7c084c-ccac-4a11-8892-8e2f6a069ef0' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>