In [59]:

import pandas as pd
import geopandas as gpd
import geojson
import plotly.io as pio
import os
import json
import re
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import openai
from openai import OpenAI
# Read the key from the OPENAI_API_KEY environment variable when it is set, so
# a real secret never has to be pasted into (and saved with) the notebook.
# The original placeholder is kept as the fallback value.
client = OpenAI(
    api_key=os.environ.get('OPENAI_API_KEY', 'INSERT-YOUR-KEY')
)


In [60]:
# Load the semicolon-separated export without header inference, then promote
# the first row to column names and keep the remaining rows as data.
bog_mummies = pd.read_csv('bog_mummies.csv', sep=';', header=None)
bog_mummies = (
    pd.DataFrame(data=bog_mummies.iloc[1:].values, columns=bog_mummies.iloc[0])
    # keep the first row for every value of 'No.'
    .drop_duplicates(subset='No.')
    # coordinates arrive as text because the header row shared their columns
    .astype({'Latitude': float, 'Longitude': float})
    # missing names fall back to the bog name, then to the nearest location
    .assign(
        Name=lambda frame: frame['Name']
        .fillna(frame['Bog '])
        .fillna(frame['Location (nearest village or city)'])
    )
    # renumber rows 0..n-1; the previous row labels are kept in an 'index' column
    .reset_index()
)


In [63]:
# In which countries were the bog mummies found?
# rename_axis(...) + reset_index(name=...) pin the column names to 'index' and
# 'Country' on every pandas version. A bare reset_index() yields
# 'Country'/'count' on pandas >= 2.0, which makes the lookups below raise KeyError.
value_counts_df = (
    bog_mummies['Country']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='Country')
)

fig = go.Figure()

# One bar per country, bar height = number of finds
fig.add_trace(
    go.Bar(
        x=value_counts_df['index'],
        y=value_counts_df['Country'],
        name='Bog mummies in Northern Europe',
        marker_color='#42973A',
    )
)

fig.update_layout(title='Bog bodies in Northern Europe',
                  xaxis_title='Countries',
                  yaxis_title='Number of bog mummies found',
                  template="plotly_white",
                 )


fig.show()
file_path = 'country_distribution.svg'
pio.write_image(fig, file_path)

In [62]:
# Distinct raw values recorded in the 'Sex' column
bog_mummies['Sex'].unique()

array(['unknown', 'male', 'female', 'male ', 'mixed (but mainly men)',
       'mixed (both males and females)',
       'unknown (but possible dominated by males)', 'probably mixed',
       'probably mixed, but at least two males are represented',
       'mixed; both males and females are represented'], dtype=object)

### The 'Sex' column contains several free-text variants beyond female/male/unknown/mixed, so we use ChatGPT to clean up the data.


In [12]:
# Normalise the free-text 'Sex' column to one-word labels with ChatGPT.
# Many rows share the same raw value, so each distinct prompt is sent to the
# API only once and the reply is reused for every row that produces it.
gender = list()
gender_reply_by_prompt = dict()
for sex_value in bog_mummies['Sex']:
    user_prompt = f""" You are provided an input {sex_value}, output the gender defined in this string in one word. It can be either 'male' or 'female'. If NaN is provided, return NaN. If there are several genders, provide the word 'mixed' as an answer. """
    if user_prompt not in gender_reply_by_prompt:
        chatgpt_response = client.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=[
                {"role": "system", "content": """You are an LLM that understands human gender. Furthermore, you can recognise if several genders are provided. You understand that 'female' and 'male' are two distinctive genders, and 'mixed' is achieved when many women and men are considered. If you are unsure, provide 'unknown' as an answer."""},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=4096,
            temperature=0
        )
        gender_reply_by_prompt[user_prompt] = chatgpt_response.choices[0].message.content
    gender.append(gender_reply_by_prompt[user_prompt])
bog_mummies['gender'] = gender
# Show the ChatGPT label next to the raw value for a visual check
bog_mummies[['gender','Sex']].values

array([['unknown', 'unknown'],
       ['male', 'male'],
       ['male', 'male'],
       ['male', 'male'],
       ['male', 'male'],
       ['female', 'female'],
       ['male', 'male'],
       ['male', 'male'],
       ['female', 'female'],
       ['male', 'male'],
       ['unknown', 'unknown'],
       ['male', 'male'],
       ['unknown', 'unknown'],
       ['male', 'male'],
       ['female', 'female'],
       ['unknown', 'unknown'],
       ['male', 'male'],
       ['male', 'male'],
       ['male', 'male'],
       ['male', 'male'],
       ['unknown', 'unknown'],
       ['unknown', 'unknown'],
       ['male', 'male'],
       ['female', 'female'],
       ['male', 'male'],
       ['male', 'male'],
       ['male', 'male'],
       ['female', 'female'],
       ['female', 'female'],
       ['unknown', 'unknown'],
       ['female', 'female'],
       ['male', 'male'],
       ['unknown', 'unknown'],
       ['male', 'male'],
       ['male', 'male'],
       ['male', 'male'],
       ['female', 'femal

In [17]:
# Share of each ChatGPT-assigned gender label.
# rename_axis(...) + reset_index(name=...) pin the column names to 'index' and
# 'gender' on every pandas version. A bare reset_index() yields
# 'gender'/'count' on pandas >= 2.0 and breaks the px.pie call below.
sex_type_counts = (
    bog_mummies['gender']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='gender')
)
fig = px.pie(sex_type_counts, values='gender', names='index', title='ChatGPT Gender Distribution', color_discrete_sequence=px.colors.sequential.thermal, hole=.3)
fig.show()
# Written to its own file: the manually cleaned chart is saved as
# 'sex_distribution.svg', and sharing that name made one chart overwrite the other.
file_path = 'gender_chatgpt_distribution.svg'
pio.write_image(fig, file_path)

In [16]:
# Double-check the ChatGPT labels by cleaning the 'Sex' column manually.
# Exact-match mapping from each raw variant to its canonical label; values
# that are not listed are left unchanged.
sex_label_fixes = {
    'male ': 'male',
    'mixed (but mainly men)': 'mixed',
    'mixed (both males and females)': 'mixed',
    'probably mixed': 'mixed',
    'probably mixed, but at least two males are represented': 'mixed',
    'mixed; both males and females are represented': 'mixed',
    'unknown (but possible dominated by males)': 'unknown',
}
bog_mummies['Sex'] = bog_mummies['Sex'].replace(sex_label_fixes)

# rename_axis(...) + reset_index(name=...) pin the column names to 'index' and
# 'Sex' on every pandas version. A bare reset_index() yields 'Sex'/'count' on
# pandas >= 2.0 and breaks the px.pie call below.
sex_type_counts = (
    bog_mummies['Sex']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='Sex')
)
fig = px.pie(sex_type_counts, values='Sex', names='index', title='Gender Distribution', color_discrete_sequence=px.colors.sequential.thermal, hole=.3)
fig.show()
file_path = 'sex_distribution.svg'
pio.write_image(fig, file_path)

In [18]:
# Map the free-text 'Age' column to one-word age categories with ChatGPT.
# Each distinct prompt is sent only once and its reply reused, and
# temperature=0 is set (as in the other classification cells) so that equal
# inputs are not labelled differently by sampling.
age = list()
age_reply_by_prompt = dict()
for age_value in bog_mummies.Age:
    user_prompt = f" You are provided information about human age {age_value}, transform it into one word, depending on the value: 'mixed','child','adult','old','unknown'. Provide a single word as answer."
    if user_prompt not in age_reply_by_prompt:
        chatgpt_response = client.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=[
                {"role": "system", "content": """You are an LLM that understands human age and is able to transform numbers into age categories. If you are unsure, provide 'unknown' as an answer. """},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=4096,
            temperature=0
        )
        age_reply_by_prompt[user_prompt] = chatgpt_response.choices[0].message.content
    age.append(age_reply_by_prompt[user_prompt])
bog_mummies['age'] = age
# Show the ChatGPT category next to the raw value for a visual check
bog_mummies[['age','Age']].values


array([['adult', 'c. 18'],
       ['adult', 'c. 20-25'],
       ['adult', 'c. 25-35'],
       ['adult', '25-40'],
       ['adult', '25-40'],
       ['adult', '25-35'],
       ['adult', 'adult'],
       ['adult', 'adult'],
       ['adult', 'young adult'],
       ['adult', 'adult'],
       ['adult', 'adult'],
       ['adult', 'c. 40'],
       ['unknown', 'unknown'],
       ['child', '6-7'],
       ['unknown', "'youngish'"],
       ['adult', 'adult'],
       ['adult', 'adult'],
       ['adult', 'adult'],
       ['adult', 'adult'],
       ['old', "old' "],
       ['adult', 'young adult'],
       ['child', 'c. 12'],
       ['adult', 'middle-aged'],
       ['adult', '45-60'],
       ['adult', '20-30'],
       ['adult', 'adult'],
       ['adult', 'adult'],
       ['adult', 'adult '],
       ['adult', '19-22'],
       ['adult', 'adult'],
       ['adult', 'adult'],
       ['adult', 'young adult'],
       ['adult', 'young adult'],
       ['adult', 'young adult'],
       ['adult', '25-38'],
     

In [19]:
# Distinct age categories returned by ChatGPT
bog_mummies.age.unique()

array(['adult', 'unknown', 'child', 'old', 'mixed'], dtype=object)

In [21]:

# Share of each ChatGPT-assigned age category.
# rename_axis(...) + reset_index(name=...) pin the column names to 'index' and
# 'age' on every pandas version. A bare reset_index() yields 'age'/'count' on
# pandas >= 2.0 and breaks the px.pie call below.
age_chatgpt_counts = (
    bog_mummies['age']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='age')
)
fig = px.pie(age_chatgpt_counts, values='age', names='index', title='Age Distribution', color_discrete_sequence=px.colors.sequential.thermal, hole=.3)
fig.show()
file_path = 'age_distribution.svg'
pio.write_image(fig, file_path)

In [24]:
# Extract the earliest year mentioned in each 'Find year' entry with ChatGPT.
# Each distinct prompt is sent only once and its reply reused, and
# temperature=0 is set (as in the other extraction cells) so that equal
# inputs are not answered differently by sampling.
find_year = list()
year_reply_by_prompt = dict()
for find_year_value in bog_mummies['Find year']:
    user_prompt = f" You are provided a string {find_year_value}, output the earliest year found in this string. If NaN is provided, return NaN. Provide a single number as answer."
    if user_prompt not in year_reply_by_prompt:
        chatgpt_response = client.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=[
                {"role": "system", "content": """You are an LLM that understands numbers and years. Furthermore, you can recognise when year range is given instead of a single year. If you are unsure, provide 'unknown' as an answer."""},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=4096,
            temperature=0
        )
        year_reply_by_prompt[user_prompt] = chatgpt_response.choices[0].message.content
    find_year.append(year_reply_by_prompt[user_prompt])
bog_mummies['earliest_year'] = find_year
# Show the extracted year next to the raw value for a visual check
bog_mummies[['earliest_year','Find year']].values

array([['1998', '1998'],
       ['2011', '2011'],
       ['2005', '2005'],
       ['2003', '2003'],
       ['2003', '2003'],
       ['1978', '1978'],
       ['1821', '1821'],
       ['1953', '1953'],
       ['2005', '2005'],
       ['2006', '2006'],
       ['1952', '1952'],
       ['1929', '1929'],
       ['1989', '1989'],
       ['1950', '1950'],
       ['1845', '1845'],
       ['1954', '1954'],
       ['1824', '1824'],
       ['1946', '1946'],
       ['1827', '1827'],
       ['1939', '1939'],
       ['1929', '1929'],
       ['1936', '1936'],
       ['1969', '1969'],
       ['1960', '1960'],
       ['1955', '1955'],
       ['1962', '1962'],
       ['1838', '1838'],
       ['1840', '1840'],
       ['1881', '1881'],
       ['1878', '1878'],
       ['1880', '1880'],
       ['2012', '2012'],
       ['1983', '1983'],
       ['1987', '1987'],
       ['1972', '1972'],
       ['1958', '1958'],
       ['1967', '1967'],
       ['1984', '1984'],
       ['before 1911', 'before 1911'],
       ['19

In [25]:
# Turn the ChatGPT replies into numbers; anything unparsable becomes NaN.
bog_mummies['earliest_year'] = bog_mummies['earliest_year'].pipe(pd.to_numeric, errors='coerce')

# Bin the known years into 20 bins and keep only each bin's left edge.
counts, bins = np.histogram(bog_mummies['earliest_year'].dropna(), bins=20)
bins = bins[:-1]

# Order the bins from the smallest to the largest count.
sorted_indices = np.argsort(counts)
sorted_counts, sorted_bins = counts[sorted_indices], bins[sorted_indices]

# Walk through the thermal palette in step with each bin's rank in that order.
thermal_scale = px.colors.sequential.thermal
colors = [
    thermal_scale[int(rank / len(sorted_bins) * (len(thermal_scale) - 1))]
    for rank in range(len(sorted_bins))
]

# Bar chart of counts against the left bin edges, one colour per bar.
fig = px.bar(
    x=sorted_bins,
    y=sorted_counts,
    title='Years of Bog Bodies Discoveries',
    labels={'x': 'Year of Discovery', 'y': 'Number of Discoveries'},
)
fig.update_traces(marker_color=colors)

# Display the figure and export it as SVG.
fig.show()
file_path = 'earliest_year_distribution.svg'
fig.write_image(file_path)

In [30]:
# Reduce the free-text 'Assumed cause of death' column to one-word labels with
# ChatGPT. Many rows share the same raw value (e.g. 'unknown'), so each
# distinct prompt is sent to the API only once and the reply is reused.
reason_death = list()
death_reply_by_prompt = dict()
for cause_value in bog_mummies['Assumed cause of death']:
    user_prompt = f"""You are provided a string {cause_value} describing in a few words cause of death. Generate an output with the reason of death in one word. It can be either 'violent','natural','suicide','accident','mixed' or 'unknown'. """
    if user_prompt not in death_reply_by_prompt:
        chatgpt_response = client.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=[
                {"role": "system", "content": """You are an LLM that understands descriptions of human deaths. You are able to synthesize a one-word description from several key words. You understand that if the  death description contains the word 'unknown', it means the cause is unknown. If it contains the word 'violent',  it means the cause is violent, and so on. In cases you are unsure do not ask for more information but provide the word 'unknown' as answer instead."""},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=4096,
            temperature=0
        )
        death_reply_by_prompt[user_prompt] = chatgpt_response.choices[0].message.content
    reason_death.append(death_reply_by_prompt[user_prompt])
bog_mummies['reason_death'] = reason_death
# Show the ChatGPT label next to the raw value for a visual check
bog_mummies[['reason_death','Assumed cause of death']].values

array([['unknown', 'unknown'],
       ['violent', 'violent (cutting/stabbing)'],
       ['unknown', 'unknown'],
       ['violent', 'violent (multiple)'],
       ['violent', 'violent (blows to head)'],
       ['unknown', 'unknown'],
       ['suicide', 'violent (hanging/strangulation)'],
       ['unknown', 'unknown'],
       ['unknown', 'unknown'],
       ['unknown', 'unknown'],
       ['unknown', 'unknown'],
       ['unknown', 'unknown'],
       ['unknown', 'unknown'],
       ['unknown', 'unknown'],
       ['suicide', 'suicide'],
       ['unknown', 'unknown'],
       ['unknown', 'unknown'],
       ['unknown', 'unknown'],
       ['suicide', 'suicide'],
       ['unknown', 'unknown'],
       ['unknown', 'unknown'],
       ['unknown', 'unknown'],
       ['unknown', 'unknown'],
       ['unknown', 'unknown'],
       ['unknown', 'unknown'],
       ['suicide', 'suicide'],
       ['suicide', 'suicide'],
       ['violent', 'violent (cutting/stabbing)'],
       ['unknown', 'unknown'],
       ['sui

In [31]:
# Distinct cause-of-death labels returned by ChatGPT
bog_mummies['reason_death'].unique()

array(['unknown', 'violent', 'suicide', 'accident', 'natural', 'mixed',
       'Understood. Please provide the string describing the cause of death, and I will generate the output with the reason of death in one word as per your instructions.',
       'Understood. Please provide the string describing the cause of death, and I will generate the output with the reason of death in one word.'],
      dtype=object)

In [32]:
# ChatGPT sometimes answers with a full sentence instead of one of the
# requested labels. Matching those sentences verbatim is brittle because the
# wording can change between runs, so instead keep only the six labels the
# prompt asks for and map every other reply to 'unknown'.
allowed_reasons = ['violent', 'natural', 'suicide', 'accident', 'mixed', 'unknown']
# Tolerate harmless decoration such as surrounding whitespace, quotes,
# a trailing period or different capitalisation.
normalised_reasons = bog_mummies['reason_death'].str.strip(" \n\t'\".").str.lower()
bog_mummies['reason_death'] = normalised_reasons.where(
    normalised_reasons.isin(allowed_reasons), 'unknown'
)



In [33]:

# Share of each cause-of-death label.
# rename_axis(...) + reset_index(name=...) pin the column names to 'index' and
# 'reason_death' on every pandas version. A bare reset_index() yields
# 'reason_death'/'count' on pandas >= 2.0 and breaks the px.pie call below.
reason_death_counts = (
    bog_mummies['reason_death']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='reason_death')
)
fig = px.pie(reason_death_counts, values='reason_death', names='index', title='Causes of death', color_discrete_sequence=px.colors.sequential.thermal, hole=.3)
fig.show()
file_path = 'reason_death_distribution.svg'
pio.write_image(fig, file_path)


In [35]:

# Ask ChatGPT, once per row, whether the find description refers to a single
# body, several bodies, or an unknown number, and collect the replies in order.
nr_dead = [
    client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role": "system", "content": """You are an LLM that understands descriptions of human deaths. You are able to understand if several people or a single person were found in one location. You are also able synthesize a one-word description from several key words. """},
            {"role": "user", "content": f"""You are provided a string {description} describing in a few words the finds of bog bodies. Generate an output with the count of how many corpses were found in one location. The answer can be either 'single','several', or 'unknown'. In cases you are unsure do not ask for more information but provide the word 'unknown' as answer instead."""}
        ],
        max_tokens=4096,
        temperature=0,
    ).choices[0].message.content
    for description in bog_mummies['Concise description']
]
bog_mummies['nr_dead'] = nr_dead
# Show the ChatGPT label next to the description it was derived from
bog_mummies[['nr_dead', 'Concise description']].values


array([['single',
        'In 1998 the legs of a bog mummy were found during an archaeological survey of a peat-milled bog. A small-scale excavation followed. Alongside the body three short lengths of hazel bushwood, a hazel withy and a birch pole were found. The remains belong to a person of about 18 years old, who lay on the left side. The other parts of the body were probably lost during earlier peat cutting activities. No clothes were found; whether the person really was unclothed is uncertain.   '],
       ['single',
        'During peat milling in 2011 the remains of a young adult male, probably aged between 20-25, were found. A small-scale four-day field investigation followed. The milling machine had already removed the head, neck and left arm. The body was lying on the left side with the legs flexed tightly. It had been placed on the bog surface, possibly in a pool, and two hazel stakes marked the place of deposition. These appear to have crossed above the mans head. No clothe

In [37]:
# Distinct body-count labels returned by ChatGPT
bog_mummies.nr_dead.unique()

array(['single', 'several', 'unknown'], dtype=object)

In [39]:
# Share of finds with a single body, several bodies, or an unknown number.
# rename_axis(...) + reset_index(name=...) pin the column names to 'index' and
# 'nr_dead' on every pandas version. A bare reset_index() yields
# 'nr_dead'/'count' on pandas >= 2.0 and breaks the px.pie call below.
nr_dead_counts = (
    bog_mummies['nr_dead']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='nr_dead')
)
fig = px.pie(nr_dead_counts, values='nr_dead', names='index', title='Number of dead', color_discrete_sequence=px.colors.sequential.thermal, hole=.3)
fig.show()
file_path = 'nr_dead_distribution.svg'
pio.write_image(fig, file_path)

In [54]:
# Parse the radiocarbon ages as numbers; entries that cannot be parsed become NaN.
bog_mummies['Radiocarbon Age (BP)'] = bog_mummies['Radiocarbon Age (BP)'].pipe(pd.to_numeric, errors='coerce')
# Negated offset from 1950, i.e. 1950 minus the BP value
# (presumably BP counts years before 1950 - TODO confirm against the data source).
bog_mummies['Radiocarbon Age (relativ)'] = -(bog_mummies['Radiocarbon Age (BP)'] - 1950)

In [57]:
# Bin the known deposition years into 25 bins and keep only each bin's left edge.
counts, bins = np.histogram(bog_mummies['Radiocarbon Age (relativ)'].dropna(), bins=25)
bins = bins[:-1]

# Order the bins from the smallest to the largest count.
sorted_indices = np.argsort(counts)
sorted_counts, sorted_bins = counts[sorted_indices], bins[sorted_indices]

# Walk through the thermal palette in step with each bin's rank in that order.
thermal_scale = px.colors.sequential.thermal
colors = [
    thermal_scale[int(rank / len(sorted_bins) * (len(thermal_scale) - 1))]
    for rank in range(len(sorted_bins))
]

# Bar chart of counts against the left bin edges, one colour per bar.
fig = px.bar(
    x=sorted_bins,
    y=sorted_counts,
    title='Years of Bog Bodies Deposition',
    labels={'x': 'Year of Deposition', 'y': 'Number of Discoveries'},
)
fig.update_traces(marker_color=colors)

# Display the figure and export it as SVG.
fig.show()
file_path = 'deposition_year_distribution.svg'
fig.write_image(file_path)