In [17]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 26 13:51:49 2024
@author: alexfion
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import openai
from openai import OpenAI
import pandas as pd

def extract_year(date_string):
    # This pattern looks for 4 consecutive digits, which likely represent a year
    match = re.search(r'\b\d{4}\b', date_string)
    if match:
        return match.group()
    else: 
        match = re.search(r'\b\d{3}\b', date_string)
        if match:
            return match.group()
        else:
            return None

#URL: https://en.wikipedia.org/wiki/List_of_monarchs_of_the_British_Isles_by_cause_of_death
url = 'URL'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

tables = soup.find_all('table', class_='wikitable')
dfs = []

for table_num, table in enumerate(tables, 1):
    headers = [header.text.strip() for header in table.find_all('th')]
    rows = []
    current_house = None
    house_rows_left = 0

    for row in table.find_all('tr')[1:]:  # Skip the header row
        cells = row.find_all(['td', 'th'])

        if len(cells)==5:
                    
            # Get the first td element
            first_td = row.find('td')
            
            # Create a new empty td element
            new_td = soup.new_tag('td')
            
            # Insert the new td after the first one
            first_td.insert_after(new_td)
            
            # The modified row is now in the soup object
            row = row.td.parent
            cells = row.find_all(['td', 'th'])

        if cells:
            row_data = [''] * len(headers)
            for i, cell in enumerate(cells):
                if i == 1 and cell.has_attr('rowspan'):
                    current_house = cell.text.strip()
                    house_rows_left = int(cell['rowspan']) - 1
                    row_data[i] = current_house
                elif i == 1 and house_rows_left > 0:
                    row_data[i] = current_house
                    house_rows_left -= 1
                else:
                    row_data[i] = cell.text.strip()
            
            rows.append(row_data)

    df = pd.DataFrame(rows, columns=headers)
    df['Table_Number'] = table_num
    dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)
print(combined_df)


                     Name                        House         Born  \
0               Kenneth I    House of Alpin (Scotland)    after 800   
1          Constantine II    House of Alpin (Scotland)   before 879   
2                  Eadred        West Saxons (England)       c. 923   
3     Edgar the Peaceable        West Saxons (England)       c. 943   
4    Ethelred the Unready        West Saxons (England)       c. 968   
..                    ...                          ...          ...   
102             Donald II    House of Alpin (Scotland)                
103              Ælfweard        West Saxons (England)                
104             Athelstan        West Saxons (England)       c. 895   
105         Edwy the Fair        West Saxons (England)       c. 941   
106            Donald III  House of Dunkeld (Scotland)  before 1040   

                  Reign            Death  \
0               843–858  13 February 858   
1               900–943              952   
2              

In [18]:
combined_df.loc[combined_df['Table_Number']==1,'Table_Number']='Natural Causes'
combined_df.loc[combined_df['Table_Number']==2,'Table_Number']='Killed'
combined_df.loc[combined_df['Table_Number']==3,'Table_Number']='Murdered, assassinated, executed or euthanised'
combined_df.loc[combined_df['Table_Number']==4,'Table_Number']='Other'
combined_df.loc[combined_df['Table_Number']==5,'Table_Number']='Accidental death'
combined_df.loc[combined_df['Table_Number']==6,'Table_Number']='Unknown'

#extract years
combined_df['Year'] = combined_df['Death'].apply(extract_year)
combined_df = combined_df.dropna(subset=['Year'])



  combined_df.loc[combined_df['Table_Number']==1,'Table_Number']='Natural Causes'


In [51]:


client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key='OPENAI_API_KEY'
)


In [55]:
gender=list()
for i in combined_df['Name']:
    chatgpt_response = client.chat.completions.create(
        model="gpt-4-1106-preview", 
        messages=[
            {"role": "system", "content": """You are an LLM that understands human gender. Furthermore, you can recognise gender from first names. You understand that 'female' and 'male' are two distinctive genders. If you are unsure, provide 'unknown' as an answer."""},
            {"role": "user", "content": f""" You are provided an input {i}, output the gender you identified from the first name of a british monarch in one word. It can be either 'male' or 'female'. If NaN is provided, return NaN.  """}
        ],
        max_tokens=4096,
        temperature=0
    )
    chatgpt_response_message = chatgpt_response.choices[0].message.content
    #print(chatgpt_response_message)
    gender.append(chatgpt_response_message)
combined_df['Gender']=gender
combined_df[['Gender','Name']].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['Gender']=gender


array([['male', 'Kenneth I'],
       ['male', 'Constantine II'],
       ['male', 'Eadred'],
       ['male', 'Edgar the Peaceable'],
       ['male', 'Ethelred the Unready'],
       ['male', 'Edmund Ironside'],
       ['male', 'Sweyn Forkbeard'],
       ['male', 'Malcolm II'],
       ['male', 'Canute'],
       ['male', 'Harold Harefoot'],
       ['male', 'Harthacanute'],
       ['male', 'St Edward the Confessor'],
       ['male', 'Edgar'],
       ['male', 'Alexander I'],
       ['male', 'Edgar the Atheling'],
       ['male', 'Henry I'],
       ['male', 'David I'],
       ['male', 'Stephen'],
       ['male', 'Malcolm IV'],
       ['female', 'Matilda (Empress Maud)'],
       ['male', 'Henry II'],
       ['male', 'William I'],
       ['male', 'John "Lackland"'],
       ['male', 'Alexander II'],
       ['male', 'Henry III'],
       ['female', 'Margaret'],
       ['male', 'Edward I "Longshanks"'],
       ['male', 'John'],
       ['male', 'Robert I'],
       ['male', 'Edward Balliol'],
       

In [69]:
death=list()
subdf=combined_df[(combined_df.Table_Number=='Unknown')|(combined_df.Table_Number=='Other')|(combined_df.Table_Number=='Murdered, assassinated, executed or euthanised')]
for i in subdf['Notes']:
    chatgpt_response = client.chat.completions.create(
        model="gpt-4-1106-preview", 
        messages=[
            {"role": "system", "content": """You are an LLM that understands human death. Furthermore, you can recognise reasons of death from text."""},
            {"role": "user", "content": f""" You are provided an input {i}, output the cause of death, which can be one of these possibilities: 'Natural Causes', 'Killed','Murdered', 'Assassinated', 'Executed' ,'Euthanised', 'Unknown','Accidental death', in one word. """}
        ],
        max_tokens=4096,
        temperature=0
    )
    chatgpt_response_message = chatgpt_response.choices[0].message.content
    #print(chatgpt_response_message)
    death.append(chatgpt_response_message)
subdf['Table_Number']=death
subdf[['Notes','Table_Number']].values

combined_df.update(subdf)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf['Table_Number']=death


array([['Killed by his successor, Giric', 'Assassinated'],
       ['Murdered at a party in Pucklechurch by Leofa, an exiled thief',
        'Murdered'],
       ['Killed in internal strife by Cuilén, possibly at Forres.',
        'Killed'],
       ['Killed in Lothian when the hall he was in was burnt to the ground',
        'Killed'],
       ['Killed by Kenneth II', 'Killed'],
       ['Killed at Corfe Castle by his stepmother Ælfthryth or one of her party.  Canonised as Saint Edward the Martyr in 1001.',
        'Murdered'],
       ['Assassinated; tradition states that he was killed at Fettercairn at the instigation of Fionnguala, daughter of Cuncar of Angus.',
        'Assassinated'],
       ['Assassinated and succeeded by Malcolm III', 'Assassinated'],
       ['Killed by Máel Petair of Mearns', 'Killed'],
       ["Supposedly murdered in Berkeley Castle in Gloucestershire after a metal tube (or, in some versions, a sawn-off ram's horn) and a red-hot poker were inserted into his anus. A

In [91]:
dynasty=list()
for i in combined_df['Reign']:
    chatgpt_response = client.chat.completions.create(
        model="gpt-4-1106-preview", 
        messages=[
            {"role": "system", "content": """You are an LLM that understands numbers from strings and can assign numbers to British ruling houses. These are the ruling houses and reign durations: Saxons and Danes (800s to 1066), Normans (1066 to 1154), Plantagenets (1154 to 1485), Tudors (1485 to 1603), Interregnum (1649 to 1660), Stuarts (1603 to 1714), Hanoverians (1714 to 1901), Windsors (1901 to present). If you are unsure, provide 'Other' as an answer."""},
            {"role": "user", "content": f""" You are provided with an input date {i} as a string, output the name of the monarch's house you identified in one word.  """}
        ],
        max_tokens=4096,
        temperature=0
    )
    chatgpt_response_message = chatgpt_response.choices[0].message.content
    #print(chatgpt_response_message)
    dynasty.append(chatgpt_response_message)
combined_df['Dynasty']=dynasty
combined_df[['Dynasty','House']].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['Dynasty']=dynasty


array([['Saxons', 'House of Alpin (Scotland)'],
       ['Saxons', 'House of Alpin (Scotland)'],
       ['Saxons', 'West Saxons (England)'],
       ['Saxons', 'West Saxons (England)'],
       ['Saxons', 'West Saxons (England)'],
       ['Saxons', 'West Saxons (England)'],
       ['Saxons', 'Danish Kings (England)'],
       ['Saxons', 'House of Alpin (Scotland)'],
       ['Saxons', 'Danish Kings (England)'],
       ['Saxons', 'Danish Kings (England)'],
       ['Saxons', 'Danish Kings (England)'],
       ['Saxons', 'West Saxon Restoration (England)'],
       ['Normans', 'House of Dunkeld (Scotland)'],
       ['Normans', 'House of Dunkeld (Scotland)'],
       ['Normans', 'West Saxon Restoration (England)'],
       ['Normans', 'The Normans (England)'],
       ['Plantagenets', 'House of Dunkeld (Scotland)'],
       ['Normans', 'House of Blois (England)'],
       ['Plantagenets', 'House of Dunkeld (Scotland)'],
       ['Normans', 'Angevins or Plantagenets (England)'],
       ['Plantagenets', 

In [95]:
combined_df.to_csv('all_monarchs_deaths.csv', index=False)