In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [None]:
## Checking the WebSite structure
# Download the Wikipedia article that contains the Formula One fatalities tables.
url ='https://en.wikipedia.org/wiki/List_of_Formula_One_fatalities'

# A browser-like User-Agent is sent along with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
}

# Without a timeout, requests waits indefinitely and a stalled connection hangs the kernel.
page = requests.get(url, headers=headers, timeout=30)
# Fail here on 4xx/5xx responses so an error page is never parsed as if it were the article.
page.raise_for_status()

soup = BeautifulSoup(page.text, 'html.parser')
# Preview only the start of the parsed document; printing the whole page floods the output.
print(soup.prettify()[:2000])

In [None]:
## Finding the correct table
# First attempt: the exact class string used by the page when this notebook was written.
table = soup.find('table', class_ = 'wikitable plainrowheaders sortable')
if table is None:
    # A multi-class string passed to class_ only matches the exact attribute value,
    # so fall back to a CSS selector that ignores class order and extra classes.
    table = soup.select_one('table.wikitable.plainrowheaders.sortable')
if table is None:
    # Stop here with a clear message instead of failing later on `None.find_all`.
    raise ValueError("Could not find the fatalities table (wikitable plainrowheaders sortable) in the page")
print(table)

In [None]:
## Finding the correct headers
# Exploratory step: collect every <th> element found anywhere inside the selected table
# and display the result as the cell output.
f_headers = table.find_all('th')
f_headers

In [None]:
## Selecting the headers for the table "Fatalities By Name" and converting it into a list
# Only the <th> cells of the first <tr> are used: they hold the column titles.
f_headers = table.find('tr').find_all('th')
# Keep the visible text of each header cell, without surrounding whitespace.
f_table_headers = list(map(lambda th: th.text.strip(), f_headers))
print(f_table_headers)

In [None]:
## Converting the list into a DataFrame
# Creates an empty DataFrame (no rows yet) whose columns are the scraped header names,
# then displays it as the cell output.
df = pd.DataFrame(columns = f_table_headers)
df

In [None]:
## Collecting the data from the WebSite and building the DataFrame
# All rows are gathered in a plain list and the DataFrame is created once at the end.
# Appending with df.loc[len(df)] copies the frame on every row and, when the cell is
# run twice, adds every row a second time; rebuilding `df` keeps the cell idempotent.
rows = table.find_all('tr')[1:] # Excluding the first line since it contains the headers
n_columns = len(f_table_headers)

records = []
for row in rows:
    # The first <th> of a row (if any) is its row header; the <td> cells follow it.
    header_data = row.find('th')
    individual_row_data = [header_data.text.strip()] if header_data else []
    individual_row_data += [data.text.strip() for data in row.find_all('td')]

    # Handling extra and missing values: pad short rows with '' and cut long rows
    # so every record has exactly one value per header.
    individual_row_data = (individual_row_data + [''] * n_columns)[:n_columns]
    records.append(individual_row_data)

df = pd.DataFrame(records, columns=f_table_headers)

In [None]:
# Remove the row limit pandas applies when displaying frames, then show
# the number of non-missing values in each scraped column.
pd.options.display.max_rows = None
df.count()

In [None]:
# Preview the first values of the "Driver" column before cleaning them.
df["Driver"].head()

In [None]:
## Adjusting Drivers Names
# Matches the shortest span that starts with "(" or "[" and ends with ")" or "]".
annotation_pattern = re.compile(r"[\(\[].*?[\)\]]")


def _strip_annotations(name):
    """Return `name` without bracketed/parenthesised spans and without outer whitespace."""
    return annotation_pattern.sub("", name).strip()


df["Driver"] = df["Driver"].apply(_strip_annotations)
df.head(10)

In [None]:
# Importing the main Dataset to include the new information
# The file path is relative, so it is resolved against the current working directory.
f1_df = pd.read_csv('F1DataExploration.csv')

In [None]:
# Preview the first rows of the main dataset.
f1_df.head()

In [None]:
## Adjusting the dates to compare them after
# For each frame: parse the date column in place (unparseable values become NaT
# because of errors='coerce') and store the year of that date in a new column.
for frame, date_column, year_column in (
    (df, 'Date of accident', 'Year of Accident'),
    (f1_df, 'driverDeath', 'Year of Death'),
):
    frame[date_column] = pd.to_datetime(frame[date_column], errors='coerce')
    frame[year_column] = frame[date_column].dt.year


In [None]:
# Add a 'fatalAccident' column filled with empty strings as a placeholder;
# the following cell assigns the whole column again with the real values.
f1_df['fatalAccident'] = ""

In [None]:
# Bringing the accidents information to the Dataset
# Lookup keys are (driver name, accident year). Accidents whose date could not be parsed
# have no year (NaN) and could never match a row, so they are left out, and the
# remaining years are stored as integers.
dated_accidents = df.dropna(subset=['Year of Accident'])
fatal_crashes = set(zip(dated_accidents['Driver'], dated_accidents['Year of Accident'].astype(int)))

# One pass over the two key columns instead of a row-wise apply(axis=1): same result,
# much less overhead, and it also works when f1_df has no rows.
# A row is flagged with "fatalAccident <driver>" when its (driverName, year) pair is a key.
f1_df['fatalAccident'] = [
    "fatalAccident " + name if (name, year) in fatal_crashes else ""
    for name, year in zip(f1_df['driverName'], f1_df['year'])
]

f1_df.head()

In [None]:
# Checking: rows of Ayrton Senna, sorted so the flagged rows come first
is_senna = f1_df['driverName'].eq('Ayrton Senna')
filtered_df1 = f1_df[is_senna]
check_columns = ['raceId','year', 'date','GrandPrixName', 'driverDeath','driverName', 'fatalAccident']
filtered_df1[check_columns].sort_values(by='fatalAccident', ascending=False).head()

In [None]:
# Checking: rows of Charles de Tornaco that were flagged as a fatal accident.
# 'fatalAccident' only ever holds "" (not flagged) or "fatalAccident <driver>", so the
# non-flagged marker to exclude is the empty string; comparing against r'\N' never
# excluded anything.
filtered_df = f1_df[(f1_df['fatalAccident'] != "") & (f1_df['driverName'] == 'Charles de Tornaco')]

In [None]:
# Display the identifying columns of the filtered rows, ordered by driver name.
filtered_df[['raceId','year', 'date','GrandPrixName', 'driverName', 'fatalAccident']].sort_values(by='driverName')

In [None]:
# Save the dataset, including the columns added in this notebook, to a new CSV file.
# index=False leaves the DataFrame index out of the file.
f1_df.to_csv('F1Dataset.csv', index=False)