# Accidents cargo

## Import des dépendances et du dataset

In [None]:
import pandas as pd
import geopandas as gpd

# Path to the accidents shapefile (.shp), relative to the notebook's working directory
shp_path = "../data/extracted/Shipping_Accidents/Shipping_Accidents.shp"

# Read the shapefile with geopandas; `df` is the frame used by all later cells
df = gpd.read_file(shp_path)

## Préparation des données

### Nettoyage des données

1. Les données allant des années 1989 à 2023, nous allons les filtrer et ne garder qu'un intervalle de 20 ans, soit de 2003 à 2023. Cela nous permettra de ne pas avoir des données trop anciennes qui pourraient fausser notre analyse.

2. Le nombre de types d'accidents étant important, voire redondant, nous allons les regrouper en 5 catégories :
   - Technical or Equipment Failure
   - Navigation or Maneuvering Incident
   - Fire or Explosion
   - Life-saving Equipment Incident
   - Other

In [None]:
## Time window
start_year = 2003
end_year = 2023

# Rows whose year lies in [start_year, end_year]; `between` is inclusive on
# both ends by default.
in_period = df['Year'].between(start_year, end_year)


## Removal of inconsistent coordinates
# A position is usable only when both coordinates are present and non-zero.
has_position = (
    df['Longitude'].notna()
    & df['Latitude'].notna()
    & (df['Longitude'] != 0)
    & (df['Latitude'] != 0)
)

# The explicit copy detaches the filtered rows from the original frame, so the
# column assignment below cannot trigger pandas' SettingWithCopyWarning.
df = df[in_period & has_position].copy()


## Grouping of accident types
# Target category -> raw labels found in the data (spelling and case variants
# are listed exactly as they appear; labels not listed here are left as-is).
acc_type_groups = {
    'Technical or Equipment Failure': [
        'Damage to ship or equipment',
        'Damages to ships or equipment',
        'Dammage to ships or equipment',
        'Door fault . fault in doorways',
        'hull failure',
        'Hull failure/failure of watertight doors/ports etc.',
        'machinery damage',
        'Machinery damage',
        'Machinery dammage',
        'macihnery damage',
        'Technical failure',
    ],
    'Navigation or Maneuvering Incident': [
        'Capsizing.listing',
        'Capsizing/listing',
        'collision',
        'Collision',
        'contact',
        'Contact',
        'Flooding/Foundering',
        'grounding',
        'Grounding',
        'Grounding/stranding',
        'Loss of control',
        'stranding.grounding',
        'Stranding.grounding',
        'stranding/grounding',
        'Stranding/grounding',
        'Tilt / crash',
    ],
    'Fire or Explosion': [
        'Fire',
        'Fire . explosion',
        'Fire/Explosion',
    ],
    'Life-saving Equipment Incident': [
        'Accidents with life-saving appliances',
        'Related to the use of rescue equipment',
    ],
    'Other': [
        'n.i.',
        'other',
        "other (unsealing the vessel's hull)",
        'Other reason',
        'Other type',
        'Physical damage',
        'Sunk',
        'v.serious accident',
    ],
}

# Invert the table into a flat {raw label: category} lookup.
acc_type_mapping = {
    raw_label: category
    for category, raw_labels in acc_type_groups.items()
    for raw_label in raw_labels
}

# One pass over the column; missing accident types fall into 'Other'.
df['Acc_Type'] = df['Acc_Type'].replace(acc_type_mapping).fillna('Other')

### Enrichissement des données

La colonne `Location` n'étant pas toujours renseignée, nous allons la compléter en utilisant deux autres datasets :
- World Port Index – Port Data
- Natural Earth

Ces deux datasets contiennent des informations sur les ports et les côtes du monde entier, ce qui nous permettra, par croisement de données, de compléter les informations manquantes dans notre dataset.

In [None]:
# Express the accident layer in WGS84 (EPSG:4326).
# NOTE(review): a layer without CRS is assumed to already be in WGS84, the same
# assumption applied to the auxiliary layers below; confirm against the data.
if df.crs is None:
    df.set_crs("EPSG:4326", inplace=True)
else:
    df.to_crs("EPSG:4326", inplace=True)
gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")

# Load the auxiliary shapefiles (land polygons and ports)
lands = gpd.read_file("../data/extracted/Shipping_Accidents/lands.shp")
ports = gpd.read_file("../data/extracted/Shipping_Accidents/ports.shp")

# Force the CRS when the shapefile does not define one
for layer in [lands, ports]:
    if layer.crs is None:
        layer.set_crs("EPSG:4326", inplace=True)

# Reproject to a CRS expressed in metres so buffer sizes can be given in metres.
# NOTE(review): EPSG:3857 (Web Mercator) inflates distances away from the
# equator (roughly by 1 / cos(latitude)), so these buffers are narrower on the
# ground than their nominal size; consider a local projected CRS.
gdf_proj = gdf.to_crs("EPSG:3857")
ports_proj = ports.to_crs("EPSG:3857")
land_proj = lands.to_crs("EPSG:3857")

# Buffer radii, in units of the projected CRS
buffer_distances = {
    "port": 3000,        # nominal 3 km
    "approach": 10000,   # nominal 10 km
    "coast": 20000       # nominal 20 km
}

ports_buffer = gpd.GeoDataFrame(geometry=ports_proj.buffer(buffer_distances["port"]), crs=ports_proj.crs)
ports_approach_buffer = gpd.GeoDataFrame(geometry=ports_proj.buffer(buffer_distances["approach"]), crs=ports_proj.crs)
coast_buffer = gpd.GeoDataFrame(geometry=land_proj.buffer(buffer_distances["coast"]), crs=land_proj.crs)


## Assignment of geographic zones through spatial joins

def _intersects_any(points, zones):
    """Flag the rows of `points` that intersect at least one geometry of `zones`.

    Args:
        points: GeoDataFrame whose rows are tested.
        zones: GeoDataFrame of geometries to test against (same CRS as `points`).

    Returns:
        Boolean Series indexed like `points`; True where the row matched.
    """
    # An inner join keeps only matching rows; a row matching several zones
    # appears several times, which `isin` collapses to a single True.
    hits = points.sjoin(zones, how="inner", predicate="intersects")
    return pd.Series(points.index.isin(hits.index), index=points.index)


# a) Port zone
is_port = _intersects_any(gdf_proj, ports_buffer)
gdf_proj["is_port"] = is_port

# b) Port approach zone (only rows not already flagged as port are tested)
is_approach = _intersects_any(
    gdf_proj[~gdf_proj["is_port"]], ports_approach_buffer
).reindex(gdf_proj.index, fill_value=False)
gdf_proj["is_port_approach"] = is_approach

# c) Coastal zone (only rows flagged neither as port nor as port approach)
mask = (~gdf_proj["is_port"]) & (~gdf_proj["is_port_approach"])
is_coastal = _intersects_any(
    gdf_proj[mask], coast_buffer
).reindex(gdf_proj.index, fill_value=False)
gdf_proj["is_coastal"] = is_coastal


# d) Final classification of the location
def classify_location(row):
    """Return the location label of one row from its zone flags.

    Flags are checked from the most specific zone to the least specific one:
    "Port", then "Port approach", then "Sea" for the coastal band, and
    "Open sea" when no flag is set.
    """
    if row["is_port"]:
        return "Port"
    elif row["is_port_approach"]:
        return "Port approach"
    elif row["is_coastal"]:
        return "Sea"
    else:
        return "Open sea"

gdf_proj["Location"] = gdf_proj.apply(classify_location, axis=1)


## Update of the original DataFrame
# `gdf_proj` has the same rows in the same order as `df`, and `Location` is not
# a geometry column, so it can be copied positionally without reprojecting.
# NOTE(review): this overwrites `Location` for every row, including rows that
# already had a value, whereas the accompanying text speaks of completing
# missing values; confirm the intent.
df["Location"] = gdf_proj["Location"].values

## Export du dataset préparé pour le dashboard

In [None]:
# Single definition of the output location, so the confirmation message always
# names the file that was actually written (the previous message reported
# "/data/cleaned/..." while the file goes to "../data/cleaned/...").
output_path = "../data/cleaned/shipping_accidents_cleaned.csv"

# Write the prepared dataset without the index column
df.to_csv(output_path, index=False)
print(f"Fichier enregistré : {output_path}")