### Step 1: Install Kaggle
First, install the Kaggle library using pip. Open a Command Prompt or PowerShell window and run:

```
pip install kaggle
```

### Step 2: Get Your Kaggle API Token

1. Go to the Kaggle website and log in.
2. Click on your profile picture in the top-right corner and select "My Account" (this entry may be labelled "Settings", depending on the version of the site).
3. Scroll down to the "API" section and click "Create New API Token".
4. This downloads a file named `kaggle.json` to your computer.

### Step 3: Set Up the Kaggle API Token on Windows
Place the `kaggle.json` file in the directory where the Kaggle client looks for it.
On Windows, this is usually `C:\Users\<Your_Username>\.kaggle`.
If the `.kaggle` directory does not exist, create it.

In [4]:
import os
import kaggle
import zipfile
import pandas as pd

import warnings

# Silence every warning, of every category, for the remainder of the kernel session.
# NOTE(review): this also hides deprecation and dtype warnings from pandas and the
# Kaggle client; consider narrowing the filter to the specific warning being avoided.
warnings.filterwarnings('ignore')

# Authenticate the shared Kaggle API client before any download is attempted.
# Presumably this reads the kaggle.json token set up in the steps above — confirm
# against the Kaggle client documentation.
kaggle.api.authenticate()

### Download Road Accidents In France Datasets Using Kaggle API

In [15]:
def download_kaggle_dataset(dataset, file_name, save_path):
    """
    Download a single file from a Kaggle dataset and unzip it when needed.

    The file may arrive either as ``<file_name>`` directly or wrapped in
    ``<file_name>.zip`` (the recorded run of this notebook shows both cases).
    When the zip wrapper is present it is extracted into ``save_path`` and
    then deleted.

    Args:
        dataset (str): The Kaggle dataset identifier (e.g., 'username/dataset-name').
        file_name (str): The name of the file to download from the dataset.
        save_path (str): The directory where the file will be saved.

    Returns:
        str or None: ``os.path.join(save_path, file_name)`` when that file
        exists after the download/extraction step; ``None`` when it does not
        exist or when any step fails.

    Note:
        No exception propagates to the caller. Every failure is reported with
        ``print`` and signalled by the ``None`` return value.
    """
    try:
        # Download the requested file into save_path
        kaggle.api.dataset_download_file(dataset, file_name, path=save_path)
        print(f"Dataset {file_name} downloaded successfully.")

        # Path the file has when it was delivered inside a zip wrapper
        zip_file_path = os.path.join(save_path, f"{file_name}.zip")

        # Unzip only if a wrapper was actually delivered
        if os.path.exists(zip_file_path):
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                zip_ref.extractall(save_path)
            print(f"Dataset {file_name} unzipped successfully.")
            os.remove(zip_file_path)  # Remove the zip file after extraction

        file_path = os.path.join(save_path, file_name)

        # Never hand back a path to a file that is not there: a call that
        # completes without producing the file is reported as a failure so
        # callers can rely on the documented None contract.
        if not os.path.isfile(file_path):
            print(
                f"Error downloading the dataset: {file_name} was not found "
                f"in {save_path} after download."
            )
            return None

        return file_path

    except Exception as e:
        # Deliberate best-effort: report the problem and signal it with None
        print(f"Error downloading the dataset: {e}")
        return None


In [26]:
# Kaggle dataset identifier and the individual CSV files to fetch from it
dataset = 'ahmedlahlou/accidents-in-france-from-2005-to-2016'
file_names = [
    'caracteristics.csv',
    'holidays.csv',
    'places.csv',
    'users.csv',
    'vehicles.csv',
]
# Directory the files are saved into
save_path = './src/data/accidents_data'

# Fetch the files one at a time and report the outcome of each attempt;
# a None result from the helper marks a failed download.
for file_name in file_names:
    file_path = download_kaggle_dataset(dataset, file_name, save_path)
    if file_path is None:
        print(f"Failed to download {file_name}")
        continue
    print(f"{file_name} downloaded and saved to {file_path}")


Dataset URL: https://www.kaggle.com/datasets/ahmedlahlou/accidents-in-france-from-2005-to-2016
Dataset caracteristics.csv downloaded successfully.
Dataset caracteristics.csv unzipped successfully.
caracteristics.csv downloaded and saved to ./src/data/accidents_data\caracteristics.csv
Dataset URL: https://www.kaggle.com/datasets/ahmedlahlou/accidents-in-france-from-2005-to-2016
Dataset holidays.csv downloaded successfully.
holidays.csv downloaded and saved to ./src/data/accidents_data\holidays.csv
Dataset URL: https://www.kaggle.com/datasets/ahmedlahlou/accidents-in-france-from-2005-to-2016
Dataset places.csv downloaded successfully.
Dataset places.csv unzipped successfully.
places.csv downloaded and saved to ./src/data/accidents_data\places.csv
Dataset URL: https://www.kaggle.com/datasets/ahmedlahlou/accidents-in-france-from-2005-to-2016
Dataset users.csv downloaded successfully.
Dataset users.csv unzipped successfully.
users.csv downloaded and saved to ./src/data/accidents_data\users.

### Create DataFrame from CSV file

In [7]:

def load_csv_to_dataframe(file_path):
    """
    Read a CSV file, decoded as ISO-8859-1, into a pandas DataFrame.

    Failures never propagate to the caller: a missing file, an empty file,
    a parsing failure, or any other exception is reported with ``print``
    and turned into a ``None`` return value.

    Args:
        file_path (str): The path to the CSV file to be loaded.

    Returns:
        pd.DataFrame or None:
            The parsed data on success, or None if any error occurred.
    """
    result = None  # stays None unless the read completes

    try:
        loaded = pd.read_csv(file_path, encoding='ISO-8859-1')
        print(f"DataFrame loaded successfully from {file_path}")
        result = loaded
    except FileNotFoundError:
        print(f"Error: The file at {file_path} does not exist.")
    except pd.errors.EmptyDataError:
        print("Error: The file is empty.")
    except pd.errors.ParserError:
        print("Error: The file could not be parsed.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return result

In [8]:
# For each dataset: record where its CSV lives, then load it into a DataFrame.
# load_csv_to_dataframe returns None when a file cannot be read, so any of the
# *_df names below may be None after this cell.

# accident characteristics
file_path_caracteristics = 'src/data/accidents_data/caracteristics.csv'
caracteristics_df = load_csv_to_dataframe(file_path_caracteristics)

# holidays
file_path_holidays = 'src/data/accidents_data/holidays.csv'
holidays_df = load_csv_to_dataframe(file_path_holidays)

# places
file_path_places = 'src/data/accidents_data/places.csv'
places_df = load_csv_to_dataframe(file_path_places)

# users
file_path_users = 'src/data/accidents_data/users.csv'
users_df = load_csv_to_dataframe(file_path_users)

# vehicles
file_path_vehicles = 'src/data/accidents_data/vehicles.csv'
vehicles_df = load_csv_to_dataframe(file_path_vehicles)

DataFrame loaded successfully from src/data/accidents_data/caracteristics.csv
DataFrame loaded successfully from src/data/accidents_data/holidays.csv
DataFrame loaded successfully from src/data/accidents_data/places.csv
DataFrame loaded successfully from src/data/accidents_data/users.csv
DataFrame loaded successfully from src/data/accidents_data/vehicles.csv


In [21]:
# Replace the terse source column names of the accident-characteristics table
# with descriptive English ones (the frame is modified in place).
# The glosses are carried over from this notebook's earlier annotations and are
# not verified here against the dataset's data dictionary.
caracteristics_df.rename(
    {
        'Num_Acc': 'Accident_ID',       # unique identifier of the accident
        'an': 'Year',                   # year of the accident
        'mois': 'Month',                # month of the accident
        'jour': 'Day',                  # day of the accident
        'hrmn': 'Time',                 # time of the accident (hours and minutes)
        'lum': 'Lighting_Conditions',   # lighting at the time (e.g. daylight, night with street lighting)
        'agg': 'Area_Type',             # urban vs. rural area
        'int': 'Intersection_Type',     # type of intersection involved
        'atm': 'Weather_Conditions',    # weather during the accident (e.g. clear, rain, fog)
        'col': 'Collision_Type',        # type of collision (e.g. frontal, rear-end)
        'com': 'Commune_Code',          # code of the commune / municipality
        'adr': 'Address',               # description or name of the location
        'gps': 'GPS_Indicator',         # NOTE(review): described as a GPS-availability flag — confirm coding
        'lat': 'Latitude',              # latitude of the accident location
        'long': 'Longitude',            # longitude of the accident location
        'dep': 'Department_Code',       # code of the department
    },
    axis='columns',
    inplace=True,
)


In [23]:
# Give the two holiday-table columns descriptive names (the frame is modified in place).
holidays_df.rename(
    {
        'ds': 'Date',               # date of the holiday; presumably YYYY-MM-DD strings — confirm
        'holiday': 'Holiday_Name',  # name or identifier of the holiday on that date
    },
    axis='columns',
    inplace=True,
)


In [25]:
# Replace the terse source column names of the places table with descriptive
# English ones (the frame is modified in place).
# The glosses are carried over from this notebook's earlier annotations and are
# not verified here; entries marked NOTE(review) look inconsistent with the
# official BAAC data dictionary and should be confirmed. Target names are kept
# unchanged because other cells may depend on them.
places_df.rename(
    {
        'Num_Acc': 'Accident_ID',           # accident identifier
        'catr': 'Road_Category',            # coded road category (e.g. highway, urban road)
        'voie': 'Road_Number',              # road number / identifier
        # NOTE(review): v1 / v2 appear to be the numeric and letter suffix of the
        # road number in the BAAC dictionary, not vehicle-related values — confirm.
        'v1': 'Variable_1',
        'v2': 'Variable_2',
        'circ': 'Circulation_Type',         # coded traffic regime (e.g. one-way, two-way)
        'nbv': 'Number_of_Lanes',           # number of traffic lanes
        # NOTE(review): pr is reportedly the reference marker number and pr1 the
        # distance from that marker — confirm.
        'pr': 'Kilometric_Point',
        'pr1': 'Kilometric_Point_1',
        'vosp': 'Traffic_Lane_Type',        # coded reserved lane (e.g. bus or cycle lane)
        'prof': 'Road_Profile',             # coded longitudinal profile (e.g. flat, slope)
        'plan': 'Road_Curvature',           # coded plan layout (e.g. straight, curved)
        # NOTE(review): lartpc is reportedly the width of the central reservation
        # and larrout the width of the carriageway open to traffic, so these two
        # target names may be misleading — confirm.
        'lartpc': 'Main_Carriageway_Width',
        'larrout': 'Total_Road_Width',
        'surf': 'Surface_Condition',        # coded surface state (e.g. dry, wet, icy)
        'infra': 'Infrastructure_Type',     # coded nearby infrastructure (e.g. bridge, tunnel)
        # NOTE(review): situ reportedly encodes where on the road the accident
        # happened (carriageway, verge, sidewalk, ...) — confirm.
        'situ': 'Accident_Situation',
        # NOTE(review): env1 reportedly flags proximity to a school — confirm.
        'env1': 'Environmental_Factors',
    },
    axis='columns',
    inplace=True,
)


In [27]:
# Replace the terse source column names of the users table with descriptive
# English ones (the frame is modified in place).
# The glosses are carried over from this notebook's earlier annotations and are
# not verified here; entries marked NOTE(review) should be confirmed against
# the official BAAC data dictionary.
users_df.rename(
    {
        'Num_Acc': 'Accident_ID',           # accident identifier, shared with the other tables
        'place': 'Position_In_Vehicle',     # coded seat position of the person in the vehicle
        'catu': 'User_Category',            # coded role (e.g. driver, passenger, pedestrian)
        # NOTE(review): the code order of grav is NOT a severity scale in the BAAC
        # dictionary as commonly published (1 = unharmed, 2 = killed,
        # 3 = hospitalised, 4 = lightly injured) — confirm before treating the
        # raw value as ordinal.
        'grav': 'Severity',
        'sexe': 'Gender',                   # presumably 1 = male, 2 = female — confirm
        'trajet': 'Journey_Type',           # coded purpose of the trip (e.g. commuting, leisure)
        'secu': 'Safety_Equipment',         # coded safety equipment (e.g. seatbelt, helmet)
        'locp': 'Pedestrian_Location',      # coded location of the pedestrian
        'actp': 'Pedestrian_Activity',      # coded action of the pedestrian
        # NOTE(review): etatp reportedly records whether the pedestrian was alone,
        # accompanied or in a group, rather than impairment — confirm.
        'etatp': 'Pedestrian_Condition',
        'an_nais': 'Year_of_Birth',         # year of birth of the person
        'num_veh': 'Vehicle_ID',            # identifier of the vehicle the person was in
    },
    axis='columns',
    inplace=True,
)


In [29]:
# Replace the terse source column names of the vehicles table with descriptive
# English ones (the frame is modified in place).
# The glosses are carried over from this notebook's earlier annotations and are
# not verified here; entries marked NOTE(review) should be confirmed against
# the official BAAC data dictionary.
vehicles_df.rename(
    {
        'Num_Acc': 'Accident_ID',           # accident identifier, shared with the other tables
        # NOTE(review): senc reportedly gives direction relative to increasing /
        # decreasing road markers, not a compass heading — confirm.
        'senc': 'Travel_Direction',
        'catv': 'Vehicle_Category',         # coded vehicle category (e.g. car, motorcycle, truck)
        # NOTE(review): occutc reportedly counts occupants of public-transport
        # vehicles only — confirm.
        'occutc': 'Number_of_Occupants',
        'obs': 'Obstacle_Hit',              # coded fixed obstacle struck
        'obsm': 'Mobile_Obstacle',          # coded moving obstacle struck
        # NOTE(review): choc reportedly encodes the initial point of impact. The
        # target name is the same one the caracteristics table uses for 'col',
        # so merging the two frames will produce clashing column names.
        'choc': 'Collision_Type',
        'manv': 'Vehicle_Manoeuvre',        # coded manoeuvre before the accident
        'num_veh': 'Vehicle_ID',            # identifier of the vehicle within the accident
    },
    axis='columns',
    inplace=True,
)
