## Imports

In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shutil
from PIL import Image
import scipy.stats as stats

import warnings
warnings.filterwarnings("ignore")

from IPython.display import display


from sklearn.model_selection import train_test_split

## Notes
+ Be sure the following are completed if you want to run this program:
    1. Download the dataset and place it in the same level as AutoDriving.ipynb
    2. Extract the images inside the folder AutoDriving\Part 1\images.zip
    3. (Optional) Have a virtual environment ready to install YOLOv8
+ This program will run locally on your CPU or GPU

# Part 1

## DataFrame Initialization

In [2]:
# File Paths
part_1_path = "AutoDriving\Part 1"
images_path = os.path.join(part_1_path, "images")
labels_path = os.path.join(part_1_path, "labels.csv")

In [3]:
df = pd.read_csv(labels_path, header=None)
print("Our current DataFrame upon initialization:", "\n")
df.head(500000000)

Our current DataFrame upon initialization: 



Unnamed: 0,0,1,2,3,4,5
0,0,pickup_truck,213,34,255,50
1,0,car,194,78,273,122
2,0,car,155,27,183,35
3,0,articulated_truck,43,25,109,55
4,0,car,106,32,124,45
...,...,...,...,...,...,...
351544,110590,car,18,57,97,98
351545,110591,articulated_truck,2,71,690,351
351546,110592,pickup_truck,3,240,214,378
351547,110592,car,465,111,507,135


#### The cells are not formatted correctly:
1. The columns are not labeled.
2. The image IDs are reduced down from their original format, and need to be corrected.
3. The class label will need to be encoded.
4. We will need to handle columns 2 - 5 to correctly generate the border-box sizes.

In [4]:
# Label columns
df.columns=["image_id", "class_label", "x1", "y1", "x2", "y2"]
# Correctly reflect image IDs
df['image_id'] = df['image_id'].astype(str).str.zfill(8)
# Handling of columns 2-5
df[['x1','y1','x2','y2']] = df[['x1','y1','x2','y2']].astype(int)

In [5]:
# Ensure that border-boxes are consistent and correct
df = df[(df['x2'] > df['x1']) & (df['y2'] > df['y1'])]

In [6]:
df = df.drop_duplicates()

In [7]:
print(df.dtypes)
df.head()

image_id       object
class_label    object
x1              int32
y1              int32
x2              int32
y2              int32
dtype: object


Unnamed: 0,image_id,class_label,x1,y1,x2,y2
0,0,pickup_truck,213,34,255,50
1,0,car,194,78,273,122
2,0,car,155,27,183,35
3,0,articulated_truck,43,25,109,55
4,0,car,106,32,124,45


## Cleaning Data

In [8]:
# class_label
print("Number of unique labels: ", len(df['class_label'].unique()))
print("Unique labels: ", df['class_label'].unique()[:50])

Number of unique labels:  11
Unique labels:  ['pickup_truck' 'car' 'articulated_truck' 'bus' 'motorized_vehicle'
 'work_van' 'single_unit_truck' 'pedestrian' 'bicycle'
 'non-motorized_vehicle' 'motorcycle']


#### These are our unique class labels that will need to be encoded:

In [9]:
# Check unique classes
class_names = df['class_label'].unique().tolist()
print("Classes:", class_names)

# Map classes to IDs
class_to_id = {class_name: idx for idx, class_name in enumerate(class_names)}
print("Class → ID mapping:", class_to_id)

Classes: ['pickup_truck', 'car', 'articulated_truck', 'bus', 'motorized_vehicle', 'work_van', 'single_unit_truck', 'pedestrian', 'bicycle', 'non-motorized_vehicle', 'motorcycle']
Class → ID mapping: {'pickup_truck': 0, 'car': 1, 'articulated_truck': 2, 'bus': 3, 'motorized_vehicle': 4, 'work_van': 5, 'single_unit_truck': 6, 'pedestrian': 7, 'bicycle': 8, 'non-motorized_vehicle': 9, 'motorcycle': 10}


In [10]:
# image_id
print("Number of unique images in labels.csv: ", len(df['image_id'].unique()))
jpg_files = [f for f in os.listdir(images_path) if f.endswith(".jpg")]
print("Number of unique images in images folder: ", len(jpg_files))

Number of unique images in labels.csv:  110000
Number of unique images in images folder:  5626


#### *Note* that there are more unique images inside of **labels.csv** than our **images**.
#### We will need to make sure that we are using only the rows that have a corresponding **.jpg** file.

In [11]:
jpg_ids = [f.replace(".jpg", "") for f in jpg_files]
df = df[df['image_id'].isin(jpg_ids)]

print("Number of unique images in labels.csv: ", len(df['image_id'].unique()))
print("Number of unique images in images folder: ", len(jpg_files))

Number of unique images in labels.csv:  5626
Number of unique images in images folder:  5626


In [12]:
print(len(df))
df.head(17513)

17967


Unnamed: 0,image_id,class_label,x1,y1,x2,y2
0,00000000,pickup_truck,213,34,255,50
1,00000000,car,194,78,273,122
2,00000000,car,155,27,183,35
3,00000000,articulated_truck,43,25,109,55
4,00000000,car,106,32,124,45
...,...,...,...,...,...,...
17508,00005510,non-motorized_vehicle,1,210,70,372
17509,00005510,car,605,143,640,179
17510,00005511,car,153,250,322,348
17511,00005512,car,289,336,539,479


#### *Note* that:
1. We began with 351,549 unique rows at the beginning of this project.
2. We now have 17,925 unique rows with identified class labels.
3. We have striped **labels.csv** to only match the unique .jpg numbers in our **images** folder.
4. Our highest identified .jpg file is 00005643.jpg, yet our count has a total of 5489 unique .jpgs, meaning some .jpgs are missing as well in this range.

## Gather Training / Validation / Test data

In [13]:
# Unique image ids
image_ids = df['image_id'].unique().tolist()

# Unique class labels
class_map = class_to_id

In [14]:
# Split train/val (80/20)
train_ids, val_ids = train_test_split(image_ids, test_size=0.2, random_state=42)
print("Train images:", len(train_ids))
print("Val images:", len(val_ids))

Train images: 4500
Val images: 1126


In [15]:
def convert_bbox(row, img_width, img_height):
    """
    Converts bounding box annotations from a CSV file into the YOLO format.

    This function normalizes the bounding box coordinates for training object 
    detection models. It also maps the class label to a class ID.

    The CSV file used for the input contains the following columns:
        1. image_id    : The identifier for the image (used for mapping).
        2. class_label : The label of the object in the image (e.g., "car", "pickup_truck").
        3. x1          : The x-coordinate of the top-left corner of the bounding box.
        4. x2          : The x-coordinate of the bottom-right corner of the bounding box.
        5. y1          : The y-coordinate of the top-left corner of the bounding box.
        6. y2          : The y-coordinate of the bottom-right corner of the bounding box.

    Parameters:
    row (object)      : A row from the CSV file containing bounding box annotation data.
    img_width (int)   : The width (in pixels) of the image associated with the bounding box.
    img_height (int)  : The height (in pixels) of the image associated with the bounding box.

    Returns:
    str : A string formatted for YOLO training, containing:
          - class ID (mapped from `class_label`)
          - normalized x_center, y_center, width, height (all normalized by image dimensions).
          
          The returned format is:
          "{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}"

    Example:
    >>> row = {'image_id': '00000000', 'class_label': 'pickup_truck', 'x1': 213, 'x2': 34, 'y1': 255, 'y2': 50}
    >>> img_width, img_height = 200, 200
    >>> convert_bbox(row, img_width, img_height)
    '0 0.468000 0.084000 0.084000 0.032000'

    Notes:
    - The `class_map` dictionary must be defined elsewhere in the code and maps class labels to class IDs (e.g., `{'dog': 0, 'cat': 1}`).
    """
    x_center = (row['x1'] + row['x2']) / 2 / img_width
    y_center = (row['y1'] + row['y2']) / 2 / img_height
    width = (row['x2'] - row['x1']) / img_width
    height = (row['y2'] - row['y1']) / img_height
    class_id = class_map[row['class_label']]
    return f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}"

In [16]:
# YOLO path building
yolo_base_path = os.path.join(part_1_path, "images_yolo")

# YOLO train/val folders
train_img_path = os.path.join(yolo_base_path, "train", "images")
train_label_path = os.path.join(yolo_base_path, "train", "labels")
val_img_path = os.path.join(yolo_base_path, "val", "images")
val_label_path = os.path.join(yolo_base_path, "val", "labels")

# Create / Delete / Recreate folders when necessary
if os.path.exists(yolo_base_path):
    shutil.rmtree(yolo_base_path)
for path in [train_img_path, train_label_path, val_img_path, val_label_path]:
    os.makedirs(path, exist_ok=True)

In [17]:
def process_images(images, img_dest, label_dest):
    """
    Processes a list of images, generating corresponding YOLO label files and 
    copying the images to a destination folder.

    For each image, the function:
        1. Checks the image file.
        2. Gets the image size (width and height).
        3. Generates a YOLO label file with normalized bounding box data.
        4. Copies the image to the destination folder.

    Parameters:
    images (list)      : A list of image IDs (without file extension) to be processed.
    img_dest (str)     : The destination directory where processed images will be copied.
    label_dest (str)   : The destination directory where YOLO label files will be saved.

    Returns:
    None : This function performs processing on images and saves the output files.
    
    Example:
    >>> process_images(['image_001', 'image_002'], 'output/images', 'output/labels')
    """
    count = 0
    print("[ ", end="", flush=True)
    for image_id in images:
        
        # Checking image
        img_file = os.path.join(images_path, f"{image_id}.jpg")
        if img_file is None:
            continue
        # Get image size
        with Image.open(img_file) as img:
            img_width, img_height = img.size
        # Create YOLO Labels
        label_file = os.path.join(label_dest, f"{image_id}.txt")
        group = df[df['image_id'] == image_id]
        with open(label_file, "w") as f:
            for _, row in group.iterrows():
                f.write(convert_bbox(row, img_width, img_height) + "\n")
        # Copy image to destination folder
        shutil.copy(img_file, os.path.join(img_dest, os.path.basename(img_file)))
        
        count = count + 1
        if count % (len(images)//10) == 0:
            print("*", end="", flush=True)
    print(" ]")

In [18]:
def verify_labels(img_folder, label_folder):
    """
    Verifies that each image in the given image folder has a corresponding label file 
    in the label folder.

    For each image file in the `img_folder`, the function checks if a corresponding 
    label file (with the same name but `.txt` extension) exists in the `label_folder`.
    If any label files are missing, they are reported.

    Parameters:
    img_folder (str)   : The directory containing the image files.
    label_folder (str) : The directory containing the label files.

    Returns:
    None : This function performs a verification process and prints a summary report.

    Example:
    >>> verify_labels('images/', 'labels/')
    """
    missing = []
    count = 0
    print("[ ", end="", flush=True)
    for img_file in os.listdir(img_folder):
        img_id = os.path.splitext(img_file)[0]
        label_file = os.path.join(label_folder, f"{img_id}.txt")
        if not os.path.exists(label_file):
            missing.append(img_file)
        count = count + 1
        if count % (len(os.listdir(img_folder))//10) == 0:
            print("*", end="", flush=True)
    print(" ] - ", end="", flush=True)
    if missing:
        print("Missing labels for images:", missing)
    else:
        print("All images have corresponding labels.")

In [19]:
# Process and Verify train / val sets
process_images(train_ids, train_img_path, train_label_path)
process_images(val_ids, val_img_path, val_label_path)

verify_labels(train_img_path, train_label_path)
verify_labels(val_img_path, val_label_path)

print("YOLO dataset preparation and verification complete!")

[ ********** ]
[ ********** ]
[ ********** ] - All images have corresponding labels.
[ ********** ] - All images have corresponding labels.
YOLO dataset preparation and verification complete!


## Building the Model

In [20]:
#!nvcc --version

In [21]:
#!pip install --upgrade pip
#!pip uninstall -y torch torchvision torchaudio
#!pip cache purge
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [22]:
# imports
import importlib
import subprocess
import sys
from pathlib import Path

package_name = "ultralytics"

# Check / import ultralytics
if importlib.util.find_spec(package_name) is None:
    subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
    print()
    print("Successfully installed! Please restart Kernal.")
from ultralytics import YOLO
import torch

In [23]:
# Load small YOLOv8 model (pre-trained on COCO)
model = YOLO("yolov8s.pt")

[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8s.pt to 'yolov8s.pt': 100% ━━━━━━━━━━━━ 21.5MB 4.2MB/s 5.2ss 5.2s<0.0s8.0s


In [24]:
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    device = '0'
else:
    device = 'cpu'


Torch version: 2.5.1+cu121
CUDA available: True
Device count: 1
GPU: NVIDIA GeForce RTX 2070 SUPER


In [25]:
file_path = "AutoDriving.ipynb"
absolute_path = os.path.abspath(file_path)
yaml_path = os.path.join(part_1_path, r"dataset.yaml")
absolute_path_dirname = os.path.dirname(absolute_path)
absolute_path = os.path.join(absolute_path_dirname, yaml_path)
yolo_path = os.path.join(part_1_path, r"images_yolo")
yolo_path = os.path.join(absolute_path_dirname, yolo_path)

In [26]:
# Dataset content
dataset_content = f"""# dataset.yaml
path: {yolo_path}
train: train/images
val: val/images

nc: 11
names:
  0: pickup_truck
  1: car
  2: articulated_truck
  3: bus
  4: motorized_vehicle
  5: work_van
  6: single_unit_truck
  7: pedestrian
  8: bicycle
  9: non-motorized_vehicle
  10: motorcycle
"""

# Check if the file exists, if not, create it
if not os.path.exists(absolute_path):
    with open(absolute_path, 'w') as file:
        file.write(dataset_content)
    print(f"dataset.yaml has been created with the provided content.")
else:
    print(f"dataset.yaml already exists.")

dataset.yaml has been created with the provided content.


In [27]:
history = model.train(
    data=absolute_path, # NOTE: use absolute path to dataset.yaml file
    epochs=30,
    imgsz=640, # resize images to 
    batch=4,
    name="yolov8_autodriving", # folder to save result
    patience = 20,
    lr0=0.001,
    device=device,
    optimizer="AdamW",
    plots=True, # training plots sent to results folder
)

New https://pypi.org/project/ultralytics/8.3.240 available  Update with 'pip install -U ultralytics'
Ultralytics 8.3.197  Python-3.9.4 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 2070 SUPER, 8192MiB)
[34m[1mengine\trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=4, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=c:\Users\ajaih\OneDrive\Desktop\AutoDriving\AutoDriving\Part 1\dataset.yaml, degrees=0.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=30, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.001, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolov8s.pt, momentum=0.937,

In [28]:
# Evaulate model
metrics = model.val()
print(metrics)

Ultralytics 8.3.197  Python-3.9.4 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 2070 SUPER, 8192MiB)
Model summary (fused): 72 layers, 11,129,841 parameters, 0 gradients, 28.5 GFLOPs
[34m[1mval: [0mFast image access  (ping: 0.10.0 ms, read: 221.569.5 MB/s, size: 25.7 KB)
[K[34m[1mval: [0mScanning C:\Users\ajaih\OneDrive\Desktop\AutoDriving\AutoDriving\Part 1\images_yolo\val\labels.cache... 1126 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 1126/1126  0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 282/282 16.4it/s 17.2s0.1s
                   all       1126       3468      0.609       0.66      0.657      0.486
          pickup_truck        300        398       0.73      0.857      0.889      0.733
                   car        937       2308       0.81      0.922      0.929      0.697
     articulated_truck         81         89      0.626      0.854      0.839      0.665
                   bus         91 

In [57]:
# Test model

results = model.predict(r"AutoDriving\Part 1\images\00003682.jpg")
results[0].show()


image 1/1 c:\Users\ajaih\OneDrive\Desktop\AutoDriving\AutoDriving\Part 1\images\00003682.jpg: 448x640 1 car, 1 articulated_truck, 9.6ms
Speed: 2.0ms preprocess, 9.6ms inference, 1.5ms postprocess per image at shape (1, 3, 448, 640)


# Part 2

## Imports

In [30]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

pd.set_option("display.float_format", lambda x: f"{x:,.3f}")

plt.style.use("seaborn-v0_8")
sns.set_palette("Set2")

In [31]:
df = pd.read_csv(r"AutoDriving/Part 2/Tesla - Deaths.csv")

In [32]:
display(df.head(1))

Unnamed: 0,Case #,Year,Date,Country,State,Description,Deaths,Tesla driver,Tesla occupant,Other vehicle,Cyclists/ Peds,TSLA+cycl / peds,Model,Autopilot claimed,Verified Tesla Autopilot Deaths,Verified Tesla Autopilot Deaths + All Deaths Reported to NHTSA SGO,Unnamed: 16,Unnamed: 17,Source,Note,Deceased 1,Deceased 2,Deceased 3,Deceased 4
0,294.0,2022.0,1/17/2023,USA,CA,Tesla crashes into back of semi,1.0,1,-,-,-,1,-,-,-,-,https://web.archive.org/web/20221222203930/ht...,https://web.archive.org/web/20221222203930/ht...,https://web.archive.org/web/20230118162813/ht...,,,,,


In [33]:
print(df.columns)

Index(['Case #', 'Year', 'Date', ' Country ', ' State ', ' Description ',
       ' Deaths ', ' Tesla driver ', ' Tesla occupant ', ' Other vehicle ',
       ' Cyclists/ Peds ', ' TSLA+cycl / peds ', ' Model ',
       ' Autopilot claimed ', ' Verified Tesla Autopilot Deaths ',
       ' Verified Tesla Autopilot Deaths + All Deaths Reported to NHTSA SGO ',
       'Unnamed: 16', 'Unnamed: 17', ' Source ', ' Note ', ' Deceased 1 ',
       ' Deceased 2 ', ' Deceased 3 ', ' Deceased 4 '],
      dtype='object')


In [34]:
states = df[' State '].dropna().str.strip()
states = states[states != '-'].unique().tolist()
print(states)

['CA', 'WA', 'GA', 'MO', 'AR', 'IL', 'FL', 'SC', 'MD', 'NY', 'MA', 'ME', 'UT', 'PA', 'HA', 'AL', 'MI', 'NV', 'OR', 'AZ', 'NJ', 'CO', 'NC', 'ID', 'OH', 'IA', 'VA', 'TX', 'IN', 'DE', 'NH', 'TN', 'HI']


In [35]:
columns = [
    'case', 'year', 'date', 'country', 'country/state',
    'description', 'deaths', 'driver', 'occupants', 'otherVehicles',
    'cpCollisions', 'tcpCollisions', 'model', 'autopilotClaim', 'autopilotDeaths',
    'autopilotDeaths_NHTSASGO', 'source_1', 'source_2', 'source_3', 'note',
    'deceased_1', 'deceased_2', 'deceased_3', 'deceased_4'
]
df.columns = columns

In [36]:
df = df.drop(df.index[294:])
df.shape

(294, 24)

In [37]:
df.replace('-', np.nan, inplace=True)
df.replace(' -', np.nan, inplace=True)
df.replace('- ', np.nan, inplace=True)
df.replace(' - ', np.nan, inplace=True)

In [38]:
df['country/state'].fillna(df['country'],inplace=True)
df['driver'].fillna(0,inplace=True)
df['occupants'].fillna(0,inplace=True)
df['otherVehicles'].fillna(0,inplace=True)
df['cpCollisions'].fillna(0,inplace=True)
df['tcpCollisions'].fillna(0,inplace=True)
df['model'].fillna('Any',inplace=True)
df['autopilotClaim'].fillna(0,inplace=True)
df['autopilotDeaths'].fillna(0,inplace=True)
df['autopilotDeaths_NHTSASGO'].fillna(0,inplace=True)
df['source_1'].fillna(df['source_3'],inplace=True)
df['source_2'].fillna(df['source_3'],inplace=True)
df['note'].fillna("None",inplace=True)
df['deceased_1'].fillna("N/A",inplace=True)
df['deceased_2'].fillna("N/A",inplace=True)
df['deceased_3'].fillna("N/A",inplace=True)
df['deceased_4'].fillna("N/A",inplace=True)

In [39]:
df = df.astype({
    'case': "uint16",
    'year': "uint16",
    'country': "string",
    'country/state': "string",
    'description': "string",
    'deaths': "uint8",
    'driver': "uint8",
    'occupants': "uint8",
    'otherVehicles': "uint8",
    'cpCollisions': "uint8",
    'tcpCollisions': "uint8",
    'model': "string",
    'autopilotClaim': "uint8",
    'autopilotDeaths': "uint8",
    'autopilotDeaths_NHTSASGO': "uint8",
    'note': "string",
    'deceased_1': "string",
    'deceased_2': "string",
    'deceased_3': "string",
    'deceased_4': "string"
})
df.dtypes

case                                uint16
year                                uint16
date                                object
country                     string[python]
country/state               string[python]
description                 string[python]
deaths                               uint8
driver                               uint8
occupants                            uint8
otherVehicles                        uint8
cpCollisions                         uint8
tcpCollisions                        uint8
model                       string[python]
autopilotClaim                       uint8
autopilotDeaths                      uint8
autopilotDeaths_NHTSASGO             uint8
source_1                            object
source_2                            object
source_3                            object
note                        string[python]
deceased_1                  string[python]
deceased_2                  string[python]
deceased_3                  string[python]
deceased_4 

In [40]:
df['date'] = pd.to_datetime(df['date'])

In [41]:
for col in df.columns:
    if df[col].dtype == 'string':
        df[col] = df[col].str.strip()

In [42]:
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

In [43]:
columns_reordered = [
    'case', 'year', 'month', 'day', 'date', 'country', 'country/state',
    'description', 'deaths', 'driver', 'occupants', 'otherVehicles',
    'cpCollisions', 'tcpCollisions', 'model', 'autopilotClaim', 'autopilotDeaths',
    'autopilotDeaths_NHTSASGO', 'source_1', 'source_2', 'source_3', 'note',
    'deceased_1', 'deceased_2', 'deceased_3', 'deceased_4'
]
df = df[columns_reordered]

In [44]:
df_sorted = df.sort_values(by='case', ascending=True)
df_sorted.set_index('case', inplace=True)

In [None]:
numeric_columns = [
    "case", "year", "deaths", "driver", "occupants", "otherVehicles", "cpCollisions", "tcpCollisions", "autopilotClaim", "autopilotDeaths", "autopilotDeaths_NHTSASGO"
]

corr = df[numeric_columns].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    vmin=-1,
    vmax=1
);

In [61]:
plt.figure(figsize=(10, 8))
sns.histplot(
    df['date'],
    bins=30
)
plt.title("Count of cases by year")
plt.show();

<Figure size 1000x800 with 2 Axes>

<Figure size 1000x800 with 1 Axes>

In [47]:
plt.figure(figsize=(10, 8))
sns.histplot(
    data=df[ ~df["country"].isin(states) ],
    x='country'
)
plt.title('Count of cases by country')
plt.xticks(rotation=90)
plt.show();

<Figure size 1000x800 with 1 Axes>

In [48]:
plt.figure(figsize=(10, 8))
sns.histplot(
    data=df[ df["country"] == "USA" ],
    x='country/state',
)
plt.xticks(rotation=90)
plt.title("Count of cases by US state")
plt.show();

<Figure size 1000x800 with 1 Axes>

In [49]:
plt.figure(figsize=(10, 8))
sns.histplot(
    data=df,
    x='deaths',
    bins=8,
    kde=True
)
plt.title("Count of deaths per case")
plt.xlabel("Number of deaths per case")
plt.xticks([1,2,3,4])
plt.show();

<Figure size 1000x800 with 1 Axes>

In [50]:
plt.figure(figsize=(10, 8))
sns.countplot(
    data=df,
    x='driver',
)
plt.title('Count of occupant deaths vs. driver deaths')
plt.xticks([0,1],['non-driver death','driver death'])
plt.show();

<Figure size 1000x800 with 1 Axes>

In [51]:
plt.figure(figsize=(10, 8))
sns.histplot(
    data=df,
    x='occupants',
    kde=True
)
plt.title('Count of occupant deaths')
plt.xticks([0,1,2,3])
plt.show();

<Figure size 1000x800 with 1 Axes>

In [52]:
plt.figure(figsize=(10, 8))
sns.histplot(
    data=df,
    x='tcpCollisions',
    kde=True
)
plt.title('Number of collisions with Tesla')
plt.xlabel("Number of cars/pedestrian/cyclists involved in accident")
plt.xticks([0,1,2,3,4])
plt.show();

<Figure size 1000x800 with 1 Axes>

In [53]:
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

sns.histplot(
    data=df[df['driver'] == 0],
    x='tcpCollisions',
    ax=axes[0],
    binwidth=1,
    kde=True
)
axes[0].set_xlabel('Number of cars/pedestrian/cyclists involved in accident')
axes[0].set_xticks([0,1,2,3])
axes[0].set_title('Number of collisions with Tesla (driver was not killed)')

sns.histplot(
    data=df[df['driver'] == 1],
    x='tcpCollisions',
    ax=axes[1],
    binwidth=1,
    kde=True
)
axes[1].set_xlabel('Number of cars/pedestrian/cyclists involved in accident')
axes[1].set_xticks([0,1,2,3,4])
axes[1].set_title('Number of collisions with Tesla (driver was killed)')


plt.tight_layout()
plt.show();

<Figure size 2000x800 with 2 Axes>

In [54]:
plt.figure(figsize=(10, 8))
sns.histplot(
    data=df,
    x='otherVehicles',
    kde=True
)
plt.xlabel("Other vehicles invovlved per accident")
plt.title("Count of vehicles involved per accident")
plt.xticks([0,1,2,3,4])
plt.show();

<Figure size 1000x800 with 1 Axes>