In [None]:
##########################################################################
# Author: Steve Kuruvilla, Shrasth Kumar                                 #
# Description: Main Program                                              #
##########################################################################

import src.filter_dataset as filter_dataset

occurance_dataset = filter_dataset.load_and_filter_dataset('.\\datasets\\OCCURRENCE_PUBLIC.csv', 5000, 'Fre')
injuries_dataset = filter_dataset.load_and_filter_dataset('.\\datasets\\INJURIES_PUBLIC.csv', 10)
train_dataset = filter_dataset.load_and_filter_dataset('.\\datasets\\TRAIN_PUBLIC.csv', 1000, 'Fre')
components_dataset = filter_dataset.load_and_filter_dataset('.\\datasets\\COMPONENTS_PUBLIC.csv', 1000, 'Fre')

occurance_dataset.to_csv('.\\datasets\\filtered\\OCCURRENCE_PUBLIC.csv', index=False)
injuries_dataset.to_csv('.\\datasets\\filtered\\INJURIES_PUBLIC.csv', index=False)
train_dataset.to_csv('.\\datasets\\filtered\\TRAIN_PUBLIC.csv', index=False)
components_dataset.to_csv('.\\datasets\\filtered\\COMPONENTS_PUBLIC.csv', index=False)

import pandas as pd

df = pd.read_csv('.\\datasets\\filtered\\INJURIES_PUBLIC.csv')

# Define categories
categories = {
    "Employee Injuries": ["Offtrain_EmployeeFatal", "Offtrain_EmployeeSerious", "Offtrain_EmployeeMinor"],
    "RS Passenger Injuries": ["Offtrain_RS_PassengerFatal", "Offtrain_RS_PassengerSerious", "Offtrain_RS_PassengerMinor"],
    "Vehicle Operator Injuries": ["Offtrain_VehicleOperatorFatal", "Offtrain_VehicleOperatorSerious", "Offtrain_VehicleOperatorMinor"],
    "Vehicle Passenger Injuries": ["Offtrain_VehiclePassengerFatal", "Offtrain_VehiclePassengerSerious", "Offtrain_VehiclePassengerMinor"],
    "Pedestrian Injuries": ["Offtrain_PedestrianFatal", "Offtrain_PedestrianSerious", "Offtrain_PedestrianMinor"],
    "Trespasser Injuries": ["Offtrain_TrespasserFatal", "Offtrain_TrespasserSerious", "Offtrain_TrespasserMinor"],
}

# Group and aggregate
summary = {}
total_fatalities = 0
total_serious = 0
total_minor = 0

for category, cols in categories.items():
    fatalities = df[cols[0]].sum()
    serious_injuries = df[cols[1]].sum()
    minor_injuries = df[cols[2]].sum()
    total = fatalities + serious_injuries + minor_injuries
    
    # Update total counts
    total_fatalities += fatalities
    total_serious += serious_injuries
    total_minor += minor_injuries

    summary[category] = {
        "Fatalities": fatalities,
        "Serious Injuries": serious_injuries,
        "Minor Injuries": minor_injuries,
        "Total Cases": total,
    }

# Adding total row
summary["Total"] = {
    "Fatalities": total_fatalities,
    "Serious Injuries": total_serious,
    "Minor Injuries": total_minor,
    "Total Cases": total_fatalities + total_serious + total_minor,
}

# Convert to DataFrame
summary_df = pd.DataFrame.from_dict(summary, orient="index")

summary_df.head(30)

Unnamed: 0,Fatalities,Serious Injuries,Minor Injuries,Total Cases
Employee Injuries,122,130,1834,2086
RS Passenger Injuries,26,27,785,838
Vehicle Operator Injuries,1101,842,3299,5242
Vehicle Passenger Injuries,171,272,492,935
Pedestrian Injuries,305,135,113,553
Trespasser Injuries,2145,783,681,3609
Total,3870,2189,7204,13263


In [22]:
import pandas as pd

df = pd.read_csv('.\\datasets\\filtered\\INJURIES_PUBLIC.csv')

# List of all injury columns
fatal_cols = [col for col in df.columns if "Fatal" in col]
serious_cols = [col for col in df.columns if "Serious" in col]
minor_cols = [col for col in df.columns if "Minor" in col]

# Calculate the totals for each row
df["Fatalities"] = df[fatal_cols].sum(axis=1)
df["Serious Injuries"] = df[serious_cols].sum(axis=1)
df["Minor Injuries"] = df[minor_cols].sum(axis=1)
df["Total Cases"] = df["Fatalities"] + df["Serious Injuries"] + df["Minor Injuries"]

# Select required columns and sort by Total Cases in descending order
result_df = df[["OccID", "OccNo", "Fatalities", "Serious Injuries", "Minor Injuries", "Total Cases"]] \
    .sort_values(by="Total Cases", ascending=False)

# Display results

result_df.head(20)

Unnamed: 0,OccID,OccNo,Fatalities,Serious Injuries,Minor Injuries,Total Cases
22604,31774,R86Q0406,0,0,194,194
22627,31751,R86C0490,46,0,142,188
13074,41702,R91H0026,0,8,134,142
13075,41702,R91H0026,0,8,134,142
14806,39945,R90H0627,0,0,98,98
1915,118407,R13D0054,94,0,0,94
18436,36158,R88T2166,0,0,84,84
1888,118587,R13T0192,12,20,40,72
16274,38429,R89V1867,0,0,72,72
1889,118587,R13T0192,12,20,40,72


In [24]:

top_dataset = pd.DataFrame()
top_dataset['OccID'] = result_df['OccID'].head(30).to_string(index=False).split('\n')
top_dataset['OccNo'] = result_df['OccNo'].head(30).to_string(index=False).split('\n')

top_dataset.head(20)

Unnamed: 0,OccID,OccNo
0,31774,R86Q0406
1,31751,R86C0490
2,41702,R91H0026
3,41702,R91H0026
4,39945,R90H0627
5,118407,R13D0054
6,36158,R88T2166
7,118587,R13T0192
8,38429,R89V1867
9,118587,R13T0192


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.datasets import load_iris
import time
from tabulate import tabulate

def evaluate_pipelines(X, y):

    title = "Model Performance"

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    pipelines = [
        ("RandomForest", RandomForestClassifier(), {}), 
        ("LogisticRegression", LogisticRegression(), {}),
        ("SVC", SVC(), {}),
        ("KNeighbors", KNeighborsClassifier(), {}),
        ("DecisionTree", DecisionTreeClassifier(), {})
    ]

    results = []
    best_metric = 0

    for pipeline_name, model, param_grid in pipelines:
        start_time = time.time()

        grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        metric = accuracy_score(y_test, y_pred)
        end_time = time.time()
        duration = time.strftime("%H:%M:%S", time.gmtime(end_time - start_time))

        best_metric = max(best_metric, metric)

        results.append({
            "PIPELINE": pipeline_name,
            "DURATION": duration,
            "METRIC": metric,
            "BEST PARAMS":  "DEFAULT" if grid_search.best_params_ == {} else grid_search.best_params_
        })

    df = pd.DataFrame(results)
    table = tabulate(df, headers="keys", tablefmt="grid", floatfmt=".6f", showindex=False)
    title_line = title.center(len(table.splitlines()[0]))
    final_output = title_line + "\n" + table + "\n \n" + f"Best Metric: {best_metric}"
    return final_output

iris = load_iris()
X, y = iris.data, iris.target

table = evaluate_pipelines(X, y)
print(table)

                      Model Performance                       
+--------------------+------------+----------+---------------+
| PIPELINE           | DURATION   |   METRIC | BEST PARAMS   |
| RandomForest       | 00:00:00   | 1.000000 | DEFAULT       |
+--------------------+------------+----------+---------------+
| LogisticRegression | 00:00:00   | 1.000000 | DEFAULT       |
+--------------------+------------+----------+---------------+
| SVC                | 00:00:00   | 1.000000 | DEFAULT       |
+--------------------+------------+----------+---------------+
| KNeighbors         | 00:00:00   | 1.000000 | DEFAULT       |
+--------------------+------------+----------+---------------+
| DecisionTree       | 00:00:00   | 1.000000 | DEFAULT       |
+--------------------+------------+----------+---------------+
 
Best Metric: 1.0
