In [61]:
from geopy.geocoders import Nominatim
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import pandas as pd
import re
import time


In [62]:
# Load the cleaned bike-accident records and preview the first rows to confirm the read worked.
# NOTE(review): this is an absolute path on one machine; the notebook cannot run elsewhere
# until the location is made configurable.
bike_accidents = pd.read_csv(
    filepath_or_buffer=r"C:\Users\whift\PycharmProjects\Traffice_Collisions\cleaned_bike_accidents.csv"
)
bike_accidents.head(n=5)

Unnamed: 0,report_id,date,person_role,person_injury_lvl,person_veh_type,veh_type,veh_make,veh_model,police_beat,address_no_primary,...,address_pd_intersecting,address_name_intersecting,address_sfx_intersecting,violation_section,violation_type,charge_desc,injured,killed,hit_run_lvl,address
0,20200140,2020-01-06,BICYCLIST,SEVERE,,BICYCLE,GIANNINI,,531,1900.0,...,,,,22350,VC,UNSAFE SPEED (BASIC SPEED LAW) (I),1,0,,"1900 PERSHING DRIVE, San Diego, CA"
1,20200152,2020-01-07,BICYCLIST,PAIN,,BICYCLE,,,521,1400.0,...,,,,22517,VC,UNSAFE OPENING OF VEH DOOR (I),1,0,,"1400 BROADWAY, San Diego, CA"
2,20200153,2020-01-07,BICYCLIST,PAIN,,BICYCLE,,,243,,...,,PADGETT,STREET,21801A,VC,RIGHT-OF-WAY:MAKING LEFT OR U-TURN (I),1,0,,"MIRAMAR ROAD, San Diego, CA"
3,20200237,2020-01-10,BICYCLIST,VISABLE,,BICYCLE,FUJI ROBBT JR.,,523,700.0,...,,,,21657,VC,LANE USAGE:DISOBEY DESIGNATED TRAFFIC DIRECTIO...,1,0,,"700 05TH AVENUE, San Diego, CA"
4,20200338,2020-01-13,BICYCLIST,PAIN,,BICYCLE,GIANNINI,,614,4700.0,...,,,,21801A,VC,RIGHT-OF-WAY:MAKING LEFT OR U-TURN (I),1,0,,"4700 WEST POINT LOMA BOULEVARD, San Diego, CA"


In [63]:
# Preprocessing: keep only the injury level and the address, as an independent copy.
focus = bike_accidents[['person_injury_lvl', 'address']].copy()

# Encode the injury level in a single chain:
#   1. missing values become 'Not Reported'
#   2. labels become ordinal codes 0-5
#   3. codes 0-2 collapse to class 0; anything else (codes 3-5, or a label that is
#      absent from the mapping and therefore NaN) becomes class 1
focus['person_injury_lvl'] = (
    focus['person_injury_lvl']
    .fillna('Not Reported')
    .map({'Not Reported':0,'VISABLE':1,'PAIN':2,'MINOR':3,'SEVERE':4,'SERIOUS':5})
    .isin([0, 1, 2])
    .map({True: 0, False: 1})
)

# Sanity check: class counts (NaN would be listed too)
focus['person_injury_lvl'].value_counts(dropna=False)




person_injury_lvl
Low Injury     747
High Injury    117
Name: count, dtype: int64

In [64]:
#Properly Format the Addresses

def clean_address_column(df, column='address'):
    """Normalise the street addresses in ``df[column]`` so they geocode more reliably.

    Every non-missing address goes through these steps, in order:

    1. Leading zeros are stripped from ordinal street names ("05TH" -> "5TH").
    2. Curb-line offset phrases such as "E FEET OF THE CURB LINE OF" are removed
       ("CURB", "CURBLINE" and "CURB LINE" are all recognised).
    3. Any remaining "<number> <words> OF " prefix is removed.
    4. Runs of whitespace are collapsed and the ends trimmed.
    5. The standalone abbreviation "EXT ST" is replaced with "STREET".

    Missing values are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the address column. It is modified in place.
    column : str, default 'address'
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in (not a copy).
    """
    def clean_address(addr):
        if pd.isna(addr):
            return addr

        # Remove leading zeros in ordinal street names
        addr = re.sub(r'\b0+(\d+)(ST|ND|RD|TH)\b', r'\1\2', addr, flags=re.IGNORECASE)

        # Remove vague directions like "E FEET OF THE CURB LINE OF".
        # (?:\s*LINE)? accepts "CURB", "CURBLINE" and the two-word "CURB LINE".
        addr = re.sub(
            r'\b(?:\d+\s*)?(?:[NEWS]\s+)?(?:FEET|FOOT)\s+OF\s+THE\s+CURB(?:\s*LINE)?\s+OF\s+',
            '',
            addr,
            flags=re.IGNORECASE,
        )
        addr = re.sub(r'\b\d+\s+(?:[A-Z]+\s+)*OF\s+', '', addr, flags=re.IGNORECASE)

        # Remove stray double spaces and trim
        addr = re.sub(r'\s{2,}', ' ', addr).strip()

        # Normalize abbreviations (EXT ST → STREET). The word boundaries keep the
        # replacement from firing inside longer words ("EXT STREET", "NEXT STREET").
        addr = re.sub(r'\bEXT ST\b', 'STREET', addr)

        return addr

    # Write the cleaned values back into the caller's frame.
    df[column] = df[column].apply(clean_address)
    return df

# clean_address_column writes the cleaned values back into the frame it is given and
# returns that same frame, so format_focus and focus are the same DataFrame object here.
format_focus = clean_address_column(focus)


In [65]:
# Preview the first rows after address cleaning
format_focus.head()

Unnamed: 0,person_injury_lvl,address
0,High Injury,"1900 PERSHING DRIVE, San Diego, CA"
1,Low Injury,"1400 BROADWAY, San Diego, CA"
2,Low Injury,"MIRAMAR ROAD, San Diego, CA"
3,Low Injury,"700 5TH AVENUE, San Diego, CA"
4,Low Injury,"4700 WEST POINT LOMA BOULEVARD, San Diego, CA"


In [None]:
#Future fix: geocoding with retry logic (up to 3 attempts per address)


# Register tqdm's progress_apply on pandas objects so the geocoding run shows a progress bar.
tqdm.pandas()

# geopy's default request timeout is 1 second; a slow reply then raises and the address
# ends up recorded as not found. Give the Nominatim service more time to answer.
geolocator = Nominatim(user_agent="geoapi", timeout=10)

# address -> pd.Series([latitude, longitude]); lets repeated addresses skip the network call.
cache = {}

def cached_geocode(address):
    """Geocode one address, with caching and up to three attempts.

    Reads the module-level ``geolocator`` (used for the lookup) and ``cache``
    (a dict mapping address -> result, filled in as addresses are resolved).

    Parameters
    ----------
    address : str or missing value
        The address to look up.

    Returns
    -------
    pandas.Series
        ``[latitude, longitude]`` on success, ``[None, None]`` when the address
        is missing, blank, not a string, not found, or every attempt raised.

    Notes
    -----
    * Only a raised error triggers another attempt. When the geocoder answers
      with no match, asking again would only repeat the 1.5 s wait, so the loop
      stops at once.
    * ``except Exception`` (rather than a bare ``except``) lets
      ``KeyboardInterrupt`` through, so the long run can still be interrupted.
    """
    # Missing, blank or non-text input: nothing to look up, no network call.
    if not isinstance(address, str) or not address.strip():
        return pd.Series([None, None])

    if address in cache:
        return cache[address]

    result = pd.Series([None, None])
    for _ in range(3):  # Retry logic: at most three attempts per address
        try:
            time.sleep(1.5)  # pause before every request
            location = geolocator.geocode(address)
        except Exception:
            # The request failed; try again.
            continue
        if location:
            result = pd.Series([location.latitude, location.longitude])
        # Either a hit or a definitive "no match": stop retrying.
        break

    cache[address] = result
    return result

# Geocode every address (with a progress bar) and store the two values of each result
# as the 'latitude' and 'longitude' columns.
format_focus[['latitude', 'longitude']] = format_focus['address'].progress_apply(cached_geocode)


 21%|██        | 179/864 [07:37<50:15,  4.40s/it]  

In [None]:
#Now convert addresses into latitude and longitude

tqdm.pandas()  # for progress bar, takes on average 20min to run

# address -> pd.Series([latitude, longitude]); lets repeated addresses skip the network call.
cache = {}
# geopy's default request timeout is 1 second; a slow reply then raises and the address
# ends up recorded as not found. Give the Nominatim service more time to answer.
geolocator = Nominatim(user_agent="converter", timeout=10)

def cached_geocode(address):
    """Geocode one address (single attempt), caching the outcome.

    Reads the module-level ``geolocator`` (used for the lookup) and ``cache``
    (a dict mapping address -> result).

    Parameters
    ----------
    address : str or missing value
        The address to look up.

    Returns
    -------
    pandas.Series
        ``[latitude, longitude]`` on success, ``[None, None]`` when the address
        is missing, blank, not a string, not found, or the lookup raised.

    Notes
    -----
    ``except Exception`` (rather than a bare ``except``) lets
    ``KeyboardInterrupt`` through, so the long run can still be interrupted.
    """
    # Missing, blank or non-text input: nothing to look up, so skip the wait and the request.
    if not isinstance(address, str) or not address.strip():
        return pd.Series([None, None])

    if address in cache:
        return cache[address]

    try:
        time.sleep(1)  # pause before every request
        location = geolocator.geocode(address)
    except Exception:
        # A failed request is treated the same as "no match".
        location = None

    if location:
        result = pd.Series([location.latitude, location.longitude])
    else:
        result = pd.Series([None, None])
    cache[address] = result
    return result

# Run the geocoder over every address (progress bar via tqdm) and keep the coordinates.
format_focus[['latitude', 'longitude']] = format_focus['address'].progress_apply(cached_geocode)
# From here on the coordinates stand in for the address text, so drop that column.
format_focus = format_focus.drop('address', axis='columns')




In [17]:
# The geocoding cell above already drops 'address'; errors='ignore' keeps this cell from
# raising KeyError on a fresh Run All or when it is re-run.
format_focus = format_focus.drop(columns=['address'], errors='ignore')
format_focus.head()

Unnamed: 0,person_injury_lvl,latitude,longitude
0,4,32.72533,-117.142089
1,2,32.716012,-117.151734
2,2,32.889956,-117.142683
3,1,32.625058,-117.082697
4,2,32.750665,-117.239887


In [None]:
#Count the addresses that could not be geocoded (rows with a missing longitude)
format_focus['longitude'].isna().sum()
#Between 41 and 49 addresses fail to geocode (the count changes from run to run); for the sake of time these rows are ignored.
#The next cell writes the failed addresses to a CSV file so they can be fixed later.

In [None]:
#Creates the csv file for addresses that were not read
# Rows where neither coordinate could be resolved.
missing_coords = format_focus['latitude'].isna() & format_focus['longitude'].isna()
# format_focus has had its 'address' column dropped by this point, so selecting
# 'address' from it raises KeyError. Take the failed rows from `focus` instead: it
# still holds the address and uses the same row labels.
na_addresses = focus.loc[missing_coords[missing_coords].index]
print(na_addresses[['address']])

na_addresses.to_csv('format_failed_geocodes.csv', index=False)



In [20]:
#Remove every row that still contains a missing value
format_focus = format_focus.dropna(axis=0, how='any')
#The remaining count of missing longitudes should be 0
format_focus['longitude'].isnull().sum()

np.int64(0)

In [58]:
#Class distribution of the target, most frequent class first
format_focus['person_injury_lvl'].value_counts().sort_values(ascending=False)

person_injury_lvl
1    448
2    201
4     50
3     50
0     20
5      3
Name: count, dtype: int64

In [52]:
#Target: the injury level; features: every remaining column
y = format_focus['person_injury_lvl']
X = format_focus.drop('person_injury_lvl', axis='columns')

#Hold out 25% of the rows for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


In [53]:
#ML preprocessing: learn the min/max of each feature from the training split only,
#then apply that same scaling to both splits
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

 


In [55]:
#Oversample the training split only; the test split is not touched by this cell.
#k_neighbors=1 - presumably chosen because the rarest class has very few rows; TODO confirm.
smote = SMOTE(random_state=42, k_neighbors = 1)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [56]:
#Hyperparameter tuning: choose the number of neighbours with a 2-fold cross-validated grid search
#NOTE(review): the folds are cut from the SMOTE-resampled training set, so synthetic points
#may land in both folds and the CV scores could be optimistic - confirm before relying on them.

param_grid = {'n_neighbors': list(range(1, 12, 2))}  # odd values 1..11
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=2,
)
grid.fit(X_train, y_train)

#Estimator refitted with the best-scoring setting
best_knn = grid.best_estimator_



In [57]:
#Predict on the held-out split, then report per-class metrics and the confusion matrix
y_pred = best_knn.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred, zero_division=0))
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.63      0.43      0.51       111
           2       0.36      0.37      0.37        51
           3       0.09      0.23      0.13        13
           4       0.00      0.00      0.00        12
           5       0.00      0.00      0.00         1

    accuracy                           0.36       193
   macro avg       0.18      0.17      0.17       193
weighted avg       0.46      0.36      0.40       193

[[ 0  1  3  1  0  0]
 [ 9 48 25 18 10  1]
 [ 3 17 19  8  4  0]
 [ 1  5  1  3  3  0]
 [ 0  4  5  3  0  0]
 [ 0  1  0  0  0  0]]
