In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import psycopg2
from geopy.distance import distance as geo_distance
import geopandas as gpd
# import shapely
import pdb
from tqdm import tqdm
# import time.strftime
import matplotlib as mpl
import xgboost as xgb
# import seaborn as sns
mpl.rcParams['axes.linewidth'] = 3
pd.set_option('display.max_columns', 500)

# Analysis idea:

Loop over each intersection. Extract the number of crashes within a certain radius. Calculate the number of crashes per year for each unique intersection. Use a decision tree with a Gini index on just the num-legs and angle data, or some other simple model. As we gather more quality feature data, more sophisticated methods can replace the decision tree. Use the standard deviation of the Poisson distribution to calculate the confidence interval.

In [None]:
# Connection settings come from the environment so credentials do not have to be
# edited into the notebook; the fallbacks are the values previously hard-coded here.
POSTGRES_DB = os.environ.get('POSTGRES_DB', 'rws')
POSTGRES_PASSWORD = os.environ.get('POSTGRES_PASSWORD', 'ug_password')
POSTGRES_USER = os.environ.get('POSTGRES_USER', 'ug_username')
POSTGRES_HOST = os.environ.get('POSTGRES_HOST', 'localhost')
POSTGRES_PORT = int(os.environ.get('POSTGRES_PORT', '5433'))
CURRENT_DIR = os.getcwd()

# Keyword arguments instead of a hand-built DSN string: values containing spaces
# or quotes (e.g. a password) would otherwise corrupt the DSN.
conn = psycopg2.connect(
    host=POSTGRES_HOST,
    dbname=POSTGRES_DB,
    user=POSTGRES_USER,
    password=POSTGRES_PASSWORD,
    port=POSTGRES_PORT,
)


In [None]:
sql_full = f"""
SELECT 
  ni.node_id,
  ni.point3857,
  ni.lat,
  ni.long,
  ni.legs,
  int_feat.way_lines,
  int_feat.ramp_roads,
  int_feat.bikepaths,
  int_feat.footways,
  int_feat.oneways,
  int_feat.maxspeed,
  int_feat.surface_types,
  int_feat.aadt,
  int_feat.min_angle,
  int_feat.second_min_angle,
  sum(cast(
  case
        when wfv.lanes is null then '0'
        when wfv.lanes = '' then '0'
        else wfv.lanes
    end
    as int
  )) as num_lanes,
 mode() WITHIN GROUP (ORDER BY wfv.highway) AS  highway_type
from 
  node_intersections ni 
inner join 
  intersection_features int_feat  on int_feat.node_id=ni.node_id 
 join 
  curated_way_feature_view wfv on wfv.node_id=ni.node_id  
where
  int_feat.junction_type=1 and 
  not ni.includes_railway and 
  not ni.includes_bikepath and 
  not ni.includes_footway 
group by 
  ni.node_id,
  ni.point3857,
  ni.lat,
  ni.long,
  ni.legs,
  int_feat.way_lines,
  int_feat.ramp_roads,
  int_feat.bikepaths,
  int_feat.footways,
  int_feat.oneways,
  int_feat.maxspeed,
  int_feat.surface_types,
  int_feat.aadt,
  int_feat.min_angle,
  int_feat.second_min_angle;
"""

df = gpd.read_postgis(sql_full, conn, geom_col="point3857")

In [None]:
df['min_angle'].describe()

In [None]:
# Queries for the raw inputs: every DC crash record (plus a WKT rendering of its
# point as t_point) and the OSM road geometries.
sql_crashes = """SELECT *,ST_AsText(dc.point) as t_point from crashes.dc_indexed as dc """
sql_roads = """SELECT * from planet_osm_roads"""

# Load both tables as GeoDataFrames, using the named column as the geometry.
# NOTE(review): df_roads is not referenced again in the visible part of this
# notebook — confirm it is still needed before paying for the full-table read.
df_crashes = gpd.read_postgis(sql_crashes, con=conn, geom_col="point")
df_roads = gpd.read_postgis(sql_roads, con=conn, geom_col="way")

# The above code pulls data from the postGIS database running in docker

In [None]:
df_crashes.head()

# Format datatypes and define exposure window
The crash data is collected from various sources. Looking at the data, collection appears fairly uniform from 2009 to 2022, so we use this time frame as our exposure time (the period over which crashes were being recorded in DC) and assume that all intersection crashes were accurately recorded during it.

In [None]:
print(list(df_crashes.columns))

# Injury / fatality count columns that must be integers for the severity masks.
injury_count_columns = (
    "majorinjuries_bicyclist",
    "majorinjuries_driver",
    "majorinjuries_pedestrian",
    "majorinjuriespassenger",
    "fatal_driver",
    "fatal_pedestrian",
    "fatalpassenger",
    "fatal_bicyclist",
)
new_dtypes = {column: int for column in injury_count_columns}
df_crashes = df_crashes.astype(new_dtypes)

# The text after '+' is parsed with %f (fractional seconds).
# NOTE(review): presumably that suffix is really a UTC offset such as "+00" — confirm.
report_date_format = '%Y/%m/%d %H:%M:%S+%f'
df_crashes['reportdate'] = pd.to_datetime(df_crashes['reportdate'], format=report_date_format)

In [None]:
# Full history on a log scale, to see where reporting becomes uniform.
df_crashes['reportdate'].hist(bins=150)
plt.yscale('log')
plt.show()

# Build the 2009-2022 window once instead of re-filtering for every statistic.
in_window = (df_crashes['reportdate'] > "2009") & (df_crashes['reportdate'] < "2022")
window_dates = df_crashes.loc[in_window, 'reportdate']

print(window_dates.min())
print(window_dates.max())
print(window_dates.max() - window_dates.min())
print("We will normalize the crashes to an exposure time of 12.8 +- 0.5 years")

window_dates.hist(bins=150)
# plt.yscale('log')
# plt.yscale('log')

In [None]:
exposure_time = 12.8
exposure_time_up = 13.3
exposure_time_down = 12.3
df_crashes = df_crashes[(df_crashes['reportdate'] > "2009") &(df_crashes['reportdate'] < "2022") ]
# severe_columns = [x for x in df_crashes.columns if "FATAL" in x.upper() or "MAJOR" in x.upper()]
# df_crashes_severe = df_crashes[ pd.DataFrame.any(df_crashes[severe_columns].astype(int) > 0,axis=1) ]

In [None]:
# df_crashes_fatal['crash_count'] = 0
major_injury_columns = [x for x in df_crashes.columns if "MAJOR" in x.upper()]
fatal_injury_columns = [x for x in df_crashes.columns if "FATAL" in x.upper()]
# df_crashes[ pd.DataFrame.any(df_crashes[severe_columns].astype(int) > 0,axis=1) ]
major_injury_columns
fatal_injury_columns
# print(df_int.shape)
# print(df_crashes_fatal.shape)
# for i,row in enumerate(df_int.geometry):
#     print(i)
#     df_int.loc[i,'crash_count'] = sum(df_crashes_fatal.geometry.distance(row) < 50)
# severe_columns

# Major calculation section of the notebook - associating crashes to intersections
Loop over intersections and calculate the crash rates for all, severe, and fatal crashes per intersection.

In [None]:
df.head(3)

In [None]:
# Rates are crashes per year (floats); start the columns as floats so the
# per-row assignments below never have to upcast an integer column.
df['crash_rate'] = 0.0
df['major_injury_crash_rate'] = 0.0
df['fatal_crash_rate'] = 0.0

df['distance_weighted_crash_count'] = 0
df['involvesBike'] = 0

print(df.shape)
print(df_crashes.shape)

# A crash is attributed to an intersection when the intersection point lies
# inside the crash's buffer of radius 15.
# NOTE(review): assumes both geometries share a metre-based CRS (the
# intersections are read from a column named point3857) — confirm for df_crashes.
crash_buffer = df_crashes.geometry.buffer(15)

# Severity flags do not depend on the intersection, so compute them once
# instead of once per loop iteration.
has_major_injury = df_crashes[major_injury_columns].astype(bool).any(axis=1)
has_fatality = df_crashes[fatal_injury_columns].astype(bool).any(axis=1)

# The loop index i is used as a row label, so df must carry the default 0..n-1 index.
for i, row in tqdm(enumerate(df.geometry), total=len(df)):
    buffer_index = crash_buffer.contains(row)
    df.loc[i, 'crash_rate'] = buffer_index.sum() / exposure_time
    df.loc[i, 'major_injury_crash_rate'] = (buffer_index & has_major_injury).sum() / exposure_time
    df.loc[i, 'fatal_crash_rate'] = (buffer_index & has_fatality).sum() / exposure_time

# Rate that would result if the true exposure were exposure_time_up instead of exposure_time.
df['crash_rate_exposure_err_up'] = df['crash_rate'] * (exposure_time/exposure_time_up)
df['major_injury_crash_rate_exposure_err_up'] = df['major_injury_crash_rate'] * (exposure_time/exposure_time_up)
df['fatal_crash_rate_exposure_err_up'] = df['fatal_crash_rate'] * (exposure_time/exposure_time_up)

# Poisson counting error: sqrt(N) / T, with N = rate * T crashes in the window.
df['crash_rate_stat_err'] = np.sqrt(df['crash_rate']*exposure_time)/exposure_time
df['major_injury_crash_rate_stat_err'] = np.sqrt(df['major_injury_crash_rate']*exposure_time)/exposure_time
df['fatal_crash_rate_stat_err'] = np.sqrt(df['fatal_crash_rate']*exposure_time)/exposure_time
    

In [None]:
# A Poisson uncertainty of exactly 0 is not meaningful, so any error that is not
# strictly positive is replaced by the error of a single crash over the exposure
# window (sqrt(1)/12.8, rounded to 0.08).
for err_col in ('crash_rate_stat_err', 'fatal_crash_rate_stat_err', 'major_injury_crash_rate_stat_err'):
    df[err_col] = df[err_col].where(df[err_col] > 0, 0.08)

In [None]:
df['nearby_ints'] = 0
df['compound_aadt'] = np.nan

# Buffers of radius 25 around every intersection; an intersection counts as
# "nearby" when its point falls inside another intersection's buffer. Each
# point lies inside its own buffer, so the count includes the intersection itself.
int_buffer = df.geometry.buffer(25)

# The loop index i is used as a row label, so df must carry the default 0..n-1 index.
for i, row in tqdm(enumerate(df.geometry), total=len(df)):
    buffer_index = int_buffer.contains(row)
    df.loc[i, 'nearby_ints'] = buffer_index.sum()
    # Mean AADT over the intersection and its neighbours (NaNs are skipped by mean()).
    df.loc[i, 'compound_aadt'] = df.loc[buffer_index, 'aadt'].mean()



In [None]:
# df_int.head(50)
df.to_csv('df_before_modeling.csv')

In [None]:
df.columns
# df['num_lanes'].unique()
# df["legs"].unique()
# df["footways"].sum()

In [None]:
# Parse the textual speed limit into a float column: strip the " mph" suffix, turn
# missing/empty values into None (NaN after the cast), and discard the literal "24.14".
# NOTE(review): "24.14" looks like 15 mph expressed in km/h — confirm it should be
# dropped rather than converted. Any other non-numeric text makes astype(float) raise.
df["float_maxspeed"] = df['maxspeed'].apply(lambda x : x.replace(" mph","") if x else None).apply(lambda x: x if x != "24.14" else None).astype(float)

In [None]:
df['highway_type'].unique()

In [None]:
# Road classes to one-hot encode (an earlier, immediately overwritten list was removed).
rtypes = ['secondary', 'residential', 'primary', 'tertiary', 'construction',
       'living_street', 'service', '', 'pedestrian', 'cycleway',
       'unclassified', 'trunk', 'footway', 'proposed', 'motorway',
       'track', 'services']

# One indicator column per road class. Rows whose highway_type is missing or not
# listed in rtypes get zeros everywhere, instead of silently creating a new
# column that is absent from rtypes.
for rtype in rtypes:
    df[rtype] = (df['highway_type'] == rtype).astype(int)

# One indicator column per leg count; anything other than 2-5 legs is 'many_leg'.
legs = ['2_leg','3_leg','4_leg','5_leg','many_leg']
leg_count = df['legs'].astype(float).astype(int)
for n_legs in (2, 3, 4, 5):
    df[f'{n_legs}_leg'] = (leg_count == n_legs).astype(int)
df['many_leg'] = (~leg_count.isin([2, 3, 4, 5])).astype(int)



In [None]:
df['highway_type'].unique()

In [None]:
# df['modeled_aadt'] = 0

# for i,row in df.iterrows():
#     if not np.isnan(df.loc[i,"aadt"]):
#         print("Copying value for " +str(i))
#         df.loc[i,"modeled_aadt"]
#     else:
#         roadway_type = df.loc[i,"road_type"]
#         df.loc[i,"modeled_aadt"] = df[df["road_type"]==roadway_type]["aadt"].mean()

In [None]:
# The feature matrix X is only built further down, so previewing it here raises
# NameError on a fresh kernel; preview the frame it is built from instead.
df.head(3)

In [None]:
import copy
df_save = copy.deepcopy(df)

In [None]:
X = df[['nearby_ints','min_angle','oneways','aadt','num_lanes','float_maxspeed','second_min_angle']+rtypes+legs].astype(float)
# Y = df['major_injury_crash_rate'] + df['fatal_crash_rate']#.apply(float)
Y = df['crash_rate']
# X['min_angle'] = X['min_angle'] = np.cos(X['min_angle'] * (np.pi/180))
# X['second_min_angle'] = X['second_min_angle'] = np.cos(X['second_min_angle'] * (np.pi/180))

In [None]:
Y.hist(bins=35)
plt.yscale("log")

In [None]:
# keep = X.loc[~X.isna().any(axis=1)].index
# X_test is created in the next cell; inspect the full feature matrix here so the
# notebook survives Restart & Run All.
X.shape


In [None]:
print(Y.shape)
print(X.shape)
# Y = Y.loc[keep]
# X = X.loc[keep]
X_train = X[:20000]
Y_train = Y[:20000]
X_test = X[20000:]
Y_test = Y[20000:]

In [None]:
X_train["unclassified"].describe()
X.columns

In [None]:
model = xgb.XGBRegressor(n_estimators=100, max_depth=2, eta=0.15,reg_alpha=0.3,subsample=0.8)

model.fit(X_train,Y_train)

# Score each split once and reuse the results for every plot and statistic below.
test_pred = model.predict(X_test)
train_pred = model.predict(X_train)
resids = Y_test - test_pred
baseline_resids = Y_test - Y_test.mean()  # residuals of the "always predict the mean" baseline
train_resids = Y_train - train_pred

plt.hist(resids,bins=45)
plt.title("Residuals of truth-prediction")
plt.yscale("log")
plt.show()
print("std: " + str(resids.std(ddof=1)))
print("mean: " + str(resids.mean() ))

plt.hist(baseline_resids,bins=35)
plt.title("Residuals using mean as prediction.")
plt.yscale("log")
plt.show()
print("std using mean as prediction :" +str(baseline_resids.std(ddof=1)))

plt.hist(test_pred,bins=35)
plt.title("Prediction distribution.")
plt.yscale("log")
plt.show()
print("std using mean as prediction :" +str(baseline_resids.std(ddof=1)))

plt.hist(train_resids,bins=35)
plt.title('Residuals of truth-prediction for training data')
plt.yscale('log')
plt.show()
print("std: " + str(train_resids.std(ddof=1)))
print("mean: " + str(train_resids.mean() ))

print(model.feature_importances_)

In [None]:
print("Train score : " + str(model.score(X_train, Y_train)))
print("Test score : " + str(model.score(X_test, Y_test)))

In [None]:
# The model above is fitted to Y = df['crash_rate'], so store its output under the
# matching name; the plotting cells below read df['crash_rate_prediction'].
predicted_rate = model.predict(X[X_train.columns])
df["crash_rate_prediction"] = predicted_rate
# Kept because later cells also read this column.
# NOTE(review): it holds the same total-crash prediction; a model trained on
# major_injury_crash_rate + fatal_crash_rate is needed for it to mean "severe".
df["severe_crash_rate_prediction"] = predicted_rate
# X['crash_rate_measured'] = Y

In [None]:
df["crash_rate_prediction_err"] = 0.91

In [None]:
df.to_csv("DC_model_v2.csv")

In [None]:
X['severe_crash_rate_prediction'] = model.predict(X[X_train.columns])
X['severe_crash_rate_measured'] = Y

In [None]:
plt.figure(figsize=(13,10))
(df['major_injury_crash_rate'] + df['fatal_crash_rate']-df['severe_crash_rate_prediction']).hist(bins=35, rwidth=0.95)
plt.title("Distribution of historical-predicted major injury accidents per year",fontsize=22)
plt.xlabel("Accidents/Year",fontsize=18)
plt.yscale("log")

In [None]:
plt.figure(figsize=(13,10))
(df['crash_rate']-df['crash_rate_prediction']).hist(bins=35, rwidth=0.95)
plt.title("Distribution of historical-predicted accidents per year",fontsize=22)
plt.xlabel("Accidents/Year",fontsize=18)
plt.yscale("log")

In [None]:
plt.figure(figsize=(13,10))
(df['severe_crash_rate_prediction']).hist(bins=35, rwidth=0.95)
plt.title("Distribution of predicted major injury accidents per year",fontsize=22)
plt.xlabel("Accidents/Year",fontsize=18)
plt.yscale("log")

In [None]:
df[ (df['crash_rate']-df['crash_rate_prediction']) > 15 ]
df[(df['major_injury_crash_rate'] + df['fatal_crash_rate']-df['severe_crash_rate_prediction'])>2]

In [None]:
plt.figure(figsize=(13,10))
(df['crash_rate_prediction']).hist(bins=35, rwidth=0.95)
plt.title("Distribution of predicted total accidents per year",fontsize=22)
plt.xlabel("Accidents/Year",fontsize=18)
plt.yscale("log")

In [None]:
plt.figure(figsize=(13,10))
(df['crash_rate']).hist(bins=35, rwidth=0.95)
plt.title("Distribution of total accidents per year",fontsize=22)
plt.xlabel("Accidents/Year",fontsize=18)
plt.yscale("log")

In [None]:
plt.figure(figsize=(13,10))
# Use the columns that are actually written to X earlier in the notebook.
(X['severe_crash_rate_measured']-X['severe_crash_rate_prediction']).hist(bins=35, rwidth=0.95)
plt.title("Distribution of major injury accidents - predicted per year",fontsize=22)
plt.xlabel("Accidents/Year",fontsize=18)
plt.yscale("log")



In [None]:
df['min_angle'].sum()

In [None]:
# Rows whose measured rate exceeds the prediction by more than 1 per year
# (column names match the ones written to X earlier in the notebook).
df.loc[X[(X['severe_crash_rate_measured']-X['severe_crash_rate_prediction']) > 1].index]

In [None]:
plt.hist(Y_test,bins=35)
plt.yscale("log")
plt.show()

In [None]:
plt.hist(model.predict(X_test),bins=35)
plt.yscale("log")
plt.show()

# Simple statistical analysis
Fit to a tweedie distribution. Convert angle to cosine(rads). Don't have to worry too much about overfitting, because N >> M (much more data than parameters -> deterministic solution). 

In [None]:
X.columns

In [None]:
# ~X.isna().any(axis=0)

In [None]:
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
import scipy.stats
import pdb

scaler = StandardScaler()  # not used yet: feature scaling is currently disabled

clf = linear_model.TweedieRegressor(power=1.01, alpha=0.01, fit_intercept=True, link='log')

# Features for the GLM (AADT, speed limit and neighbour count are left out here).
tweedie_features = ['min_angle', 'oneways', 'num_lanes', 'secondary',
       'residential', 'primary', 'tertiary', 'construction', 'living_street',
       'service', '', 'pedestrian', 'cycleway', 'unclassified', 'trunk',
       'footway', 'proposed', 'motorway', 'track', 'services', '2_leg',
       '3_leg', '4_leg', '5_leg', 'many_leg']

# Keep only rows with no missing feature; compute the mask once and take an
# explicit copy so that adding 'const' below does not write into a view of X.
complete_rows = ~X[tweedie_features].isna().any(axis=1)
X_temp = X.loc[complete_rows, tweedie_features].copy()
Y_temp = Y.loc[complete_rows]

# NOTE(review): the regressor already fits an intercept (fit_intercept=True),
# so this constant column looks redundant — confirm it is intended.
X_temp['const'] = 1

# NOTE: this re-binds X_train / X_test / Y_train / Y_test used by the XGBoost cells above.
X_train = X_temp[:20000]
Y_train = Y_temp[:20000]
X_test = X_temp[20000:]
Y_test = Y_temp[20000:]

print(X_test.shape)

result = clf.fit(X_train, Y_train)
print("Train score : " + str(clf.score(X_train, Y_train)))
print("Test score : " + str(clf.score(X_test, Y_test)))

print(result.coef_)

# Score each split once and reuse the residuals below.
resids = Y_test - clf.predict(X_test)
train_resids = Y_train - clf.predict(X_train)
baseline_resids = Y_test - Y_test.mean()  # residuals of the "always predict the mean" baseline

plt.hist(resids,bins=25)

# Normal curve with the residuals' mean and std, scaled by a hand-picked factor to overlay the histogram.
best_fit_line = scipy.stats.norm.pdf(np.linspace(-100,300,1000), resids.mean(), resids.std(ddof=1))*8000

plt.plot(np.linspace(-100,300,1000), best_fit_line)
plt.title("Residuals of truth-prediction")
plt.show()
print("std: " + str(resids.std(ddof=1)))
print("mean: " + str(resids.mean() ))

plt.hist(baseline_resids,bins=35)
plt.title("Residuals using mean as prediction.")
plt.show()
print("std using mean as prediction :" +str(baseline_resids.std(ddof=1)))

plt.hist(train_resids,bins=35)
plt.title('Residuals of truth-prediction for training data')
plt.show()


In [None]:
clf.predict(X_test).std()

In [None]:
plt.hist(clf.predict(X_temp))
plt.yscale("log")

In [None]:
plt.hist(Y_test,bins=30)
plt.show()

# I ad-hoc create the AI prediction columns in the df_int dataframe for export


In [None]:
# df_int['AI_fatal_crash_rate'] = clf.predict(X)/12.8

In [None]:
# df_int['AI_fatal_rate_err'] = 0.0125

In [None]:
# df_int.to_csv("Intersections_withCrashRates_WithAI_Preds.csv")

In [None]:
# [x for x in clf.predict(X_test)]
# X_test['']

# Only scratchpad work below this line.

In [None]:
plt.figure(figsize=(13,10))
bins = plt.hist( [ x*(10) for x in Y_test ] , rwidth=0.95)
# plt.xlim(1,6)
plt.yscale('log')
plt.xlabel("# of major injury crashes in 10 years",fontsize=22)

In [None]:
# X_test was re-bound to the Tweedie feature matrix in the GLM cell, so it has to be
# scored with clf; the XGBoost `model` was fitted on a different set of columns.
# assign() returns a new frame (no write into a slice of X_temp), and dropping any
# existing 'prediction' column keeps the cell re-runnable.
X_test = X_test.assign(
    prediction=clf.predict(X_test.drop(columns='prediction', errors='ignore'))
)

In [None]:
plt.figure(figsize=(13,10))
(X_test['prediction']*(10)).hist(bins=8,rwidth=0.95)
plt.yscale('log')
plt.xlabel("Predicted # major injury crashes in 10 years",fontsize=22)

In [None]:
X_safe = X_test[ X_test['prediction']*10 < 5]
X_danger = X_test[ X_test['prediction']*10 > 5]

In [None]:
X_safe.describe()

In [None]:
X_danger.describe()

In [None]:
# Is there a significant difference between angles?
# Z = (mu_1 - mu_2) / sqrt(sigma_1^2 + sigma_2^2)
Z = (1.525520 - 1.477102) / np.sqrt(0.460798**2 + 0.177618**2)
print(Z)
# No

In [None]:
df_int['num_legs_from_borderalgo'].apply(cast_to_float)

In [None]:
np.corrcoef(df_int['oneway'].apply(cast_to_float),df_int['crash_count'].astype(float))

In [None]:
df_int.to_csv('crash_model_dataframe.csv')

In [None]:
!pwd

In [None]:
# # pd.DataFrame()
# severe_columns = [x for x in df_crashes.columns if "FATAL" in x.upper() or "MAJOR" in x.upper()]
# df_crashes_fatal = df_crashes[ pd.DataFrame.any(df_crashes[severe_columns].astype(int) > 0,axis=1) ]

#     df_int_dict = df_int.to_dict('records')
# df_crashes_dict = df_crashes_fatal.to_dict('records')

# crash_mapping = []

# print("Beginning loop...")
# for i, intersection in enumerate(df_int_dict):
#     crash_count = 0
#     crash_ids = []
#     for j, crash in enumerate(df_crashes_dict):
#         distance = geo_distance((intersection['latitude'],intersection['longitude']),
#                                 (crash['latitude'],crash['longitude']))
        
#         radius = distance.m
        
#         if radius < I 50:
#             crash_count += 1
#             crash_ids.append(crash['objectid'])
#     print("Intersection #: " + str(i))
#     print("crash_count:" + str(crash_count))
#     crash_mapping.append((intersection['nodeid'],crash_ids)) 
    
#     if i > 1000:
#         break
    
# crashes

# Below is from an earlier exploration on poisson fits
The data is too skewed to fit to a Poisson distribution. Once the data is normalized by traffic volume this may be worth revisiting.

In [None]:
# def is_severe(row):
import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel
from scipy.stats import poisson

In [None]:
def _ll_poisson(y, X, beta, alph):
    """
    Poisson = (lambda^N*exp(-lambda))/N!
    """
    mu = np.exp(np.dot(X, beta))
    size = 1/alph
    prob = size/(size+mu)
    ll = nbinom.logpmf(y, size, prob)
    ll = poisson.logpmf(y,)
    return ll

class Poisson(GenericLikelihoodModel):
    """Poisson regression with a log link, fitted by maximum likelihood.

    The expected count of each observation is ``exp(exog @ params)``; there is
    no dispersion parameter.
    """

    def __init__(self, endog, exog, **kwds):
        super().__init__(endog, exog, **kwds)

    def nloglikeobs(self, params):
        """Return the negative Poisson log-likelihood of every observation."""
        # Local import: the bare name ``poisson`` is re-bound to a plain
        # function further down in this notebook.
        from scipy import stats

        mu = np.exp(np.dot(self.exog, params))
        return -stats.poisson.logpmf(self.endog, mu)

    def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds):
        """Fit the model, supplying default starting values when none are given."""
        if start_params is None:
            # Start from zero slopes and an intercept equal to log(mean count).
            # NOTE(review): assumes the last exog column is the constant term — confirm.
            start_params = np.zeros(self.exog.shape[1])
            start_params[-1] = np.log(self.endog.mean())
        return super().fit(start_params=start_params,
                           maxiter=maxiter, maxfun=maxfun,
                           **kwds)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import minimize
from scipy.special import factorial
from scipy import stats
import copy

def poisson(k, lamb):
    """Poisson pmf P(k; lamb) = lamb**k * exp(-lamb) / k!, where lamb is the fit parameter.

    Evaluated in log space so that large counts do not overflow lamb**k or k!.
    Accepts scalars or arrays for k.

    Note: defining this function re-binds the name ``poisson`` imported from
    scipy.stats earlier in the notebook.
    """
    from scipy.special import gammaln, xlogy

    # xlogy(k, lamb) = k * log(lamb), defined as 0 when k == 0 (so P(0; 0) == 1);
    # gammaln(k + 1) = log(k!).
    return np.exp(xlogy(k, lamb) - gammaln(np.asarray(k) + 1.0) - lamb)


def negative_log_likelihood(params, data):
    """Negative Poisson log-likelihood of ``data`` for rate ``params[0]``.

    Uses scipy's ``logpmf``. This is the single definition of the function: an
    earlier hand-rolled version with the same name was always overridden by
    this one and has been removed.

    Parameters
    ----------
    params : sequence whose first element is the Poisson rate.
    data : array-like of observed counts.

    Returns
    -------
    The sum over ``data`` of ``-log P(count; params[0])``.
    """
    return -stats.poisson.logpmf(data, params[0]).sum()


# get poisson deviated random numbers
# data = np.random.poisson(1.2, 1000)
data = df_int[df_int['num_legs'].apply(cast_to_float)==4]['crash_count'].apply(cast_to_float).dropna()
print(data)

# minimize the negative log-Likelihood

result = minimize(negative_log_likelihood,  # function to minimize
                  x0=np.ones(1),            # start value
                  args=(data,),             # additional arguments for function
                  method='Powell',          # minimization method, see docs
                  )
# result is a scipy optimize result object, the fit parameters 
# are stored in result.x
print(result)
# print(dir(result))
func_min = result.fun

# 1-sigma upper bound on the rate: step the parameter upward until the negative
# log-likelihood has risen by 0.5 above its minimum (the standard Delta(-lnL) = 1/2
# rule), rather than until it has doubled.
DELTA_NLL_1SIGMA = 0.5
# NOTE(review): step reduced from 0.2 so the scan can resolve an interval narrower
# than 0.2 — adjust if a coarser or finer scan is wanted.
SCAN_STEP = 0.01

scan_value = func_min
scan_parameter = copy.deepcopy(result.x)
while scan_value < func_min + DELTA_NLL_1SIGMA:
    # Step first, then evaluate, so the printed parameter and value belong together.
    scan_parameter[0] += SCAN_STEP
    scan_value = negative_log_likelihood(scan_parameter,data)
print("1Sigma value is :" )
print(scan_parameter)
print(scan_value)
    
# plot poisson-distribution with fitted parameter
x_plot = np.arange(0, 35)

plt.plot(
    x_plot,
    stats.poisson.pmf(x_plot, result.x[0]),
    marker='o', linestyle='',
    label='Fit result',
)
plt.plot(
    x_plot,
    stats.poisson.pmf(x_plot, scan_parameter[0]),
    marker='x', linestyle='',
    label='Uncertainty result'
)
plt.hist(df_int['crash_count'].apply(cast_to_float),density=True,bins=35,label='Data')
plt.legend()
plt.show()

In [None]:
# The Poisson rate is the scanned parameter; scan_value is the negative
# log-likelihood at that parameter, not a rate.
stats.poisson.pmf(x_plot, scan_parameter[0])

In [None]:
index = df_int['crash_count'].apply(cast_to_float).dropna().index
df_int.loc[index,'crash_count'].apply(cast_to_float)
df_int.loc[index][['num_legs_from_borderalgo','angle']].astype(float)

In [None]:
np.sqrt(clf.family.unit_variance(3))

In [None]:
import tweedie, seaborn as sns, matplotlib.pyplot as plt

mu = 3.5
phi = np.sqrt(clf.family.unit_variance(mu)/mu**1.5)
phi = np.sqrt(clf.family.unit_variance(mu))


tvs = tweedie.tweedie(mu=mu, p=1.5, phi=phi).rvs(10000)
plt.hist(tvs,bins=50,density=True)
# plt.yscale('log')
# ax = sns.kdeplot(tvs,bw=0.05)
plt.show()

In [None]:
rvs = tweedie.tweedie.rvs(1.5,5,3,size=40)
results = tweedie.tweedie.fit(rvs)

In [None]:
results

In [None]:
plt.plot( tweedie.tweedie.pdf(np.linspace(0,100,num=60),results[-1],results[1],results[2]) )
plt.show()

In [None]:
plt.plot( tweedie.tweedie.pdf(np.linspace(0,100,num=60),1.5,5,3) )
plt.show()