In [5]:
import pandas as pd
import numpy as np
import pickle
import random

##sklearn
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler,OrdinalEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score

from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as imbPipeline

import python_files.functions_barcelona_2 as fb2

%load_ext autoreload
%autoreload 2

with open('./data/model_charac.pkl','rb') as file:
    model_dict=pickle.load(file)
model_dict


oversamplers=fb2.oversamplings
undersamplers=fb2.undersamplings
metrics=fb2.metrics
model_dict['models']=fb2.models_dict
model_dict['models']
# model_dict['metrics']=[{'recall':recall_score}]
model_dict['metrics']=metrics
num_features=[col[0] for col in model_dict['numerical_features'].items() if col[1]>1]
cat_features=[col[0] for col in model_dict['categorical_features'].items() if col[1]>1]
acc=pd.read_csv('./data/accidents_weather_eng_2023.csv')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
X=acc[num_features +cat_features]
y=acc.target
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=2024)

In [7]:
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", MinMaxScaler())]
)
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore",drop='first'))
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features),
    ]
)


In [None]:
df_scores=pd.DataFrame()
for key in model_dict['models'].keys():
    #print(dicty['model'])
    pipe = Pipeline(
        steps=[("preprocessor", preprocessor), ("classifier",model_dict['models'][key])])
    pipe.fit(X_train,y_train)
    preds=pipe.predict(X_test)
    for met in model_dict['metrics'].keys():
        if met !='precision':
        #print(key, model_dict['metrics'][met](y_test,preds))
            df_scores.loc[key,met]=model_dict['metrics'][met](y_test,preds)
        else:
            df_scores.loc[key,met]=precision_score(y_test,preds, zero_division=1)
    
df_scores

In [None]:
#adding resampling
df_scores_over=pd.DataFrame()
for key in model_dict['models'].keys():
    #print(dicty['model'])
    for resampler in oversamplers.keys():
        pipe = imbPipeline(
            steps=[("preprocessor", preprocessor),('resampler',oversamplers[resampler]),("classifier",model_dict['models'][key])])
        pipe.fit(X_train,y_train)
        preds=pipe.predict(X_test)
        print(key, resampler)
        for met in model_dict['metrics'].keys():
            if met !='precision':
            #print(key, model_dict['metrics'][met](y_test,preds))
                df_scores_over.loc[key+'_'+resampler,met]=model_dict['metrics'][met](y_test,preds)
            else:
                
                df_scores_over.loc[key+'_'+resampler,met]=precision_score(y_test,preds, zero_division=1)
    
df_scores_over

In [None]:
#adding resampling
df_scores_under=pd.DataFrame()
for key in model_dict['models'].keys():
    #print(dicty['model'])
    for resampler in undersamplers.keys():
        pipe = imbPipeline(
            steps=[("preprocessor", preprocessor),('resampler',undersamplers[resampler]),("classifier",model_dict['models'][key])])
        pipe.fit(X_train,y_train)
        preds=pipe.predict(X_test)
        print(key, resampler)
        for met in model_dict['metrics'].keys():
            if met !='precision':
            #print(key, model_dict['metrics'][met](y_test,preds))
                df_scores_under.loc[key+'_'+resampler,met]=model_dict['metrics'][met](y_test,preds)
            else:
                
                df_scores_under.loc[key+'_'+resampler,met]=precision_score(y_test,preds, zero_division=1)
    
df_scores_under

In [None]:
pd.concat([df_scores,df_scores_over,df_scores_under]).to_csv('df_results.csv')

In [None]:
import pandas as pd
results=pd.read_csv('df_results.csv',index_col=[0])
results.head()

In [None]:
results[results.accuracy>2/3].sort_values('recall',ascending=False).head()

In [7]:
##going with logreg and REN or OSS
num_features=[col[0] for col in model_dict['numerical_features'].items() if col[1]>1]
cat_features=[col[0] for col in model_dict['categorical_features'].items() if col[1]>1]
num_features

['people_role_pedestrian',
 'vehicle_motorcycle',
 'driver_u_25_count',
 'num_vehicles',
 'gender_driver_male_count',
 'vehicle_bus',
 'street_name_count']

In [8]:
X=acc[num_features +cat_features]
y=acc.target
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=2024)

In [20]:
num_features=[col[0] for col in model_dict['numerical_features'].items() if col[1]>1]
cat_features=[col[0] for col in model_dict['categorical_features'].items() if col[1]>1]

X=acc[num_features +cat_features]
y=acc.target
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=2024)
model=LogisticRegression(random_state=2024,class_weight='balanced',max_iter=500,solver='newton-cholesky',penalty=None)

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", MinMaxScaler())]
)
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore",drop='first'))
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features),
    ]
)

pipe = imbPipeline(
            steps=[("preprocessor", preprocessor),('resampler',undersamplers['OSS']),("classifier",model_dict['models']['logreg'])])
pipe.fit(X_train,y_train)
preds=pipe.predict(X_test)
recall=recall_score(y_test,preds)
accuracy=accuracy_score(y_test,preds)
recall, accuracy

(0.7078651685393258, 0.7005604990658348)

In [None]:
num_features=[col[0] for col in model_dict['numerical_features'].items() if col[1]>1]
cat_features=[col[0] for col in model_dict['categorical_features'].items() if col[1]>1]

X=acc[num_features +cat_features]
y=acc.target
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=2024,stratify=y)
pipe = imbPipeline(
            steps=[("preprocessor", preprocessor),('resampler',undersamplers['OSS']),("classifier",LogisticRegression(random_state=2024,class_weight='balanced',max_iter=500))])
strat=StratifiedKFold(n_splits=5,
                     shuffle=True,
                     random_state=2024)
param_grid={'classifier__C': [1e6,1e9,1e12]}
gs=GridSearchCV(pipe,
               param_grid=param_grid,
               scoring='recall',
               cv=strat)
gs.fit(X_train,y_train)

In [14]:
acc.accident_type.unique()

array(['run_over', 'crash', 'frontal', 'collision', 'fall--motorcycle',
       'fall_inside_vehicle', 'misc_type', 'overturning', 'run-off'],
      dtype=object)

In [4]:
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", MinMaxScaler())]
)
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore",drop='first'))
    ]
)

In [15]:
accuracies=[]
recalls=[]
numerical_features=[col[0] for col in model_dict['numerical_features'].items() if col[1]>1]
categorical_features=[col[0] for col in model_dict['categorical_features'].items() if col[1]>1]

total_features=numerical_features+categorical_features
for num in range(1,len(total_features)):
    shift_recall=[]
    shift_accuracy=[]
    for tm in range(10):    
        choices=random.sample(total_features,k=num)
        
        num_features=[col for col in choices if col in numerical_features]
        cat_features=[col for col in choices if col in categorical_features]
        X_train,X_test,y_train,y_test=train_test_split(acc[choices],acc.target,random_state=2024,stratify=acc.target)
        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, num_features),
                ("cat", categorical_transformer, cat_features),
            ]
        )
        
        

        pipe = imbPipeline(
            steps=[("preprocessor", preprocessor),('resampler',undersamplers['OSS']),("classifier",model_dict['models']['logreg'])])
        pipe.fit(X_train,y_train)
        preds=pipe.predict(X_test)
        shift_recall.append(recall_score(y_test,preds))
        shift_accuracy.append(accuracy_score(y_test,preds))
    print(num)
    accuracies.append(sum(shift_accuracy)/len(shift_accuracy))
    recalls.append(sum(shift_recall)/len(shift_recall))
        
recalls

1
2
3
4
5
6
7
8
9
10
11


[0.33754789272030655,
 0.4652618135376757,
 0.5334610472541507,
 0.5389527458492975,
 0.5883780332056194,
 0.5936143039591315,
 0.6108556832694764,
 0.598595146871009,
 0.5933588761174968,
 0.6450830140485313,
 0.6509578544061303]

In [6]:
recalls

[0.3478075776926352,
 0.6334610472541508,
 0.44870157513835673,
 0.5061728395061729,
 0.6266496381438911,
 0.6206896551724138,
 0.5913154533844188,
 0.604512558535547,
 0.6492124308216262,
 0.6479352916134525,
 0.6419753086419753]

In [11]:
len(total_features)

12