In [None]:
import copy
import json
import math
import os
import time
import warnings

from frechetdist import frdist
from IPython.utils import io
import matplotlib.pyplot as plt
import numpy as np
from numpy import dot
from numpy.linalg import norm
import pandas as pd
from rtree import index
import scipy.spatial as spatial
from scipy.interpolate import interp1d
from shapely.geometry import LineString, Point
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree._tree import TREE_LEAF
from sklearn.tree import export_graphviz
from six import StringIO  
import pydotplus
from IPython.display import Image  

In [None]:
## import sumolib and the project-local labelling-tool modules
import sys

# The original machine-specific locations stay first so existing setups resolve
# the same modules as before.
extra_module_paths = [
    r'/mnt/c/users/ty90rize/Sumo/tools', # path to sumolib
    r'/mnt/c/users/ty90rize/repos/roadmatching/labelling_tool', # path to labelling tool
]

# Optional, portable locations: SUMO's standard SUMO_HOME variable and an
# explicit LABELLING_TOOL_DIR override for other machines.
if 'SUMO_HOME' in os.environ:
    extra_module_paths.append(os.path.join(os.environ['SUMO_HOME'], 'tools'))
if 'LABELLING_TOOL_DIR' in os.environ:
    extra_module_paths.append(os.environ['LABELLING_TOOL_DIR'])

# Only append missing entries so re-running this cell does not grow sys.path.
for extra_path in extra_module_paths:
    if extra_path not in sys.path:
        sys.path.append(extra_path)

import sumolib
import preprocessing
import strokeutils
import similarity
import inference_utils as iu


# SUMO network files for the two map sources; both are read with sumolib.net.readNet.
# NOTE(review): the OSM path ends in 'osm.net.xml/osm.net.xml' — presumably a directory
# that shares the file's name; confirm this is not a typo.
path_train_osm_network = 'oktober_data/small_map/osm_sumo_data/2023-10-09-11-40-39/osm.net.xml/osm.net.xml'
path_train_tomtom_network = 'oktober_data/small_map/tomtom_sumo_data/tomtom_sumo_small.net.xml'

out_dir = 'out_dir' # where to put training file with similarity calculations
# File name (inside out_dir) of the cached candidate/feature table; its presence
# decides whether the preprocessing step is skipped.
preprocessed_file = 'preprocessed_df_2023-12-13.json'

In [None]:
# Load both training networks from the paths configured above.
net_osm_train = sumolib.net.readNet(path_train_osm_network)
net_tomtom_train = sumolib.net.readNet(path_train_tomtom_network)

## Preprocessing

In [None]:
from pathlib import Path

# Recompute the candidate features only when no cached file exists yet.
# The path is resolved relative to the working directory, exactly like the
# later pd.read_json(os.path.join(out_dir, preprocessed_file)) in get_features.
# (The previous realpath('__file__') used a quoted string, not the variable,
# and only happened to resolve to the working directory.)
preprocess = not Path(out_dir, preprocessed_file).is_file()

In [None]:
# Display the flag: True -> features are recomputed, False -> the cached file is loaded.
preprocess

In [None]:
def create_candidate_df(net_osm, net_tomtom, radius, tomtom_rtree_idx, edgelist=None):
    """Build one table of (OSM edge, TomTom candidate) pairs with their features.

    For every reference edge the coordinates are transformed with
    ``preprocessing.get_transformed_coordinates`` and the candidates are looked
    up with ``iu.get_candidates``. Each returned candidate becomes one row.

    Parameters
    ----------
    net_osm
        Network providing ``getEdges``/``getEdge``; source of the reference edges.
    net_tomtom
        Network forwarded to the coordinate transform and the candidate search.
    radius
        Search radius forwarded to ``iu.get_candidates``.
    tomtom_rtree_idx
        Spatial index forwarded to ``iu.get_candidates``.
    edgelist
        Optional sequence (list, tuple, array, ...) of OSM edge IDs to process.
        ``None`` or an empty sequence means all non-internal edges of ``net_osm``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'OSM'``, ``'Tomtom'`` and one column per feature key of the
        candidates. When no edge has any candidate, an empty frame with only the
        ``'OSM'`` and ``'Tomtom'`` columns is returned instead of raising.
    """
    # len() works for lists as well as numpy arrays / pandas objects, unlike `== []`.
    if edgelist is None or len(edgelist) == 0:
        edges = net_osm.getEdges(withInternal=False)
    else:
        edges = [net_osm.getEdge(edge_id) for edge_id in edgelist]

    candidate_dfs = []
    time_prev = time.time()

    for counter, edge in enumerate(edges):
        # Progress output: fine-grained during the first 100 edges, then every 100.
        if counter < 100:
            if counter < 10:
                print('counter: {}'.format(counter))
            if counter % 10 == 0:
                now = time.time()
                time_passed = now - time_prev
                print('passed last 10 iteration in: {}'.format(round(time_passed, 2)))
                time_prev = now

        if counter % 100 == 0:
            now = time.time()
            time_passed = now - time_prev
            print('passed last 100 iteration in: {}'.format(round(time_passed, 2)))
            time_prev = now

        reference_coordinates = preprocessing.get_transformed_coordinates(edge, net_osm, net_tomtom)
        reference_id = edge.getID()
        # `candidates` maps a TomTom edge ID to a dict of feature name -> value.
        # NOTE(review): the meaning of the trailing False flag is defined in
        # inference_utils and is not visible here.
        candidates = iu.get_candidates(reference_id, reference_coordinates, radius, tomtom_rtree_idx, net_tomtom, net_osm, False)
        tomtom_ids = list(candidates.keys())
        if not tomtom_ids:
            continue

        candidate_df = pd.DataFrame({'OSM': [reference_id] * len(tomtom_ids), 'Tomtom': tomtom_ids})
        # The feature set is taken from the first candidate; all candidates are
        # expected to carry the same keys.
        for name in candidates[tomtom_ids[0]].keys():
            candidate_df[name] = [candidates[tomtom_id][name] for tomtom_id in tomtom_ids]
        candidate_dfs.append(candidate_df)

    if not candidate_dfs:
        # pd.concat([]) would raise "No objects to concatenate".
        return pd.DataFrame(columns=['OSM', 'Tomtom'])

    return pd.concat(candidate_dfs, ignore_index=True)

In [None]:
def save_to_json(df, filename, target_dir = out_dir):
    """Write ``df`` as JSON to ``target_dir/filename``.

    The target directory (including missing parent directories) is created when
    it does not exist. ``target_dir`` defaults to the value ``out_dir`` had when
    this function was defined.
    """
    # makedirs handles nested paths and an already-existing directory in one call.
    os.makedirs(target_dir, exist_ok=True)
    df.to_json(os.path.join(target_dir, filename))

def preprocess_data(net_osm, net_tomtom, radius, tomtom_rtree_idx, preprocessed_file, osm_edges=None, target_dir=None):
    """Compute the candidate table and cache it as JSON.

    Parameters
    ----------
    net_osm, net_tomtom, radius, tomtom_rtree_idx
        Forwarded unchanged to ``create_candidate_df``.
    preprocessed_file
        File name of the JSON cache to write.
    osm_edges
        Optional sequence of OSM edge IDs; ``None``/empty means all edges.
    target_dir
        Directory for the cache file. ``None`` keeps ``save_to_json``'s default.

    Returns
    -------
    pandas.DataFrame
        The freshly computed candidate table.
    """
    edge_ids = [] if osm_edges is None else osm_edges
    candidate_df = create_candidate_df(net_osm, net_tomtom, radius, tomtom_rtree_idx, edge_ids)
    print(preprocessed_file)
    if target_dir is None:
        save_to_json(candidate_df, preprocessed_file)
    else:
        save_to_json(candidate_df, preprocessed_file, target_dir)
    return candidate_df

def get_features(preprocess, net_osm, net_tomtom, radius, out_dir, preprocessed_file, osm_edges=None):
    """Return the candidate/feature table, computing or loading it.

    Parameters
    ----------
    preprocess
        True: compute the table and cache it in ``out_dir``.
        False: load ``out_dir/preprocessed_file``.
    net_osm, net_tomtom, radius
        Inputs of the candidate search (only used when ``preprocess`` is True).
    out_dir, preprocessed_file
        Location of the JSON cache, used for both writing and reading.
    osm_edges
        Optional sequence of OSM edge IDs; ``None``/empty means all edges.

    Returns
    -------
    pandas.DataFrame
        Candidate table whose ``'OSM'`` and ``'Tomtom'`` columns are strings.
    """
    if preprocess:
        tomtom_rtree_idx = iu.create_tomtom_index(net_tomtom)
        print('completed tomtom rtree idx')
        # The per-edge progress prints are captured so they do not flood the cell output.
        with io.capture_output():
            candidate_df = preprocess_data(
                net_osm, net_tomtom, radius, tomtom_rtree_idx, preprocessed_file,
                osm_edges, target_dir=out_dir,
            )
    else:
        candidate_df = pd.read_json(os.path.join(out_dir, preprocessed_file))

    # pd.read_json may parse purely numeric IDs as integers. Casting both ID
    # columns on both paths gives the same dtypes whether the table was just
    # computed or loaded from the cache.
    for id_column in ('OSM', 'Tomtom'):
        candidate_df[id_column] = candidate_df[id_column].astype('str')

    return candidate_df

In [None]:
# Search radius handed to iu.get_candidates via get_features.
# NOTE(review): the unit is not visible in this notebook — presumably metres in
# the network's projected coordinates; confirm against inference_utils.
radius = 10

In [None]:
# Suppress all Python warnings from here on (no category filter is given).
warnings.filterwarnings('ignore')
    
# Compute the candidate features, or load them from the cached JSON when `preprocess` is False.
candidate_df = get_features(preprocess, net_osm_train, net_tomtom_train, radius, out_dir, preprocessed_file)

## Build ground truth

In [None]:
# Directory (relative to the working directory) whose files are read as
# ground-truth CSVs by build_ground_truth_df.
relative_filedir = 'labelling_tool/data_fin'

In [None]:
def build_ground_truth_df(relative_filedir):
    """Concatenate all label CSVs in ``relative_filedir`` into one ground-truth table.

    Every regular, non-hidden file in the directory is read with ``pd.read_csv``.
    Sub-directories and hidden entries (e.g. ``.ipynb_checkpoints``) are skipped.
    Files are processed in sorted order so the result is reproducible.

    Parameters
    ----------
    relative_filedir
        Directory containing the label files. Each file must provide the columns
        ``'OSM'`` and ``'Tomtom'``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (OSM, Tomtom) pair, with both ID columns as strings
        and a ``'matches'`` column set to 1.

    Raises
    ------
    ValueError
        If the directory contains no label file.
    """
    label_paths = []
    # os.listdir returns entries in arbitrary order, so sort for a stable row order.
    for entry in sorted(os.listdir(relative_filedir)):
        entry_path = os.path.join(relative_filedir, entry)
        if entry.startswith('.') or not os.path.isfile(entry_path):
            continue
        label_paths.append(entry_path)

    if not label_paths:
        raise ValueError('No ground-truth files found in {!r}'.format(relative_filedir))

    df = pd.concat([pd.read_csv(path) for path in label_paths], ignore_index=True)
    df['matches'] = 1
    df['OSM'] = df['OSM'].astype('str')
    df['Tomtom'] = df['Tomtom'].astype('str')
    # A pair labelled in more than one file would otherwise duplicate the
    # matching candidate rows when this table is left-joined onto the candidates.
    df = df.drop_duplicates(subset=['OSM', 'Tomtom']).reset_index(drop=True)
    return df

In [None]:
# Ground-truth table: one row per labelled (OSM, Tomtom) pair with matches == 1.
df = build_ground_truth_df(relative_filedir)
df

In [None]:
# OSM edge IDs that occur in the ground-truth table
osm_edges_labelled = df['OSM'].unique()
# Keep only the candidate pairs whose OSM edge was labelled
is_labelled_edge = candidate_df['OSM'].isin(osm_edges_labelled)
candidates_filtered = candidate_df[is_labelled_edge]

In [None]:
# Left-join the labels onto the candidate pairs; the key columns share their names on both sides.
ml_df = candidates_filtered.merge(df, how='left', on=['OSM', 'Tomtom'])
# Pairs without a ground-truth row come out of the join as NaN -> label them 0.
ml_df['matches'] = ml_df['matches'].fillna(0).astype(int)
ml_df

In [None]:
# Non-null counts per column, split by label (0 = no match, 1 = match)
distribution = ml_df.groupby('matches').count()
distribution

In [None]:
# Keep an independent, unfiltered copy before the overlap filter is applied
ml_df_orig = ml_df.copy(deep=True)

In [None]:
# Discard candidate pairs whose 'overlap_shorter' feature is below 0.3
has_enough_overlap = ml_df['overlap_shorter'].ge(0.3)
ml_df = ml_df[has_enough_overlap]

In [None]:
# Label distribution after the overlap filter
ml_df.groupby('matches').count()

## ML Training

In [None]:
# Enable IPython's autoreload extension in mode 2.
# NOTE(review): presumably so edits to the local helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# RandomOverSampler is used in the next cell to balance the two classes.
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Evaluate a shallow decision tree on the similarity features of `ml_df`
# (target column: 'matches').
from imblearn.pipeline import Pipeline as ImbPipeline

feature_columns = ['sinuosity_sim', 'cosine_sim', 'hausdorff_mod']

# Split the dataset into features and target
X = ml_df[feature_columns]
feature_names = X.columns
y = ml_df['matches']

# Class-balanced copy of the complete data set, kept available for later cells.
# It is deliberately NOT used for the cross-validation below.
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Initialize the Decision Tree Classifier
md = 3
clf = DecisionTreeClassifier(max_depth=md, min_samples_split=10, criterion='entropy')
# Alternative model:
# clf = GradientBoostingClassifier(n_estimators = 25, max_depth = 2, min_samples_split = 10, learning_rate = 0.25)

# Define the metrics you want to evaluate
metrics = ['accuracy', 'precision', 'recall', 'roc_auc', 'balanced_accuracy', 'neg_brier_score', 'f1_macro', 'f1_weighted']

# Oversampling must happen inside each training fold. Resampling the whole data
# set first puts copies of the same minority rows into both the training and the
# validation part of a fold, which inflates every score. The imblearn pipeline
# applies the sampler during fit only, so validation folds keep the original,
# un-duplicated rows.
cv_model = ImbPipeline([
    ('oversample', RandomOverSampler(random_state=42)),
    ('classifier', clf),
])

# Perform 5-fold cross-validation and evaluate using the specified metrics
cv_results = cross_validate(cv_model, X, y, cv=5, scoring=metrics)

# Print the mean score of every metric across the folds
for metric in metrics:
    mean_score = cv_results[f'test_{metric}'].mean()
    print(f"Mean {metric.capitalize()}: {mean_score}")

In [None]:
# Final model: same hyper-parameters as evaluated above, trained on the
# class-balanced data. cross_validate only fits clones of the estimator, so
# without this explicit fit the model saved in the last cell would be untrained.
# random_state makes the fitted tree reproducible across runs.
clf = DecisionTreeClassifier(max_depth = md, min_samples_split = 10, criterion='entropy', #class_weight = 'balanced'
                             random_state = 42,
                            )
clf.fit(X_resampled, y_resampled)

### Save model

In [None]:
from datetime import datetime
from joblib import dump

now = datetime.now()
# Deliberately not named `time`: that would shadow the `time` module imported
# in the first cell, and create_candidate_df (which calls time.time()) would
# fail when re-run after this cell.
timestamp = now.strftime("%y%m%d_%H%M%S")

# Make sure the output directory exists even when the features came from another location.
os.makedirs(out_dir, exist_ok=True)
dump(clf, os.path.join(out_dir, 'dt_model_wo_strokes_{}.joblib'.format(timestamp)))