In [1]:
import numpy as np

import tsplib95
import networkx as nx

import os
import pandas as pd
import gilsrvnd
import DBMEA
import grasp

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from networkx.algorithms.approximation import christofides

import matplotlib.pyplot as plt

from sklearn.metrics import classification_report

In [2]:
# load matrix_good.csv
df = pd.read_csv('matrix_good.csv')
df

Unnamed: 0,file,n,m,ratio1,ratio2,density,min,max,mean,std,var,GILS_cost,GILS_time,GRASP_cost,GRASP_time,DBMEA_cost,DBMEA_time,Min_val,Min_col,min_method
0,att48.tsp,48,1176,0.040816,24.5,1.042553,0.0,2662.0,1017.559896,627.598414,3.938798e+05,226229.0,6.474685,301702.0,0.026019,207771.0,5.288543,0,,DBMEA_cost
1,berlin52.tsp,52,1378,0.037736,26.5,1.039216,0.0,1716.0,564.188609,345.722052,1.195237e+05,145419.0,6.054587,191072.0,0.031533,143278.0,5.374160,0,,DBMEA_cost
2,brazil58.tsp,58,1711,0.033898,29.5,1.035088,0.0,8700.0,2094.914388,1426.139588,2.033874e+06,532454.0,8.013889,709540.0,0.043413,535150.0,9.045840,0,,GILS_cost
3,burma14.tsp,14,105,0.133333,7.5,1.153846,1.0,1261.0,442.612245,272.511473,7.426250e+04,16457.0,0.156218,18393.0,0.001000,16457.0,0.082072,0,,GRASP_time
4,dantzig42.tsp,42,903,0.046512,21.5,1.048780,0.0,192.0,72.295918,43.481585,1.890648e+03,12392.0,2.579336,14523.0,0.016044,12256.0,3.325098,0,,DBMEA_cost
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,line_90_5.tsp,90,4095,0.021978,45.5,1.022472,0.0,889.0,319.538272,226.908988,5.148769e+04,55730.0,35.477098,157350.0,0.233857,57558.0,36.326975,0,,GILS_cost
336,line_90_6.tsp,90,4095,0.021978,45.5,1.022472,0.0,888.0,297.687407,211.917802,4.490915e+04,48884.0,30.191444,128976.0,0.232372,45916.0,37.348990,0,,DBMEA_cost
337,line_90_7.tsp,90,4095,0.021978,45.5,1.022472,0.0,862.0,249.932840,181.712429,3.301941e+04,51464.0,28.821121,124368.0,0.232510,49038.0,44.483338,0,,DBMEA_cost
338,line_90_8.tsp,90,4095,0.021978,45.5,1.022472,0.0,889.0,316.810123,226.894830,5.148126e+04,60845.0,28.282006,123373.0,0.231058,65825.0,39.745050,0,,GILS_cost


In [3]:
# select the columns file, GILS_cost, DBMEA_cost, GRASP_cost, GILS_time, DBMEA_time, GRASP_time, min_method
df = df[['file', 'GILS_cost', 'DBMEA_cost', 'GRASP_cost', 'GILS_time', 'DBMEA_time', 'GRASP_time', 'min_method']]
df

Unnamed: 0,file,GILS_cost,DBMEA_cost,GRASP_cost,GILS_time,DBMEA_time,GRASP_time,min_method
0,att48.tsp,226229.0,207771.0,301702.0,6.474685,5.288543,0.026019,DBMEA_cost
1,berlin52.tsp,145419.0,143278.0,191072.0,6.054587,5.374160,0.031533,DBMEA_cost
2,brazil58.tsp,532454.0,535150.0,709540.0,8.013889,9.045840,0.043413,GILS_cost
3,burma14.tsp,16457.0,16457.0,18393.0,0.156218,0.082072,0.001000,GRASP_time
4,dantzig42.tsp,12392.0,12256.0,14523.0,2.579336,3.325098,0.016044,DBMEA_cost
...,...,...,...,...,...,...,...,...
335,line_90_5.tsp,55730.0,57558.0,157350.0,35.477098,36.326975,0.233857,GILS_cost
336,line_90_6.tsp,48884.0,45916.0,128976.0,30.191444,37.348990,0.232372,DBMEA_cost
337,line_90_7.tsp,51464.0,49038.0,124368.0,28.821121,44.483338,0.232510,DBMEA_cost
338,line_90_8.tsp,60845.0,65825.0,123373.0,28.282006,39.745050,0.231058,GILS_cost


In [4]:
# get unique file names
files = df['file'].unique()
files

array(['att48.tsp', 'berlin52.tsp', 'brazil58.tsp', 'burma14.tsp',
       'dantzig42.tsp', 'eil101.tsp', 'eil51.tsp', 'eil76.tsp',
       'gr48.tsp', 'gr96.tsp', 'hk48.tsp', 'kroA100.tsp', 'kroB100.tsp',
       'kroC100.tsp', 'kroD100.tsp', 'kroE100.tsp', 'lin105.tsp',
       'mod_att48.tsp', 'mod_berlin52.tsp', 'mod_burma14.tsp',
       'mod_eil101.tsp', 'mod_eil51.tsp', 'mod_eil76.tsp', 'mod_gr96.tsp',
       'mod_kroA100.tsp', 'mod_kroB100.tsp', 'mod_kroC100.tsp',
       'mod_kroD100.tsp', 'mod_kroE100.tsp', 'mod_lin105.tsp',
       'mod_pr107.tsp', 'mod_pr76.tsp', 'mod_rat99.tsp', 'mod_st70.tsp',
       'pr107.tsp', 'pr76.tsp', 'random_100_0.tsp', 'random_100_1.tsp',
       'random_100_2.tsp', 'random_100_3.tsp', 'random_100_4.tsp',
       'random_100_5.tsp', 'random_100_6.tsp', 'random_100_7.tsp',
       'random_100_8.tsp', 'random_100_9.tsp', 'random_10_0.tsp',
       'random_10_1.tsp', 'random_10_2.tsp', 'random_10_3.tsp',
       'random_10_4.tsp', 'random_10_5.tsp', 'random_10_

In [9]:
# create dict for all the characterstics
characteristics = {"file":[], "n": [], "m": [], "ratio1": [], "ratio2": [], "density": [], "avg_clustering": [], "mst_size": [], "tsp_approx":[],
                   "edge_min": [], "edge_max": [], "edge_mean": [], "edge_std": [], "edge_var": [], "edge_q25": [], "edge_q75": [],
                   "between_min": [], "between_max": [], "between_mean": [], "between_std": [], "between_var": [], "between_q25": [], "between_q75": [],
                   "closeness_min": [], "closeness_max": [], "closeness_mean": [], "closeness_std": [], "closeness_var": [], "closeness_q25": [], "closeness_q75": []}

In [10]:
def tspCost(graph, dist_matrix):
    t = christofides(graph)
    # remove last index of t
    t = [x - 1 for x in t]
    t.pop()
    t = list(t)

    cost = sum((len(t) - i - 1) * dist_matrix[t[i], t[i + 1]] for i in range(len(t) - 1))
    return cost

In [11]:
for f in files:

    if f == "swiss42.tsp":
        continue

    problem = tsplib95.load('Instances/' + f)
    graph = problem.get_graph()
    dist_matrix = nx.to_numpy_matrix(graph)
    dist_array = nx.to_numpy_array(graph)

    n = len(graph.nodes)

    # get the number of edges
    m = len(graph.edges)

    # get the ratio of nodes to edges and vice versa
    ratio1 = n/m
    ratio2 = m/n
    # get the density of the graph
    density = nx.density(graph)

    # edge cost statistics
    edge_min = np.min(dist_matrix)
    edge_max = np.max(dist_matrix)
    edge_mean = np.mean(dist_matrix)
    edge_std = np.std(dist_matrix)
    edge_var = np.var(dist_matrix)
    edge_q25 = np.quantile(dist_array, 0.25)
    edge_q75 = np.quantile(dist_array, 0.75)

    # local clustering statistics
    avg_clustering = nx.average_clustering(graph, weight='weight')

    # minimum spanning tree size
    mst = nx.minimum_spanning_tree(graph, weight='weight')
    mst_size = len(mst.edges)
    tsp_approx = tspCost(graph, dist_matrix)

    # betweenness centrality statistics
    between = nx.betweenness_centrality(graph, weight='weight')
    between_min = np.min(list(between.values()))
    between_max = np.max(list(between.values()))
    between_mean = np.mean(list(between.values()))
    between_std = np.std(list(between.values()))
    between_var = np.var(list(between.values()))
    between_q25 = np.quantile(list(between.values()), 0.25)
    between_q75 = np.quantile(list(between.values()), 0.75)

    # closeness centrality statistics
    closeness = nx.closeness_centrality(graph, distance='weight')
    closeness_min = np.min(list(closeness.values()))
    closeness_max = np.max(list(closeness.values()))
    closeness_mean = np.mean(list(closeness.values()))
    closeness_std = np.std(list(closeness.values()))
    closeness_var = np.var(list(closeness.values()))
    closeness_q25 = np.quantile(list(closeness.values()), 0.25)
    closeness_q75 = np.quantile(list(closeness.values()), 0.75)

    # add information to the dictionary
    characteristics["file"].append(f)
    characteristics["n"].append(n)
    characteristics["m"].append(m)
    characteristics["ratio1"].append(ratio1)
    characteristics["ratio2"].append(ratio2)
    characteristics["density"].append(density)
    characteristics["avg_clustering"].append(avg_clustering)
    characteristics["mst_size"].append(mst_size)
    characteristics["tsp_approx"].append(tsp_approx)
    characteristics["edge_min"].append(edge_min)
    characteristics["edge_max"].append(edge_max)
    characteristics["edge_mean"].append(edge_mean)
    characteristics["edge_std"].append(edge_std)
    characteristics["edge_var"].append(edge_var)
    characteristics["edge_q25"].append(edge_q25)
    characteristics["edge_q75"].append(edge_q75)
    characteristics["between_min"].append(between_min)
    characteristics["between_max"].append(between_max)
    characteristics["between_mean"].append(between_mean)
    characteristics["between_std"].append(between_std)
    characteristics["between_var"].append(between_var)
    characteristics["between_q25"].append(between_q25)
    characteristics["between_q75"].append(between_q75)
    characteristics["closeness_min"].append(closeness_min)
    characteristics["closeness_max"].append(closeness_max)
    characteristics["closeness_mean"].append(closeness_mean)
    characteristics["closeness_std"].append(closeness_std)
    characteristics["closeness_var"].append(closeness_var)
    characteristics["closeness_q25"].append(closeness_q25)
    characteristics["closeness_q75"].append(closeness_q75)

    print(f'{f} done')



att48.tsp done
berlin52.tsp done
brazil58.tsp done
burma14.tsp done
dantzig42.tsp done
eil101.tsp done
eil51.tsp done
eil76.tsp done
gr48.tsp done
gr96.tsp done
hk48.tsp done
kroA100.tsp done
kroB100.tsp done
kroC100.tsp done
kroD100.tsp done
kroE100.tsp done
lin105.tsp done
mod_att48.tsp done
mod_berlin52.tsp done
mod_burma14.tsp done
mod_eil101.tsp done
mod_eil51.tsp done
mod_eil76.tsp done
mod_gr96.tsp done
mod_kroA100.tsp done
mod_kroB100.tsp done
mod_kroC100.tsp done
mod_kroD100.tsp done
mod_kroE100.tsp done
mod_lin105.tsp done
mod_pr107.tsp done
mod_pr76.tsp done
mod_rat99.tsp done
mod_st70.tsp done
pr107.tsp done
pr76.tsp done
random_100_0.tsp done
random_100_1.tsp done
random_100_2.tsp done
random_100_3.tsp done
random_100_4.tsp done
random_100_5.tsp done
random_100_6.tsp done
random_100_7.tsp done
random_100_8.tsp done
random_100_9.tsp done
random_10_0.tsp done
random_10_1.tsp done
random_10_2.tsp done
random_10_3.tsp done
random_10_4.tsp done
random_10_5.tsp done
random_10_6.

In [12]:
chars = pd.DataFrame.from_dict(characteristics)
chars

Unnamed: 0,file,n,m,ratio1,ratio2,density,avg_clustering,mst_size,tsp_approx,edge_min,...,between_var,between_q25,between_q75,closeness_min,closeness_max,closeness_mean,closeness_std,closeness_var,closeness_q25,closeness_q75
0,att48.tsp,48,1176,0.040816,24.5,1.042553,0.345227,47,333391.0,0.0,...,0.000008,0.022051,0.026965,0.000574,0.001325,0.001012,0.000205,4.221718e-08,0.000889,0.001191
1,berlin52.tsp,52,1378,0.037736,26.5,1.039216,0.301939,51,209005.0,0.0,...,0.000065,0.021686,0.032662,0.000919,0.002555,0.001893,0.000507,2.566472e-07,0.001447,0.002371
2,brazil58.tsp,58,1711,0.033898,29.5,1.035088,0.213223,57,3344980.0,0.0,...,0.000591,0.020050,0.042040,0.000201,0.000736,0.000562,0.000145,2.104825e-08,0.000458,0.000692
3,burma14.tsp,14,105,0.133333,7.5,1.153846,0.341466,13,24377.0,1.0,...,0.000097,0.000000,0.006410,0.001313,0.002677,0.002193,0.000407,1.654824e-07,0.002048,0.002518
4,dantzig42.tsp,42,903,0.046512,21.5,1.048780,0.346021,41,14880.0,0.0,...,0.000788,0.033226,0.062177,0.008369,0.019061,0.014587,0.003118,9.721282e-06,0.011758,0.016824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334,line_90_5.tsp,90,4095,0.021978,45.5,1.022472,0.288284,89,69594.0,0.0,...,0.000435,0.050520,0.072751,0.002192,0.004113,0.003227,0.000642,4.123292e-07,0.002620,0.003881
335,line_90_6.tsp,90,4095,0.021978,45.5,1.022472,0.269990,89,43838.0,0.0,...,0.000730,0.049839,0.071527,0.002220,0.004410,0.003481,0.000709,5.026715e-07,0.002873,0.004060
336,line_90_7.tsp,90,4095,0.021978,45.5,1.022472,0.238308,89,82432.0,0.0,...,0.000393,0.048856,0.069497,0.002238,0.005424,0.004240,0.000992,9.846681e-07,0.003531,0.005157
337,line_90_8.tsp,90,4095,0.021978,45.5,1.022472,0.282686,89,84499.0,0.0,...,0.000680,0.049190,0.072960,0.002171,0.004137,0.003256,0.000649,4.205608e-07,0.002625,0.003914


In [13]:
df

Unnamed: 0,file,GILS_cost,DBMEA_cost,GRASP_cost,GILS_time,DBMEA_time,GRASP_time,min_method
0,att48.tsp,226229.0,207771.0,301702.0,6.474685,5.288543,0.026019,DBMEA_cost
1,berlin52.tsp,145419.0,143278.0,191072.0,6.054587,5.374160,0.031533,DBMEA_cost
2,brazil58.tsp,532454.0,535150.0,709540.0,8.013889,9.045840,0.043413,GILS_cost
3,burma14.tsp,16457.0,16457.0,18393.0,0.156218,0.082072,0.001000,GRASP_time
4,dantzig42.tsp,12392.0,12256.0,14523.0,2.579336,3.325098,0.016044,DBMEA_cost
...,...,...,...,...,...,...,...,...
335,line_90_5.tsp,55730.0,57558.0,157350.0,35.477098,36.326975,0.233857,GILS_cost
336,line_90_6.tsp,48884.0,45916.0,128976.0,30.191444,37.348990,0.232372,DBMEA_cost
337,line_90_7.tsp,51464.0,49038.0,124368.0,28.821121,44.483338,0.232510,DBMEA_cost
338,line_90_8.tsp,60845.0,65825.0,123373.0,28.282006,39.745050,0.231058,GILS_cost


In [14]:
# merge the two dataframes on filename
df2 = pd.merge(df, chars, on='file')
df2

Unnamed: 0,file,GILS_cost,DBMEA_cost,GRASP_cost,GILS_time,DBMEA_time,GRASP_time,min_method,n,m,...,between_var,between_q25,between_q75,closeness_min,closeness_max,closeness_mean,closeness_std,closeness_var,closeness_q25,closeness_q75
0,att48.tsp,226229.0,207771.0,301702.0,6.474685,5.288543,0.026019,DBMEA_cost,48,1176,...,0.000008,0.022051,0.026965,0.000574,0.001325,0.001012,0.000205,4.221718e-08,0.000889,0.001191
1,berlin52.tsp,145419.0,143278.0,191072.0,6.054587,5.374160,0.031533,DBMEA_cost,52,1378,...,0.000065,0.021686,0.032662,0.000919,0.002555,0.001893,0.000507,2.566472e-07,0.001447,0.002371
2,brazil58.tsp,532454.0,535150.0,709540.0,8.013889,9.045840,0.043413,GILS_cost,58,1711,...,0.000591,0.020050,0.042040,0.000201,0.000736,0.000562,0.000145,2.104825e-08,0.000458,0.000692
3,burma14.tsp,16457.0,16457.0,18393.0,0.156218,0.082072,0.001000,GRASP_time,14,105,...,0.000097,0.000000,0.006410,0.001313,0.002677,0.002193,0.000407,1.654824e-07,0.002048,0.002518
4,dantzig42.tsp,12392.0,12256.0,14523.0,2.579336,3.325098,0.016044,DBMEA_cost,42,903,...,0.000788,0.033226,0.062177,0.008369,0.019061,0.014587,0.003118,9.721282e-06,0.011758,0.016824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334,line_90_5.tsp,55730.0,57558.0,157350.0,35.477098,36.326975,0.233857,GILS_cost,90,4095,...,0.000435,0.050520,0.072751,0.002192,0.004113,0.003227,0.000642,4.123292e-07,0.002620,0.003881
335,line_90_6.tsp,48884.0,45916.0,128976.0,30.191444,37.348990,0.232372,DBMEA_cost,90,4095,...,0.000730,0.049839,0.071527,0.002220,0.004410,0.003481,0.000709,5.026715e-07,0.002873,0.004060
336,line_90_7.tsp,51464.0,49038.0,124368.0,28.821121,44.483338,0.232510,DBMEA_cost,90,4095,...,0.000393,0.048856,0.069497,0.002238,0.005424,0.004240,0.000992,9.846681e-07,0.003531,0.005157
337,line_90_8.tsp,60845.0,65825.0,123373.0,28.282006,39.745050,0.231058,GILS_cost,90,4095,...,0.000680,0.049190,0.072960,0.002171,0.004137,0.003256,0.000649,4.205608e-07,0.002625,0.003914


In [15]:
df2.min_method.value_counts()

min_method
DBMEA_cost    186
GILS_cost     120
GRASP_time     33
Name: count, dtype: int64

In [16]:
# save df2 to csv
df2.to_csv('matrix_final.csv', index=False)

In [17]:
# train a classifier to predict the min_method


In [18]:
# get the data
df2 = pd.read_csv('matrix_final.csv')
df2

Unnamed: 0,file,GILS_cost,DBMEA_cost,GRASP_cost,GILS_time,DBMEA_time,GRASP_time,min_method,n,m,...,between_var,between_q25,between_q75,closeness_min,closeness_max,closeness_mean,closeness_std,closeness_var,closeness_q25,closeness_q75
0,att48.tsp,226229.0,207771.0,301702.0,6.474685,5.288543,0.026019,DBMEA_cost,48,1176,...,0.000008,0.022051,0.026965,0.000574,0.001325,0.001012,0.000205,4.221718e-08,0.000889,0.001191
1,berlin52.tsp,145419.0,143278.0,191072.0,6.054587,5.374160,0.031533,DBMEA_cost,52,1378,...,0.000065,0.021686,0.032662,0.000919,0.002555,0.001893,0.000507,2.566472e-07,0.001447,0.002371
2,brazil58.tsp,532454.0,535150.0,709540.0,8.013889,9.045840,0.043413,GILS_cost,58,1711,...,0.000591,0.020050,0.042040,0.000201,0.000736,0.000562,0.000145,2.104825e-08,0.000458,0.000692
3,burma14.tsp,16457.0,16457.0,18393.0,0.156218,0.082072,0.001000,GRASP_time,14,105,...,0.000097,0.000000,0.006410,0.001313,0.002677,0.002193,0.000407,1.654824e-07,0.002048,0.002518
4,dantzig42.tsp,12392.0,12256.0,14523.0,2.579336,3.325098,0.016044,DBMEA_cost,42,903,...,0.000788,0.033226,0.062177,0.008369,0.019061,0.014587,0.003118,9.721282e-06,0.011758,0.016824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334,line_90_5.tsp,55730.0,57558.0,157350.0,35.477098,36.326975,0.233857,GILS_cost,90,4095,...,0.000435,0.050520,0.072751,0.002192,0.004113,0.003227,0.000642,4.123292e-07,0.002620,0.003881
335,line_90_6.tsp,48884.0,45916.0,128976.0,30.191444,37.348990,0.232372,DBMEA_cost,90,4095,...,0.000730,0.049839,0.071527,0.002220,0.004410,0.003481,0.000709,5.026715e-07,0.002873,0.004060
336,line_90_7.tsp,51464.0,49038.0,124368.0,28.821121,44.483338,0.232510,DBMEA_cost,90,4095,...,0.000393,0.048856,0.069497,0.002238,0.005424,0.004240,0.000992,9.846681e-07,0.003531,0.005157
337,line_90_8.tsp,60845.0,65825.0,123373.0,28.282006,39.745050,0.231058,GILS_cost,90,4095,...,0.000680,0.049190,0.072960,0.002171,0.004137,0.003256,0.000649,4.205608e-07,0.002625,0.003914


In [19]:
# get the features
X = df2.drop(['file', 'min_method', "GILS_cost", "GILS_time", "GRASP_cost", "GRASP_time", "DBMEA_cost", "DBMEA_time"], axis=1)
y = df2['min_method']


In [21]:
# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [22]:
# train the classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)


In [23]:
# get the predictions
y_pred = clf.predict(X_test)


In [24]:
# get the accuracy
accuracy_score(y_test, y_pred)

0.5980392156862745

In [25]:
# get the confusion matrix
confusion_matrix(y_test, y_pred)

array([[41, 21,  2],
       [13, 14,  2],
       [ 1,  2,  6]], dtype=int64)

In [26]:
# get the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

  DBMEA_cost       0.75      0.64      0.69        64
   GILS_cost       0.38      0.48      0.42        29
  GRASP_time       0.60      0.67      0.63         9

    accuracy                           0.60       102
   macro avg       0.57      0.60      0.58       102
weighted avg       0.63      0.60      0.61       102



In [31]:
# rank the features by importance
feature_importances = pd.DataFrame(clf.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
# plot the features
plt.barh(feature_importances.index, feature_importances.importance)
plt.show()


Unnamed: 0,importance
between_q25,0.061629
tsp_approx,0.061198
between_mean,0.058622
between_q75,0.057403
avg_clustering,0.049619
density,0.048625
between_max,0.03924
between_std,0.036947
m,0.035831
ratio2,0.034066
