In [1]:
# --- import packages ---
import sys 
sys.path.append("../..") 
from source import utils,env,main,test
import numpy as np
import pandas as pd
import os
import gc
from ucimlrepo import fetch_ucirepo 
from sklearn.preprocessing import MinMaxScaler
import time
import seaborn as sns
from matplotlib import pyplot
import matplotlib.pyplot as plt

### Read data

In [2]:
# Path of the 'real-dataset' folder located in the parent of the current
# working directory. os.path.join is used instead of hard-coded '\\' so the
# notebook also runs on Linux/macOS; the trailing '' keeps the trailing path
# separator that this variable has always carried.
abspath = os.path.join(os.path.dirname(os.getcwd()), 'real-dataset', '')

# Dataset switches: the first flag that is True (in the order tested below)
# decides which dataset is loaded into X (features) and Y (response).
diabete_flag = False
micromass_flag = False
crime_flag = True  # missing value, filled with 0
UJIndoor_flag = False
parkinsons_flag = False
cnae_flag = False

# set true if perform normalization on features, otherwise set false
normalize = True

if diabete_flag:
    # Diabete
    read_X = pd.read_csv(os.path.join(abspath, 'Diabete', 'data64.txt'), sep='\t')
    read_Y = pd.read_csv(os.path.join(abspath, 'Diabete', 'original_data.txt'), sep='\t')

    X = read_X.to_numpy()
    Y = read_Y.to_numpy()[:, -1]  # response is the last column

    # center the response (based on the requirements of using diabetes dataset from LARS)
    # https://hastie.su.domains/Papers/LARS/
    Y = Y - np.mean(Y)

elif micromass_flag:

    # micromass
    read_X = pd.read_csv(os.path.join(abspath, 'micromass', 'pure_spectra_matrix.csv'),
                    sep=';',           # Semicolon-separated fields
                    header=None,       # No header row
                    decimal='.')       # Decimal point (not comma)
    read_Y = pd.read_csv(os.path.join(abspath, 'micromass', 'pure_spectra_metadata.csv'),
                          sep=';')     # Semicolon-separated fields

    X = read_X.to_numpy()
    Y = read_Y.to_numpy()[:, -1]  # response is the last metadata column


elif crime_flag:
    # crime
    # fetch dataset (UCI repository id 183)
    communities_and_crime = fetch_ucirepo(id=183)

    # data (as pandas dataframes)
    read_X = communities_and_crime.data.features
    read_Y = communities_and_crime.data.targets

    # missing values (encoded as '?') are filled with 0 by default
    read_X = read_X.replace({'?': 0})
    # 'communityname' is dropped before the numeric conversion below
    read_X = read_X.drop('communityname', axis=1)

    X = read_X.to_numpy()
    X = np.asarray(X, dtype=np.float64)

    Y = read_Y.to_numpy().flatten()
    Y = np.asarray(Y, dtype=np.float64)

elif UJIndoor_flag:
    read_X = pd.read_csv(os.path.join(abspath, 'UJIndoorLoc', 'trainingData.csv'))

    X = read_X.iloc[:, :520].to_numpy()  # 520 WAP features

    Y = read_X['FLOOR'].to_numpy()  # classification target
    # Y = read_X['LONGITUDE'].to_numpy()  # regression target


elif parkinsons_flag:

    read_X = pd.read_csv(os.path.join(abspath, 'Parkinsons', 'pd_speech_features.csv'), sep=',')

    # The first parsed row and the first column are skipped; columns 1..753
    # are the features and the last column is the response.
    # NOTE(review): the skipped row is presumably a second header line - confirm.
    X = read_X.iloc[1:, 1:754].to_numpy().astype('float')
    Y = read_X.iloc[1:, -1].to_numpy().astype('float')


elif cnae_flag:
    read_X = pd.read_csv(os.path.join(abspath, 'cnae+9', 'CNAE-9.data'), header=None)

    X = read_X.to_numpy()[:, 1:]  # every column except the first is a feature

    Y = read_X.to_numpy()[:, 0]   # first column is the response

else:
    # Without this guard the notebook would only fail later with a NameError on X.
    raise ValueError("No dataset selected: set one of the dataset flags to True.")


# Normalize the features: scale every column of X to unit Euclidean (L2) norm.
# Columns whose norm is 0 are left unchanged.
if normalize:
    # Work on a float64 copy. X can be an integer array (e.g. when the CSV it
    # was read from holds only integers), and writing the scaled floats back
    # into an integer array would silently truncate them (mostly to 0).
    X = np.array(X, dtype=np.float64)
    for column in range(X.shape[1]):
        column_norm = np.linalg.norm(X[:, column], 2)  # computed once per column
        if column_norm != 0:
            X[:, column] = X[:, column] / column_norm

### Formal test

In [3]:
# General settings shared by every run; the experiment loop forwards them as
# keyword arguments to test.real_ins_test.
test_settings = dict(
    time_lim=15 * 60,
    ite_lim=None,
    gap_lim=1e-2,
    sparsity=10,
    inc_max_num=10,
    inc_max_len=2,
    exc_max_num=X.shape[1] - 10,  # number of feature columns minus 10
    exc_max_len=3,
    solver='gurobi',
)

# Gamma values to test: one per entry of gamma_i_set, computed as
# (1 / env.generate_gamma(...)) divided by the number of rows of X.
gamma_i_set = [1]
gamma_set = [
    (1 / env.generate_gamma(X, test_settings['sparsity'], i)) / X.shape[0]
    for i in gamma_i_set
]

In [5]:
# Result containers; each receives one entry per gamma value.
all_att = []                                   # attribute logs
all_data_SSR, all_data_SCG = [], []            # solution times
all_find_cuts_SSR, all_find_cuts_SCG = [], []  # time spent finding cuts
all_support_SSR, all_zero_SSR = [], []         # recorded SSR cuts
all_inc_SCG, all_exc_SCG = [], []              # recorded SCG cuts

# Run the test once per gamma and collect its three logs.
# NOTE(review): only 'SCG' is requested through test_alg, yet the 'SSR' keys
# are read below - confirm that test.real_ins_test always provides them.
for index, gamma_ in enumerate(gamma_set):
    print('=====')
    time_log, cut_log, att_log = test.real_ins_test(
        X=X,
        Y=Y,
        progress_log=True,
        test_alg=['SCG'],
        gamma_=gamma_,
        **test_settings,
    )

    # solution times
    all_data_SSR.append(time_log['SSR'])
    all_data_SCG.append(time_log['SCG'])

    # attributes
    all_att.append(att_log)

    # cut-finding times
    all_find_cuts_SSR.append(time_log['SSR find cuts'])
    all_find_cuts_SCG.append(time_log['SCG find cuts'])

    # SSR cuts
    all_support_SSR.append(cut_log['SSR_support'])
    all_zero_SSR.append(cut_log['SSR_zero'])

    # SCG cuts
    all_inc_SCG.append(cut_log['SCG_inclusive'])
    all_exc_SCG.append(cut_log['SCG_exclusive'])

    print('One gamma done', gamma_)

### Plot

In [7]:
# Average total running time: one curve per algorithm, plotted against gamma
# Plot settings
sns.set_style('whitegrid')
font1 = dict(weight='bold', size=25)  # legend font
fig = plt.figure(figsize=(15, 10))
ax = fig.gca()

# Draw the lines (red circles for SSR, blue triangles for SCG)
line_SSR, = ax.plot(gamma_set, all_data_SSR,
                    linestyle="-", color="red", marker="o",
                    linewidth=3.5, markersize=13)
line_SCG, = ax.plot(gamma_set, all_data_SCG,
                    linestyle="-", color="blue", marker="v",
                    linewidth=3.5, markersize=13)

# Black frame of width 2 on every side of the axes
for spine in ax.spines.values():
    spine.set_color('black')
    spine.set_linewidth(2)

# Tick labels, axis labels and legend
plt.xticks(fontsize=30, fontweight='bold')
plt.yticks(fontsize=30, fontweight='bold')
ax.set_xlabel('gamma', fontsize=35, fontweight='bold')
ax.set_ylabel('time(s)', fontsize=35, labelpad=20, fontweight='bold')
ax.legend([line_SSR, line_SCG], ['SSR', 'SCG'], loc='upper right', prop=font1)

In [9]:
# relative time gap: (SSR time - SCG time) / SSR time, one value per gamma
relative_time_gap = [
    (all_data_SSR[k] - all_data_SCG[k]) / all_data_SSR[k]
    for k in range(len(gamma_set))
]

# Plot settings
sns.set_style('whitegrid')
font1 = dict(weight='bold', size=25)  # defined as in the other plot cells; not used by this figure
fig = plt.figure(figsize=(15, 10))
ax = fig.gca()

# Draw the relative gap against gamma
line_avg_gap, = ax.plot(gamma_set, relative_time_gap,
                        linestyle="-", color="red", marker="o",
                        linewidth=3.5, markersize=13)

# Black frame of width 2 on every side of the axes
for spine in ax.spines.values():
    spine.set_color('black')
    spine.set_linewidth(2)

# Tick labels and axis labels
plt.xticks(fontsize=30, fontweight='bold')
plt.yticks(fontsize=30, fontweight='bold')
ax.set_xlabel('gamma', fontsize=35, fontweight='bold')
ax.set_ylabel('relative time gap', fontsize=35, labelpad=20, fontweight='bold')

In [11]:
# get the relative objective gap: (SSR obj - SCG obj) / SSR obj, one value per gamma
obj_gap_list = [
    (all_att[k]['SSR obj'] - all_att[k]['SCG obj']) / all_att[k]['SSR obj']
    for k in range(len(gamma_set))
]

# Plot settings
sns.set_style('whitegrid')
font1 = dict(weight='bold', size=25)  # defined as in the other plot cells; not used by this figure
fig = plt.figure(figsize=(15, 10))
ax = fig.gca()

# Draw the relative objective gap against gamma
line_avg_gap, = ax.plot(gamma_set, obj_gap_list,
                        linestyle="-", color="red", marker="o",
                        linewidth=3.5, markersize=13)

# Black frame of width 2 on every side of the axes
for spine in ax.spines.values():
    spine.set_color('black')
    spine.set_linewidth(2)

# Tick labels and axis labels
plt.xticks(fontsize=30, fontweight='bold')
plt.yticks(fontsize=30, fontweight='bold')
ax.set_xlabel('gamma', fontsize=35, fontweight='bold')
ax.set_ylabel('relative objective gap', fontsize=35, labelpad=20, fontweight='bold')

In [13]:
# get the MIPGap difference: SSR MIPGap minus SCG MIPGap, one value per gamma
ratio_list = [
    all_att[k]['SSR MIPGap'] - all_att[k]['SCG MIPGap']
    for k in range(len(gamma_set))
]

# Plot settings
sns.set_style('whitegrid')
font1 = dict(weight='bold', size=25)  # defined as in the other plot cells; not used by this figure
fig = plt.figure(figsize=(15, 10))
ax = fig.gca()

# Draw the MIPGap difference against gamma
line_avg_gap, = ax.plot(gamma_set, ratio_list,
                        linestyle="-", color="red", marker="o",
                        linewidth=3.5, markersize=13)

# Black frame of width 2 on every side of the axes
for spine in ax.spines.values():
    spine.set_color('black')
    spine.set_linewidth(2)

# Tick labels and axis labels
plt.xticks(fontsize=30, fontweight='bold')
plt.yticks(fontsize=30, fontweight='bold')
ax.set_xlabel('gamma', fontsize=35, fontweight='bold')
ax.set_ylabel('MIPGap difference', fontsize=35, labelpad=20, fontweight='bold')