In [34]:
import pandas as pd
import numpy as np
import subprocess
from itertools import chain, product

In [29]:
# Maps each parameter name used in the grid/parameter dictionaries to the
# command-line flag that run() emits immediately before that parameter's value.
PARAMS_TO_FLAGS = {
    'dataset': '-i',
    'output_file': '-o',
    'mode': '-m',
    'k': '-k',
    'alpha': '-a',
    'threshold_frecuency_low': '-f_low',
    'threshold_frecuency_high': '-f_high',
}

def build_grid(ranges):
    """Lazily yield every point of the Cartesian product described by ranges.

    Args:
        ranges: Mapping from parameter name to either a list of candidate
            values or a single value. Anything that is not a ``list`` is
            treated as a one-element list containing that value.

    Yields:
        One dict per grid point, mapping each parameter name to the value
        chosen for it. Points are produced in ``itertools.product`` order
        (the last parameter varies fastest).
    """
    # Normalise every entry to a list so that product() can iterate over it.
    axes = [
        values if isinstance(values, list) else [values]
        for values in ranges.values()
    ]
    for combo in product(*axes):
        yield dict(zip(ranges.keys(), combo))

In [30]:
# Candidate values for each parameter; build_grid combines them into grid points.
grid_ranges = {
    'dataset': ['../data/train_test_sample.csv'],
    'output_file': ['../../datos/out.csv'], # Does not do anything yet
    'mode': [0, 1],
    'k': [1, 2, 3, 4, 5, 10, 20, 30, 50],
    'alpha': [0, 1, 5, 10, 15, 20, 25, 30, 50],
    'threshold_frecuency_low': [0.01],
    'threshold_frecuency_high': [0.99],
}

# build_grid returns a generator of grid points; materialize it into a list
# so that individual points can be indexed below.
grid = list(build_grid(ranges=grid_ranges))
# Show the first three points as a quick sanity check.
print(grid[0])
print(grid[1])
print(grid[2])

{'dataset': '../data/train_test_sample.csv', 'output_file': '../../datos/out.csv', 'mode': 0, 'k': 1, 'alpha': 0, 'threshold_frecuency_low': 0.01, 'threshold_frecuency_high': 0.99}
{'dataset': '../data/train_test_sample.csv', 'output_file': '../../datos/out.csv', 'mode': 0, 'k': 1, 'alpha': 1, 'threshold_frecuency_low': 0.01, 'threshold_frecuency_high': 0.99}
{'dataset': '../data/train_test_sample.csv', 'output_file': '../../datos/out.csv', 'mode': 0, 'k': 1, 'alpha': 5, 'threshold_frecuency_low': 0.01, 'threshold_frecuency_high': 0.99}


In [61]:
def parse_results(output, niters):
    """Parse raw process output made of one number per line.

    The first three numbers are stored under ``'N'``, ``'time'`` and
    ``'eigen_val'``; every remaining number is collected into ``'eigen_vec'``.

    Args:
        output: Bytes holding one numeric value per line. A trailing newline
            is optional and blank lines are ignored.
        niters: Value stored unchanged under the ``'niters'`` key.

    Returns:
        dict with keys ``'N'``, ``'time'``, ``'niters'``, ``'eigen_val'``
        (floats, except ``niters``) and ``'eigen_vec'`` (1-D ``np.ndarray``).

    Raises:
        ValueError: If a non-blank line is not a number, or if fewer than
            three numbers are present.
    """
    # Skip blank lines instead of blindly dropping the last element: the old
    # ``split('\n')[:-1]`` lost the final value whenever the output did not end
    # with a newline, and crashed on extra trailing blank lines.
    values = [
        float(line)
        for line in output.decode().splitlines()
        if line.strip()
    ]
    N, time, eigen_val, *eigen_vec = values
    return {
        'N': N,
        'time': time,
        'niters': niters,
        'eigen_val': eigen_val,
        'eigen_vec': np.array(eigen_vec),
    }


def output_to_dict(output):
    """Parse ``key: value`` lines of process output into a dictionary.

    Lines that do not contain the ``': '`` separator are ignored. Every value
    is converted to ``float`` except the one under ``'Dataset File'``, which
    is kept as a string.

    Args:
        output: Bytes produced by the executable, one ``key: value`` per line.

    Returns:
        dict mapping each key to its parsed value, in order of appearance.
        If a key appears more than once, the last occurrence wins.

    Raises:
        ValueError: If a value other than ``'Dataset File'`` is not numeric.
    """
    parsed = {}
    for line in output.decode().split('\n'):
        # Split on the FIRST separator only, so a value that itself contains
        # ': ' (e.g. a file path) stays intact instead of producing a
        # three-element tuple that dict() would reject.
        key, sep, value = line.partition(': ')
        if not sep:
            continue
        parsed[key] = value if key == 'Dataset File' else float(value)
    return parsed

def run(exc_path, params):
    """Run the executable with the given parameters and parse its output.

    Args:
        exc_path: Path of the executable to launch.
        params: Mapping from parameter name to value. Every name must be a
            key of ``PARAMS_TO_FLAGS``; each value is passed as ``str(value)``
            right after its flag.

    Returns:
        The dictionary produced by ``output_to_dict`` from the process stdout.

    Raises:
        KeyError: If a parameter name has no entry in ``PARAMS_TO_FLAGS``.
    """
    flag_args = chain.from_iterable(
        (PARAMS_TO_FLAGS[name], str(value)) for name, value in params.items()
    )
    args = (exc_path, *flag_args)
    # subprocess.run drains stdout while the child is running and closes the
    # pipe afterwards. Calling wait() before reading a PIPE can deadlock once
    # the child fills the OS pipe buffer.
    # The exit status is intentionally not checked, matching prior behaviour.
    completed = subprocess.run(args, stdout=subprocess.PIPE)
    return output_to_dict(completed.stdout)

In [63]:
# Path of the executable launched by run().
EXEC_PATH = '../executables/tp2'
# A single parameter set, used for one trial run before sweeping a grid.
param_set = {
    'dataset': '../data/train_test_sample.csv', 
    'output_file': '../../datos/out.csv', 
    'mode': 0, 
    'k': 1, 
    'alpha': 0, 
    'threshold_frecuency_low': 0.01, 
    'threshold_frecuency_high': 0.99
}
out = run(EXEC_PATH, param_set)
# Display the parsed output dictionary as the cell result.
out

{'Dataset File': '../data/train_test_sample.csv',
 'mode': 0.0,
 'k': 1.0,
 'alpha': 0.0,
 'threshold_frecuency_low': 0.01,
 'threshold_frecuency_high': 0.99,
 'tp': 63.0,
 'fp': 42.0,
 'tn': 218.0,
 'fn': 177.0,
 'Accuracy': 0.562,
 'Precision': 0.2625,
 'Recall': 0.6,
 'F1': 0.365217}

In [64]:
# Run it on a small grid to see how the thresholds affect the results:
# mode, k and alpha are held fixed while both frequency thresholds vary.
grid_ranges = {
    'dataset': ['../data/train_test_sample.csv'],
    'output_file': ['../../datos/out.csv'], # Does not do anything yet
    'mode': [0],
    'k': [5],
    'alpha': [0],
    'threshold_frecuency_low': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25],
    'threshold_frecuency_high': [0.8, 0.85, 0.9, 0.95, 0.99],
}
# One parsed output dict per grid point, in grid order.
results = []
grid = list(build_grid(grid_ranges))
for i, point in enumerate(grid):
    # The leading '\r' moves the cursor back to the start of the line so each
    # progress message overwrites the previous one.
    print('\r', f'Running for point {i+1}/{len(grid)}', end='         ')
    out = run(EXEC_PATH, point)
    results.append(out)
# Build the DataFrame once from the list of dicts (one row per grid point).
results_df = pd.DataFrame(results)

 Running for point 29/30         

In [65]:
results_df.head()

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
0,0.566,../data/train_test_sample.csv,0.323988,0.216667,0.641975,0.0,188.0,29.0,5.0,0.0,0.8,0.01,231.0,52.0
1,0.56,../data/train_test_sample.csv,0.320988,0.216667,0.619048,0.0,188.0,32.0,5.0,0.0,0.85,0.01,228.0,52.0
2,0.57,../data/train_test_sample.csv,0.321767,0.2125,0.662338,0.0,189.0,26.0,5.0,0.0,0.9,0.01,234.0,51.0
3,0.56,../data/train_test_sample.csv,0.31677,0.2125,0.621951,0.0,189.0,31.0,5.0,0.0,0.95,0.01,229.0,51.0
4,0.562,../data/train_test_sample.csv,0.365217,0.2625,0.6,0.0,177.0,42.0,5.0,0.0,0.99,0.01,218.0,63.0


In [66]:
results_df.sort_values('Accuracy')

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
1,0.56,../data/train_test_sample.csv,0.320988,0.216667,0.619048,0.0,188.0,32.0,5.0,0.0,0.85,0.01,228.0,52.0
3,0.56,../data/train_test_sample.csv,0.31677,0.2125,0.621951,0.0,189.0,31.0,5.0,0.0,0.95,0.01,229.0,51.0
4,0.562,../data/train_test_sample.csv,0.365217,0.2625,0.6,0.0,177.0,42.0,5.0,0.0,0.99,0.01,218.0,63.0
0,0.566,../data/train_test_sample.csv,0.323988,0.216667,0.641975,0.0,188.0,29.0,5.0,0.0,0.8,0.01,231.0,52.0
2,0.57,../data/train_test_sample.csv,0.321767,0.2125,0.662338,0.0,189.0,26.0,5.0,0.0,0.9,0.01,234.0,51.0
10,0.578,../data/train_test_sample.csv,0.402266,0.295833,0.628319,0.0,169.0,42.0,5.0,0.0,0.8,0.1,218.0,71.0
6,0.578,../data/train_test_sample.csv,0.395415,0.2875,0.633028,0.0,171.0,40.0,5.0,0.0,0.85,0.05,220.0,69.0
9,0.58,../data/train_test_sample.csv,0.447368,0.354167,0.607143,0.0,155.0,55.0,5.0,0.0,0.99,0.05,205.0,85.0
7,0.58,../data/train_test_sample.csv,0.396552,0.2875,0.638889,0.0,171.0,39.0,5.0,0.0,0.9,0.05,221.0,69.0
21,0.584,../data/train_test_sample.csv,0.425414,0.320833,0.631148,0.0,163.0,45.0,5.0,0.0,0.85,0.2,215.0,77.0


In [67]:
# It would seem that threshold values of 0.2 (low) and 0.95 (high) are the best ones.
# However, this is a fairly small dataset; this should be re-tested with more data.