In [79]:
import pandas as pd
import numpy as np
import subprocess
from itertools import chain, product

In [80]:
PARAMS_TO_FLAGS = {
    'dataset': '-i',
    'output_file': '-o',
    'mode': '-m',
    'k': '-k',
    'alpha': '-a',
    'threshold_frecuency_low': '-f_low',
    'threshold_frecuency_high': '-f_high',
}

def build_grid(ranges):
    """Build grid based on ranges."""
    def as_list(elmt):
        """Convert elmt to list if elmt is not list."""
        if isinstance(elmt, list):
            return elmt
        else:
            return [elmt]

    grid = product(*(as_list(i) for i in ranges.values()))
    for p in grid:
        yield (dict(zip(ranges.keys(), p)))

In [81]:
grid_ranges = {
    'dataset': ['../data/train_test_sample.csv'],
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0, 1],
    'k': [1, 2, 3, 4, 5, 10, 20, 30, 50],
    'alpha': [0, 1, 5, 10, 15, 20, 25, 30, 50],
    'threshold_frecuency_low': [0.01],
    'threshold_frecuency_high': [0.99],
}

# Un generator(iterador) de los puntos de la grilla
grid = list(build_grid(ranges=grid_ranges))
print(grid[0])
print(grid[1])
print(grid[2])

{'dataset': '../data/train_test_sample.csv', 'output_file': '../../datos/out.csv', 'mode': 0, 'k': 1, 'alpha': 0, 'threshold_frecuency_low': 0.01, 'threshold_frecuency_high': 0.99}
{'dataset': '../data/train_test_sample.csv', 'output_file': '../../datos/out.csv', 'mode': 0, 'k': 1, 'alpha': 1, 'threshold_frecuency_low': 0.01, 'threshold_frecuency_high': 0.99}
{'dataset': '../data/train_test_sample.csv', 'output_file': '../../datos/out.csv', 'mode': 0, 'k': 1, 'alpha': 5, 'threshold_frecuency_low': 0.01, 'threshold_frecuency_high': 0.99}


In [76]:
def parse_results(output, niters):
    N, time, eigen_val, *eigen_vec = list(map(float, output.decode().split('\n')[:-1]))
    return {
        'N': N,
        'time': time,
        'niters': niters,
        'eigen_val': eigen_val,
        'eigen_vec': np.array(eigen_vec),
    }


def output_to_dict(output):
    # Parsear la salida a un diccionario
    lines = output.decode().split('\n')
    d = dict(tuple(l.split(': ')) for l in lines if ': ' in l)
    for k, v in d.items():
        if k != 'Dataset File':
            d[k] = float(v)
    return d

def run(exc_path, params):
    args = tuple(chain(*((PARAMS_TO_FLAGS[p], str(v)) for p, v in params.items())))
    args = [exc_path, *args]
    output = subprocess.check_output(args)
    return output_to_dict(output)

In [77]:
EXEC_PATH = '../executables/tp2'
param_set = {
    'dataset': '../data/train_test_sample.csv', 
    'output_file': '../../datos/out.csv', 
    'mode': 0, 
    'k': 1, 
    'alpha': 0, 
    'threshold_frecuency_low': 0.01, 
    'threshold_frecuency_high': 0.99
}
out = run(EXEC_PATH, param_set)
out

{'Dataset File': '../data/train_test_sample.csv',
 'mode': 0.0,
 'k': 1.0,
 'alpha': 0.0,
 'threshold_frecuency_low': 0.01,
 'threshold_frecuency_high': 0.99,
 'tp': 63.0,
 'fp': 42.0,
 'tn': 218.0,
 'fn': 177.0,
 'Accuracy': 0.562,
 'Precision': 0.2625,
 'Recall': 0.6,
 'F1': 0.365217}

In [64]:
# Correrlo para una grilla chica para ver que pasa con los thresholds
grid_ranges = {
    'dataset': ['../data/train_test_sample.csv'],
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0],
    'k': [5],
    'alpha': [0],
    'threshold_frecuency_low': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25],
    'threshold_frecuency_high': [0.8, 0.85, 0.9, 0.95, 0.99],
}
results = []
grid = list(build_grid(grid_ranges))
for i, point in enumerate(grid):
    print('\r', f'Running for point {i+1}/{len(grid)}', end='         ')
    out = run(EXEC_PATH, point)
    results.append(out)
results_df = pd.DataFrame(results)

 Running for point 29/30         

In [65]:
results_df.head()

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
0,0.566,../data/train_test_sample.csv,0.323988,0.216667,0.641975,0.0,188.0,29.0,5.0,0.0,0.8,0.01,231.0,52.0
1,0.56,../data/train_test_sample.csv,0.320988,0.216667,0.619048,0.0,188.0,32.0,5.0,0.0,0.85,0.01,228.0,52.0
2,0.57,../data/train_test_sample.csv,0.321767,0.2125,0.662338,0.0,189.0,26.0,5.0,0.0,0.9,0.01,234.0,51.0
3,0.56,../data/train_test_sample.csv,0.31677,0.2125,0.621951,0.0,189.0,31.0,5.0,0.0,0.95,0.01,229.0,51.0
4,0.562,../data/train_test_sample.csv,0.365217,0.2625,0.6,0.0,177.0,42.0,5.0,0.0,0.99,0.01,218.0,63.0


In [82]:
# results_df.sort_values('Accuracy')

In [67]:
# pareceria ser que los valores de los thresholds en 0.2 y 0.95 son los mejores.
# Igual es un conjunto de datos bastante chico habria que probar con mas datos

In [86]:
import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=4):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

grid_ranges = {
    'dataset': ['../data/train_val_sample.csv'],
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0],
    'k': [5],
    'alpha': [0],
    'threshold_frecuency_low': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5],
    'threshold_frecuency_high': [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99],
}
grid = list(build_grid(grid_ranges))
res = run_grid_in_parallel(grid)

 Point 63/63 done         

In [95]:
res.sort_values('Accuracy', ascending=False).iloc[0]['']

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
34,0.636,../data/train_val_sample.csv,0.538071,0.441667,0.688312,0.0,134.0,48.0,5.0,0.0,0.95,0.2,212.0,106.0
48,0.618,../data/train_val_sample.csv,0.544153,0.475,0.636872,0.0,126.0,65.0,5.0,0.0,0.99,0.3,195.0,114.0
27,0.616,../data/train_val_sample.csv,0.529412,0.45,0.642857,0.0,132.0,60.0,5.0,0.0,0.99,0.15,200.0,108.0
4,0.616,../data/train_val_sample.csv,0.463687,0.345833,0.70339,0.0,157.0,35.0,5.0,0.0,0.7,0.05,225.0,83.0
25,0.616,../data/train_val_sample.csv,0.492063,0.3875,0.673913,0.0,147.0,45.0,5.0,0.0,0.95,0.15,215.0,93.0


In [None]:
def score_metrics(scores, metrics=['Accuracy', 'F1','Precision', 'Recall'], top=5):
    for metric in metrics:
        for t in range(top):
            scores.sort_values(metric, ascending=False).iloc[t]
        
        
    


In [102]:
import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=4):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

grid_ranges = {
    'dataset': ['../data/train_val_sample.csv'],
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [1],
    'k': [1, 2, 3, 5, 7, 9, 10, 15, 20],
    'alpha': [25, 50, 75, 100, 150],
    'threshold_frecuency_low': [0.01],
    'threshold_frecuency_high': [0.99],
}
grid = list(build_grid(grid_ranges))
res = run_grid_in_parallel(grid)

Process ForkPoolWorker-43:
Process ForkPoolWorker-42:
Process ForkPoolWorker-44:
Process ForkPoolWorker-41:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):


KeyboardInterrupt: 

In [101]:
res.sort_values('Precision', ascending=False)

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
30,0.616,../data/train_val_sample.csv,0.580786,0.554167,0.610092,150.0,107.0,85.0,9.0,1.0,0.95,0.2,175.0,133.0
0,0.632,../data/train_val_sample.csv,0.568075,0.504167,0.650538,25.0,119.0,65.0,1.0,1.0,0.95,0.2,195.0,121.0
8,0.632,../data/train_val_sample.csv,0.568075,0.504167,0.650538,25.0,119.0,65.0,3.0,1.0,0.95,0.2,195.0,121.0
20,0.632,../data/train_val_sample.csv,0.568075,0.504167,0.650538,25.0,119.0,65.0,7.0,1.0,0.95,0.2,195.0,121.0
14,0.632,../data/train_val_sample.csv,0.568075,0.504167,0.650538,25.0,119.0,65.0,5.0,1.0,0.95,0.2,195.0,121.0
40,0.632,../data/train_val_sample.csv,0.568075,0.504167,0.650538,25.0,119.0,65.0,20.0,1.0,0.95,0.2,195.0,121.0
24,0.632,../data/train_val_sample.csv,0.568075,0.504167,0.650538,25.0,119.0,65.0,9.0,1.0,0.95,0.2,195.0,121.0
34,0.632,../data/train_val_sample.csv,0.568075,0.504167,0.650538,25.0,119.0,65.0,15.0,1.0,0.95,0.2,195.0,121.0
4,0.632,../data/train_val_sample.csv,0.568075,0.504167,0.650538,25.0,119.0,65.0,2.0,1.0,0.95,0.2,195.0,121.0
29,0.632,../data/train_val_sample.csv,0.568075,0.504167,0.650538,25.0,119.0,65.0,10.0,1.0,0.95,0.2,195.0,121.0
