In [1]:
import pandas as pd
import numpy as np
import subprocess
from itertools import chain, product

In [2]:
PARAMS_TO_FLAGS = {
    'dataset': '-i',
    'output_file': '-o',
    'mode': '-m',
    'k': '-k',
    'alpha': '-a',
    'threshold_frecuency_low': '-f_low',
    'threshold_frecuency_high': '-f_high',
}

def build_grid(ranges):
    """Build grid based on ranges."""
    def as_list(elmt):
        """Convert elmt to list if elmt is not list."""
        if isinstance(elmt, list):
            return elmt
        else:
            return [elmt]

    grid = product(*(as_list(i) for i in ranges.values()))
    for p in grid:
        yield (dict(zip(ranges.keys(), p)))

In [3]:
grid_ranges = {
    'dataset': ['../data/train_test_sample_1000.csv'],
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0, 1],
    'k': [1, 2, 3, 4, 5, 10, 20, 30, 50],
    'alpha': [0, 1, 5, 10, 15, 20, 25, 30, 50],
    'threshold_frecuency_low': [0.01],
    'threshold_frecuency_high': [0.99],
}

# Un generator(iterador) de los puntos de la grilla
grid = list(build_grid(ranges=grid_ranges))
print(grid[0])
print(grid[1])
print(grid[2])

{'dataset': '../data/train_test_sample_1000.csv', 'output_file': '../../datos/out.csv', 'mode': 0, 'k': 1, 'alpha': 0, 'threshold_frecuency_low': 0.01, 'threshold_frecuency_high': 0.99}
{'dataset': '../data/train_test_sample_1000.csv', 'output_file': '../../datos/out.csv', 'mode': 0, 'k': 1, 'alpha': 1, 'threshold_frecuency_low': 0.01, 'threshold_frecuency_high': 0.99}
{'dataset': '../data/train_test_sample_1000.csv', 'output_file': '../../datos/out.csv', 'mode': 0, 'k': 1, 'alpha': 5, 'threshold_frecuency_low': 0.01, 'threshold_frecuency_high': 0.99}


In [4]:
def parse_results(output, niters):
    N, time, eigen_val, *eigen_vec = list(map(float, output.decode().split('\n')[:-1]))
    return {
        'N': N,
        'time': time,
        'niters': niters,
        'eigen_val': eigen_val,
        'eigen_vec': np.array(eigen_vec),
    }


def output_to_dict(output):
    # Parsear la salida a un diccionario
    lines = output.decode().split('\n')
    d = dict(tuple(l.split(': ')) for l in lines if ': ' in l)
    for k, v in d.items():
        if k != 'Dataset File':
            d[k] = float(v)
    return d

def run(exc_path, params):
    args = tuple(chain(*((PARAMS_TO_FLAGS[p], str(v)) for p, v in params.items())))
    args = [exc_path, *args]
    output = subprocess.check_output(args)
    return output_to_dict(output)

In [5]:
EXEC_PATH = '../executables/tp2'
param_set = {
    'dataset': '../data/train_test_sample_1000.csv', 
    'output_file': '../../datos/out.csv', 
    'mode': 0, 
    'k': 1, 
    'alpha': 0, 
    'threshold_frecuency_low': 0.01, 
    'threshold_frecuency_high': 0.99
}
out = run(EXEC_PATH, param_set)
out

{'Dataset File': '../data/train_test_sample_1000.csv',
 'mode': 0.0,
 'k': 1.0,
 'alpha': 0.0,
 'threshold_frecuency_low': 0.01,
 'threshold_frecuency_high': 0.99,
 'tp': 713.0,
 'fp': 437.0,
 'tn': 788.0,
 'fn': 562.0,
 'Accuracy': 0.6004,
 'Precision': 0.559216,
 'Recall': 0.62,
 'F1': 0.588041,
 'Time': 11.528}

In [6]:
# Correrlo para una grilla chica para ver que pasa con los thresholds
grid_ranges = {
    'dataset': ['../data/train_test_sample_1000.csv'],
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0],
    'k': [5],
    'alpha': [0],
    'threshold_frecuency_low': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25],
    'threshold_frecuency_high': [0.8, 0.85, 0.9, 0.95, 0.99],
}
results = []
grid = list(build_grid(grid_ranges))
for i, point in enumerate(grid):
    print('\r', f'Running for point {i+1}/{len(grid)}', end='         ')
    out = run(EXEC_PATH, point)
    results.append(out)
results_df = pd.DataFrame(results)

 Running for point 30/30         

In [7]:
results_df.head()

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
0,0.5792,../data/train_test_sample_1000.csv,0.574089,0.556078,0.593305,4.93782,0.0,566.0,486.0,5.0,0.0,0.8,0.01,739.0,709.0
1,0.5884,../data/train_test_sample_1000.csv,0.583232,0.564706,0.603015,4.97401,0.0,555.0,474.0,5.0,0.0,0.85,0.01,751.0,720.0
2,0.5932,../data/train_test_sample_1000.csv,0.588759,0.57098,0.607679,4.92615,0.0,547.0,470.0,5.0,0.0,0.9,0.01,755.0,728.0
3,0.5896,../data/train_test_sample_1000.csv,0.580196,0.556078,0.606501,4.96373,0.0,566.0,460.0,5.0,0.0,0.95,0.01,765.0,709.0
4,0.6004,../data/train_test_sample_1000.csv,0.588041,0.559216,0.62,4.98864,0.0,562.0,437.0,5.0,0.0,0.99,0.01,788.0,713.0


In [8]:
# results_df.sort_values('Accuracy')

In [9]:
# pareceria ser que los valores de los thresholds en 0.2 y 0.95 son los mejores.
# Igual es un conjunto de datos bastante chico habria que probar con mas datos

In [10]:
import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

grid_ranges = {
    'dataset': ['../data/train_test_sample_1000.csv'],
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0],
    'k': [5],
    'alpha': [0],
    'threshold_frecuency_low': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5],
    'threshold_frecuency_high': [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99],
}
grid = list(build_grid(grid_ranges))
res = run_grid_in_parallel(grid)

 Point 63/63 done         

In [11]:
res.sort_values('Accuracy', ascending=False).head()

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
8,0.6104,../data/train_test_sample_1000.csv,0.592469,0.555294,0.634978,2.26049,0.0,567.0,407.0,5.0,0.0,0.7,0.05,818.0,708.0
4,0.6056,../data/train_test_sample_1000.csv,0.590191,0.556863,0.627763,2.20125,0.0,565.0,421.0,5.0,0.0,0.75,0.05,804.0,710.0
5,0.604,../data/train_test_sample_1000.csv,0.594262,0.568627,0.622318,2.21171,0.0,550.0,440.0,5.0,0.0,0.99,0.05,785.0,725.0
14,0.604,../data/train_test_sample_1000.csv,0.594262,0.568627,0.622318,1.06052,0.0,550.0,440.0,5.0,0.0,0.99,0.1,785.0,725.0
1,0.602,../data/train_test_sample_1000.csv,0.587308,0.555294,0.623239,1.04956,0.0,567.0,428.0,5.0,0.0,0.7,0.1,797.0,708.0


In [12]:
def score_metrics(scores, metrics=['Accuracy', 'F1','Precision', 'Recall'], top=5):
    for metric in metrics:
        for t in range(top):
            scores.sort_values(metric, ascending=False).iloc[t]
        
        
    


In [6]:
import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

grid_ranges = {
    'dataset': ['../data/train_test_sample_1000.csv'],
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [1],
    'k': [1, 2, 3, 5, 7, 9, 10, 15, 20],
    'alpha': [25, 50, 75, 100, 150],
    'threshold_frecuency_low': [0.05],
    'threshold_frecuency_high': [0.99],
}
grid = list(build_grid(grid_ranges))
res_2 = run_grid_in_parallel(grid)

 Point 45/45 done         

In [7]:
res_2.sort_values('Precision', ascending=False)

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
22,0.6004,../data/train_test_sample_1000.csv,0.593737,0.572549,0.616554,23.3296,75.0,545.0,454.0,7.0,1.0,0.99,0.05,771.0,730.0
38,0.6004,../data/train_test_sample_1000.csv,0.593737,0.572549,0.616554,14.6964,75.0,545.0,454.0,20.0,1.0,0.99,0.05,771.0,730.0
26,0.6004,../data/train_test_sample_1000.csv,0.593737,0.572549,0.616554,23.0277,75.0,545.0,454.0,10.0,1.0,0.99,0.05,771.0,730.0
30,0.6004,../data/train_test_sample_1000.csv,0.593737,0.572549,0.616554,23.5549,75.0,545.0,454.0,9.0,1.0,0.99,0.05,771.0,730.0
12,0.6004,../data/train_test_sample_1000.csv,0.593737,0.572549,0.616554,24.1412,75.0,545.0,454.0,5.0,1.0,0.99,0.05,771.0,730.0
37,0.6004,../data/train_test_sample_1000.csv,0.593737,0.572549,0.616554,14.9028,75.0,545.0,454.0,15.0,1.0,0.99,0.05,771.0,730.0
11,0.6004,../data/train_test_sample_1000.csv,0.593737,0.572549,0.616554,24.0807,75.0,545.0,454.0,1.0,1.0,0.99,0.05,771.0,730.0
5,0.6004,../data/train_test_sample_1000.csv,0.593737,0.572549,0.616554,23.8461,75.0,545.0,454.0,2.0,1.0,0.99,0.05,771.0,730.0
3,0.6004,../data/train_test_sample_1000.csv,0.593737,0.572549,0.616554,23.6718,75.0,545.0,454.0,3.0,1.0,0.99,0.05,771.0,730.0
41,0.594,../data/train_test_sample_1000.csv,0.588569,0.569412,0.60906,14.9465,50.0,549.0,466.0,20.0,1.0,0.99,0.05,759.0,726.0


In [10]:
#Experimentamos variando el tamaño del train test
#KNN + PCA
import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

file_names = ['../data/train_test_sample_' + str(i) + '.csv' for i in range(1000,25001,1000)]
grid_ranges = {
    'dataset': file_names,
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [1],
    'k': [15],
    'alpha': [50],
    'threshold_frecuency_low': [0.05],
    'threshold_frecuency_high': [0.99],
}
grid = list(build_grid(grid_ranges))
res_3 = run_grid_in_parallel(grid)

 Point 25/25 done         

In [14]:
res_3

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
0,0.594,../data/train_test_sample_1000.csv,0.588569,0.569412,0.60906,23.8505,50.0,549.0,466.0,15.0,1.0,0.99,0.05,759.0,726.0
1,0.5976,../data/train_test_sample_2000.csv,0.599522,0.590588,0.608731,29.2015,50.0,522.0,484.0,15.0,1.0,0.99,0.05,741.0,753.0
2,0.6204,../data/train_test_sample_3000.csv,0.594964,0.546667,0.652622,32.2075,50.0,578.0,371.0,15.0,1.0,0.99,0.05,854.0,697.0
3,0.604,../data/train_test_sample_4000.csv,0.592593,0.564706,0.623377,36.2984,50.0,555.0,435.0,15.0,1.0,0.99,0.05,790.0,720.0
4,0.6136,../data/train_test_sample_5000.csv,0.616362,0.608627,0.624296,39.5772,50.0,499.0,467.0,15.0,1.0,0.99,0.05,758.0,776.0
5,0.6056,../data/train_test_sample_6000.csv,0.596893,0.572549,0.623399,42.5151,50.0,545.0,441.0,15.0,1.0,0.99,0.05,784.0,730.0
6,0.6152,../data/train_test_sample_7000.csv,0.607667,0.584314,0.632965,45.5954,50.0,530.0,432.0,15.0,1.0,0.99,0.05,793.0,745.0
7,0.6256,../data/train_test_sample_8000.csv,0.608368,0.570196,0.652018,48.7446,50.0,548.0,388.0,15.0,1.0,0.99,0.05,837.0,727.0
8,0.6208,../data/train_test_sample_9000.csv,0.619888,0.606275,0.634126,51.7279,50.0,502.0,446.0,15.0,1.0,0.99,0.05,779.0,773.0
9,0.6204,../data/train_test_sample_10000.csv,0.613442,0.590588,0.638136,54.5909,50.0,522.0,427.0,15.0,1.0,0.99,0.05,798.0,753.0


In [25]:
res_3.to_csv('KNN_PCA_TrainTestSize.csv')

In [15]:
#Experimentamos variando el tamaño del train test
#KNN
import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

file_names = ['../data/train_test_sample_' + str(i) + '.csv' for i in range(1000,25001,1000)]
grid_ranges = {
    'dataset': file_names,
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0],
    'k': [15],
    'alpha': [0],
    'threshold_frecuency_low': [0.05],
    'threshold_frecuency_high': [0.99],
}
grid = list(build_grid(grid_ranges))
res_4 = run_grid_in_parallel(grid)

 Point 25/25 done         

In [16]:
res_4

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
0,0.604,../data/train_test_sample_1000.csv,0.594262,0.568627,0.622318,5.01979,0.0,550.0,440.0,15.0,0.0,0.99,0.05,785.0,725.0
1,0.6052,../data/train_test_sample_2000.csv,0.605042,0.592941,0.617647,10.5459,0.0,519.0,468.0,15.0,0.0,0.99,0.05,757.0,756.0
2,0.6252,../data/train_test_sample_3000.csv,0.552745,0.454118,0.706098,15.9254,0.0,696.0,241.0,15.0,0.0,0.99,0.05,984.0,579.0
3,0.6048,../data/train_test_sample_4000.csv,0.59106,0.56,0.625767,21.349,0.0,561.0,427.0,15.0,0.0,0.99,0.05,798.0,714.0
4,0.6116,../data/train_test_sample_5000.csv,0.620851,0.623529,0.618196,26.9702,0.0,480.0,491.0,15.0,0.0,0.99,0.05,734.0,795.0
5,0.6052,../data/train_test_sample_6000.csv,0.584071,0.543529,0.631148,32.2539,0.0,582.0,405.0,15.0,0.0,0.99,0.05,820.0,693.0
6,0.6196,../data/train_test_sample_7000.csv,0.596863,0.552157,0.649446,37.6013,0.0,571.0,380.0,15.0,0.0,0.99,0.05,845.0,704.0
7,0.6256,../data/train_test_sample_8000.csv,0.60473,0.561569,0.655078,43.1002,0.0,559.0,377.0,15.0,0.0,0.99,0.05,848.0,716.0
8,0.6156,../data/train_test_sample_9000.csv,0.600083,0.56549,0.639184,48.4234,0.0,554.0,407.0,15.0,0.0,0.99,0.05,818.0,721.0
9,0.6328,../data/train_test_sample_10000.csv,0.627435,0.606275,0.650126,53.9336,0.0,502.0,416.0,15.0,0.0,0.99,0.05,809.0,773.0


In [26]:
res_4.to_csv('KNN_TrainTestSize.csv')

In [None]:
#Como se relaciona k con el tamaño del training set

In [17]:
#Experimentamos variando el tamaño del train test y k
#KNN + PCA

import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

file_names = ['../data/train_test_sample_' + str(i) + '.csv' for i in range(1000,25001,2000)]
grid_ranges = {
    'dataset': file_names,
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [1],
    'k': [1, 2, 3, 5, 7, 9, 10, 15, 20],
    'alpha': [50],
    'threshold_frecuency_low': [0.05],
    'threshold_frecuency_high': [0.99],
}
grid = list(build_grid(grid_ranges))
res_5 = run_grid_in_parallel(grid)

 Point 117/117 done         

In [23]:
res_5

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
0,0.5940,../data/train_test_sample_1000.csv,0.588569,0.569412,0.609060,23.0656,50.0,549.0,466.0,9.0,1.0,0.99,0.05,759.0,726.0
1,0.5940,../data/train_test_sample_1000.csv,0.588569,0.569412,0.609060,23.1416,50.0,549.0,466.0,2.0,1.0,0.99,0.05,759.0,726.0
2,0.5940,../data/train_test_sample_1000.csv,0.588569,0.569412,0.609060,23.2897,50.0,549.0,466.0,15.0,1.0,0.99,0.05,759.0,726.0
3,0.5940,../data/train_test_sample_1000.csv,0.588569,0.569412,0.609060,23.4285,50.0,549.0,466.0,20.0,1.0,0.99,0.05,759.0,726.0
4,0.5940,../data/train_test_sample_1000.csv,0.588569,0.569412,0.609060,23.5082,50.0,549.0,466.0,1.0,1.0,0.99,0.05,759.0,726.0
5,0.5940,../data/train_test_sample_1000.csv,0.588569,0.569412,0.609060,23.5243,50.0,549.0,466.0,7.0,1.0,0.99,0.05,759.0,726.0
6,0.5940,../data/train_test_sample_1000.csv,0.588569,0.569412,0.609060,23.5474,50.0,549.0,466.0,5.0,1.0,0.99,0.05,759.0,726.0
7,0.5940,../data/train_test_sample_1000.csv,0.588569,0.569412,0.609060,23.7210,50.0,549.0,466.0,3.0,1.0,0.99,0.05,759.0,726.0
8,0.5940,../data/train_test_sample_1000.csv,0.588569,0.569412,0.609060,23.8369,50.0,549.0,466.0,10.0,1.0,0.99,0.05,759.0,726.0
9,0.6204,../data/train_test_sample_3000.csv,0.594964,0.546667,0.652622,31.0133,50.0,578.0,371.0,9.0,1.0,0.99,0.05,854.0,697.0


In [27]:
res_5.to_csv('KNN_PCA_TrainTestSize_and_K.csv')

In [18]:
#Experimentamos variando el tamaño del train test y k
#KNN

import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

file_names = ['../data/train_test_sample_' + str(i) + '.csv' for i in range(1000,25001,2000)]
grid_ranges = {
    'dataset': file_names,
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0],
    'k': [1, 2, 3, 5, 7, 9, 10, 15, 20],
    'alpha': [0],
    'threshold_frecuency_low': [0.05],
    'threshold_frecuency_high': [0.99],
}
grid = list(build_grid(grid_ranges))
res_6 = run_grid_in_parallel(grid)

 Point 117/117 done         

In [24]:
res_6

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
0,0.6040,../data/train_test_sample_1000.csv,0.594262,0.568627,0.622318,5.28457,0.0,550.0,440.0,7.0,0.0,0.99,0.05,785.0,725.0
1,0.6040,../data/train_test_sample_1000.csv,0.594262,0.568627,0.622318,5.28718,0.0,550.0,440.0,9.0,0.0,0.99,0.05,785.0,725.0
2,0.6040,../data/train_test_sample_1000.csv,0.594262,0.568627,0.622318,5.31222,0.0,550.0,440.0,15.0,0.0,0.99,0.05,785.0,725.0
3,0.6040,../data/train_test_sample_1000.csv,0.594262,0.568627,0.622318,5.32196,0.0,550.0,440.0,1.0,0.0,0.99,0.05,785.0,725.0
4,0.6040,../data/train_test_sample_1000.csv,0.594262,0.568627,0.622318,5.31951,0.0,550.0,440.0,20.0,0.0,0.99,0.05,785.0,725.0
5,0.6040,../data/train_test_sample_1000.csv,0.594262,0.568627,0.622318,5.31848,0.0,550.0,440.0,5.0,0.0,0.99,0.05,785.0,725.0
6,0.6040,../data/train_test_sample_1000.csv,0.594262,0.568627,0.622318,5.32747,0.0,550.0,440.0,2.0,0.0,0.99,0.05,785.0,725.0
7,0.6040,../data/train_test_sample_1000.csv,0.594262,0.568627,0.622318,5.36180,0.0,550.0,440.0,10.0,0.0,0.99,0.05,785.0,725.0
8,0.6040,../data/train_test_sample_1000.csv,0.594262,0.568627,0.622318,5.36856,0.0,550.0,440.0,3.0,0.0,0.99,0.05,785.0,725.0
9,0.6252,../data/train_test_sample_3000.csv,0.552745,0.454118,0.706098,15.94720,0.0,696.0,241.0,2.0,0.0,0.99,0.05,984.0,579.0


In [28]:
res_6.to_csv('KNN_TrainTestSize_and_K.csv')