In [1]:
import pandas as pd
import numpy as np
import subprocess
from itertools import chain, product

In [2]:
PARAMS_TO_FLAGS = {
    'dataset': '-i',
    'output_file': '-o',
    'mode': '-m',
    'k': '-k',
    'alpha': '-a',
    'threshold_frecuency_low': '-f_low',
    'threshold_frecuency_high': '-f_high',
}

def build_grid(ranges):
    """Build grid based on ranges."""
    def as_list(elmt):
        """Convert elmt to list if elmt is not list."""
        if isinstance(elmt, list):
            return elmt
        else:
            return [elmt]

    grid = product(*(as_list(i) for i in ranges.values()))
    for p in grid:
        yield (dict(zip(ranges.keys(), p)))

In [3]:
grid_ranges = {
    'dataset': ['../data/train_test_sample_1000.csv'],
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0, 1],
    'k': [1, 2, 3, 4, 5, 10, 20, 30, 50],
    'alpha': [0, 1, 5, 10, 15, 20, 25, 30, 50],
    'threshold_frecuency_low': [0.01],
    'threshold_frecuency_high': [0.99],
}

# Un generator(iterador) de los puntos de la grilla
grid = list(build_grid(ranges=grid_ranges))
print(grid[0])
print(grid[1])
print(grid[2])

{'dataset': '../data/train_test_sample_1000.csv', 'output_file': '../../datos/out.csv', 'mode': 0, 'k': 1, 'alpha': 0, 'threshold_frecuency_low': 0.01, 'threshold_frecuency_high': 0.99}
{'dataset': '../data/train_test_sample_1000.csv', 'output_file': '../../datos/out.csv', 'mode': 0, 'k': 1, 'alpha': 1, 'threshold_frecuency_low': 0.01, 'threshold_frecuency_high': 0.99}
{'dataset': '../data/train_test_sample_1000.csv', 'output_file': '../../datos/out.csv', 'mode': 0, 'k': 1, 'alpha': 5, 'threshold_frecuency_low': 0.01, 'threshold_frecuency_high': 0.99}


In [4]:
def parse_results(output, niters):
    N, time, eigen_val, *eigen_vec = list(map(float, output.decode().split('\n')[:-1]))
    return {
        'N': N,
        'time': time,
        'niters': niters,
        'eigen_val': eigen_val,
        'eigen_vec': np.array(eigen_vec),
    }


def output_to_dict(output):
    # Parsear la salida a un diccionario
    lines = output.decode().split('\n')
    d = dict(tuple(l.split(': ')) for l in lines if ': ' in l)
    for k, v in d.items():
        if k != 'Dataset File':
            d[k] = float(v)
    return d

def run(exc_path, params):
    args = tuple(chain(*((PARAMS_TO_FLAGS[p], str(v)) for p, v in params.items())))
    args = [exc_path, *args]
    output = subprocess.check_output(args)
    return output_to_dict(output)

In [20]:
EXEC_PATH = '../executables/tp2'
param_set = {
    'dataset': '../data/train_test_sample_1000.csv', 
    'output_file': '../../datos/out.csv', 
    'mode': 0, 
    'k': 1, 
    'alpha': 0, 
    'threshold_frecuency_low': 0.01, 
    'threshold_frecuency_high': 0.99
}
out = run(EXEC_PATH, param_set)
out

{'Dataset File': '../data/train_test_sample_1000.csv',
 'mode': 0.0,
 'k': 1.0,
 'alpha': 0.0,
 'threshold_frecuency_low': 0.01,
 'threshold_frecuency_high': 0.99,
 'tp': 557.0,
 'fp': 400.0,
 'tn': 837.0,
 'fn': 706.0,
 'Accuracy': 0.5576,
 'Precision': 0.441013,
 'Recall': 0.582027,
 'F1': 0.501802,
 'Time': 4.43996}

In [21]:
# Correrlo para una grilla chica para ver que pasa con los thresholds
grid_ranges = {
    'dataset': ['../data/train_test_sample_1000.csv'],
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0],
    'k': [5],
    'alpha': [0],
    'threshold_frecuency_low': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25],
    'threshold_frecuency_high': [0.8, 0.85, 0.9, 0.95, 0.99],
}
results = []
grid = list(build_grid(grid_ranges))
for i, point in enumerate(grid):
    print('\r', f'Running for point {i+1}/{len(grid)}', end='         ')
    out = run(EXEC_PATH, point)
    results.append(out)
results_df = pd.DataFrame(results)

 Running for point 30/30         

In [22]:
results_df.head()

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
0,0.574,../data/train_test_sample_1000.csv,0.492616,0.409343,0.618421,4.41691,0.0,746.0,319.0,5.0,0.0,0.8,0.01,918.0,517.0
1,0.5704,../data/train_test_sample_1000.csv,0.487595,0.404592,0.613445,4.42123,0.0,752.0,322.0,5.0,0.0,0.85,0.01,915.0,511.0
2,0.5692,../data/train_test_sample_1000.csv,0.488361,0.406968,0.610451,4.41246,0.0,749.0,328.0,5.0,0.0,0.9,0.01,909.0,514.0
3,0.57,../data/train_test_sample_1000.csv,0.477902,0.389549,0.61809,4.38617,0.0,771.0,304.0,5.0,0.0,0.95,0.01,933.0,492.0
4,0.5816,../data/train_test_sample_1000.csv,0.503795,0.420428,0.628402,4.37539,0.0,732.0,314.0,5.0,0.0,0.99,0.01,923.0,531.0


In [8]:
# results_df.sort_values('Accuracy')

In [9]:
# pareceria ser que los valores de los thresholds en 0.2 y 0.95 son los mejores.
# Igual es un conjunto de datos bastante chico habria que probar con mas datos

In [23]:
import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

grid_ranges = {
    'dataset': ['../data/train_test_sample_1000.csv'],
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0],
    'k': [5],
    'alpha': [0],
    'threshold_frecuency_low': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5],
    'threshold_frecuency_high': [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99],
}
grid = list(build_grid(grid_ranges))
res = run_grid_in_parallel(grid)

 Point 63/63 done         

In [24]:
res.sort_values('Accuracy', ascending=False).head()

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
30,0.5872,../data/train_test_sample_1000.csv,0.530055,0.460808,0.623794,0.50937,0.0,681.0,351.0,5.0,0.0,0.99,0.2,886.0,582.0
13,0.5856,../data/train_test_sample_1000.csv,0.52302,0.449723,0.624862,0.844421,0.0,695.0,341.0,5.0,0.0,0.99,0.1,896.0,568.0
20,0.5844,../data/train_test_sample_1000.csv,0.525354,0.455265,0.62095,0.679705,0.0,688.0,351.0,5.0,0.0,0.99,0.15,886.0,575.0
8,0.5844,../data/train_test_sample_1000.csv,0.521419,0.448139,0.623348,1.70399,0.0,697.0,342.0,5.0,0.0,0.99,0.05,895.0,566.0
59,0.5816,../data/train_test_sample_1000.csv,0.503795,0.420428,0.628402,6.06411,0.0,732.0,314.0,5.0,0.0,0.99,0.01,923.0,531.0


In [25]:
def score_metrics(scores, metrics=['Accuracy', 'F1','Precision', 'Recall'], top=5):
    for metric in metrics:
        for t in range(top):
            scores.sort_values(metric, ascending=False).iloc[t]
        
        
    


In [18]:
import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

grid_ranges = {
    'dataset': ['../data/train_test_sample_1000.csv'],
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [1],
    'k': [1, 2, 3, 5, 7, 9, 10, 15, 20],
    'alpha': [25, 50, 75, 100, 150],
    'threshold_frecuency_low': [0.05],
    'threshold_frecuency_high': [0.99],
}
grid = list(build_grid(grid_ranges))
res_2 = run_grid_in_parallel(grid)

 Point 45/45 done         

In [19]:
res_2.sort_values('Precision', ascending=False)

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
11,0.57,../data/train_test_sample_1000.csv,0.539615,0.498812,0.587687,9.90495,100.0,633.0,442.0,1.0,1.0,0.99,0.05,795.0,630.0
0,0.5464,../data/train_test_sample_1000.csv,0.524329,0.494854,0.557538,9.65571,25.0,638.0,496.0,1.0,1.0,0.99,0.05,741.0,625.0
7,0.568,../data/train_test_sample_1000.csv,0.534884,0.491686,0.586402,9.80549,75.0,642.0,438.0,1.0,1.0,0.99,0.05,799.0,621.0
15,0.5652,../data/train_test_sample_1000.csv,0.530859,0.486936,0.583491,10.0974,150.0,648.0,439.0,1.0,1.0,0.99,0.05,798.0,615.0
4,0.5584,../data/train_test_sample_1000.csv,0.526587,0.486144,0.574369,9.7764,50.0,649.0,455.0,1.0,1.0,0.99,0.05,782.0,614.0
8,0.58,../data/train_test_sample_1000.csv,0.535398,0.479018,0.60682,9.82377,75.0,658.0,392.0,3.0,1.0,0.99,0.05,845.0,605.0
3,0.5732,../data/train_test_sample_1000.csv,0.530163,0.476643,0.597222,9.75154,50.0,661.0,406.0,3.0,1.0,0.99,0.05,831.0,602.0
10,0.5904,../data/train_test_sample_1000.csv,0.537906,0.471892,0.625393,9.83525,75.0,667.0,357.0,5.0,1.0,0.99,0.05,880.0,596.0
12,0.5708,../data/train_test_sample_1000.csv,0.524169,0.467933,0.595766,9.92552,100.0,672.0,401.0,3.0,1.0,0.99,0.05,836.0,591.0
1,0.5516,../data/train_test_sample_1000.csv,0.513244,0.467933,0.568269,9.66071,25.0,672.0,449.0,3.0,1.0,0.99,0.05,788.0,591.0


In [6]:
#Experimentamos variando el tamaño del train test
#KNN + PCA
import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

file_names = ['../data/train_test_sample_' + str(i) + '.csv' for i in range(1000,25001,1000)]
grid_ranges = {
    'dataset': file_names,
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [1],
    'k': [15],
    'alpha': [50],
    'threshold_frecuency_low': [0.05],
    'threshold_frecuency_high': [0.99],
}
grid = list(build_grid(grid_ranges))
res_3 = run_grid_in_parallel(grid)

 Point 25/25 done         

In [7]:
res_3

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
0,0.5944,../data/train_test_sample_1000.csv,0.5125,0.422011,0.652387,9.78971,50.0,730.0,284.0,15.0,1.0,0.99,0.05,953.0,533.0
1,0.6156,../data/train_test_sample_2000.csv,0.549883,0.46885,0.664779,11.289,50.0,665.0,296.0,15.0,1.0,0.99,0.05,952.0,587.0
2,0.6344,../data/train_test_sample_3000.csv,0.57919,0.505627,0.677802,13.2235,50.0,615.0,299.0,15.0,1.0,0.99,0.05,957.0,629.0
3,0.6344,../data/train_test_sample_4000.csv,0.591234,0.546281,0.64425,15.1268,50.0,549.0,365.0,15.0,1.0,0.99,0.05,925.0,661.0
4,0.6232,../data/train_test_sample_5000.csv,0.590791,0.544872,0.645161,16.5521,50.0,568.0,374.0,15.0,1.0,0.99,0.05,878.0,680.0
5,0.6364,../data/train_test_sample_6000.csv,0.60323,0.541536,0.680788,18.1901,50.0,585.0,324.0,15.0,1.0,0.99,0.05,900.0,691.0
6,0.6548,../data/train_test_sample_7000.csv,0.636019,0.58314,0.699443,19.4085,50.0,539.0,324.0,15.0,1.0,0.99,0.05,883.0,754.0
7,0.634,../data/train_test_sample_8000.csv,0.606113,0.583264,0.630824,20.7922,50.0,503.0,412.0,15.0,1.0,0.99,0.05,881.0,704.0
8,0.6444,../data/train_test_sample_9000.csv,0.617634,0.582792,0.656908,21.7368,50.0,514.0,375.0,15.0,1.0,0.99,0.05,893.0,718.0
9,0.6272,../data/train_test_sample_10000.csv,0.60205,0.555118,0.657649,22.8624,50.0,565.0,367.0,15.0,1.0,0.99,0.05,863.0,705.0


In [8]:
res_3.to_csv('KNN_PCA_TrainTestSize.csv')

In [9]:
#Experimentamos variando el tamaño del train test
#KNN
import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

file_names = ['../data/train_test_sample_' + str(i) + '.csv' for i in range(1000,25001,1000)]
grid_ranges = {
    'dataset': file_names,
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0],
    'k': [15],
    'alpha': [0],
    'threshold_frecuency_low': [0.05],
    'threshold_frecuency_high': [0.99],
}
grid = list(build_grid(grid_ranges))
res_4 = run_grid_in_parallel(grid)

 Point 25/25 done         

In [10]:
res_4

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
0,0.5848,../data/train_test_sample_1000.csv,0.495136,0.403009,0.641866,1.63204,0.0,754.0,284.0,15.0,0.0,0.99,0.05,953.0,509.0
1,0.6052,../data/train_test_sample_2000.csv,0.511144,0.412141,0.672751,3.87094,0.0,736.0,251.0,15.0,0.0,0.99,0.05,997.0,516.0
2,0.6396,../data/train_test_sample_3000.csv,0.583063,0.506431,0.687023,6.20983,0.0,614.0,287.0,15.0,0.0,0.99,0.05,969.0,630.0
3,0.6356,../data/train_test_sample_4000.csv,0.574895,0.509091,0.660236,8.40499,0.0,594.0,317.0,15.0,0.0,0.99,0.05,973.0,616.0
4,0.6312,../data/train_test_sample_5000.csv,0.588393,0.528045,0.664315,10.6727,0.0,589.0,333.0,15.0,0.0,0.99,0.05,919.0,659.0
5,0.6112,../data/train_test_sample_6000.csv,0.586383,0.539969,0.641527,12.7648,0.0,587.0,385.0,15.0,0.0,0.99,0.05,839.0,689.0
6,0.648,../data/train_test_sample_7000.csv,0.626486,0.570766,0.694262,14.9122,0.0,555.0,325.0,15.0,0.0,0.99,0.05,882.0,738.0
7,0.644,../data/train_test_sample_8000.csv,0.607237,0.570008,0.649669,17.2733,0.0,519.0,371.0,15.0,0.0,0.99,0.05,922.0,688.0
8,0.6412,../data/train_test_sample_9000.csv,0.624529,0.605519,0.644771,19.3317,0.0,486.0,411.0,15.0,0.0,0.99,0.05,857.0,746.0
9,0.6296,../data/train_test_sample_10000.csv,0.603935,0.555906,0.661049,21.3494,0.0,564.0,362.0,15.0,0.0,0.99,0.05,868.0,706.0


In [11]:
res_4.to_csv('KNN_TrainTestSize.csv')

In [None]:
#Como se relaciona k con el tamaño del training set

In [26]:
#Experimentamos variando el tamaño del train test y k
#KNN + PCA

import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

file_names = ['../data/train_test_sample_' + str(i) + '.csv' for i in range(1000,25001,2000)]
grid_ranges = {
    'dataset': file_names,
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [1],
    'k': [1, 2, 3, 5, 7, 9, 10, 15, 20, 50, 100, 250, 500],
    'alpha': [50],
    'threshold_frecuency_low': [0.05],
    'threshold_frecuency_high': [0.99],
}
grid = list(build_grid(grid_ranges))
res_5 = run_grid_in_parallel(grid)

 Point 169/169 done         

In [27]:
res_5

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
0,0.6024,../data/train_test_sample_1000.csv,0.469584,0.348377,0.720131,9.67878,50.0,823.0,171.0,50.0,1.0,0.99,0.05,1066.0,440.0
1,0.5552,../data/train_test_sample_1000.csv,0.375982,0.265241,0.645472,9.69777,50.0,928.0,184.0,2.0,1.0,0.99,0.05,1053.0,335.0
2,0.5944,../data/train_test_sample_1000.csv,0.512500,0.422011,0.652387,9.69876,50.0,730.0,284.0,15.0,1.0,0.99,0.05,953.0,533.0
3,0.5732,../data/train_test_sample_1000.csv,0.440482,0.332542,0.652174,9.69159,50.0,843.0,224.0,10.0,1.0,0.99,0.05,1013.0,420.0
4,0.5704,../data/train_test_sample_1000.csv,0.512704,0.447348,0.600425,9.71188,50.0,698.0,376.0,5.0,1.0,0.99,0.05,861.0,565.0
5,0.5584,../data/train_test_sample_1000.csv,0.526587,0.486144,0.574369,9.72201,50.0,649.0,455.0,1.0,1.0,0.99,0.05,782.0,614.0
6,0.5568,../data/train_test_sample_1000.csv,0.306633,0.193983,0.731343,9.72193,50.0,1018.0,90.0,500.0,1.0,0.99,0.05,1147.0,245.0
7,0.5772,../data/train_test_sample_1000.csv,0.506305,0.429137,0.617312,9.72454,50.0,721.0,336.0,9.0,1.0,0.99,0.05,901.0,542.0
8,0.5800,../data/train_test_sample_1000.csv,0.371257,0.245447,0.761671,9.72841,50.0,953.0,97.0,250.0,1.0,0.99,0.05,1140.0,310.0
9,0.5876,../data/train_test_sample_1000.csv,0.462180,0.350752,0.677370,9.72138,50.0,820.0,211.0,20.0,1.0,0.99,0.05,1026.0,443.0


In [28]:
res_5.to_csv('KNN_PCA_TrainTestSize_and_K.csv')

In [29]:
#Experimentamos variando el tamaño del train test y k
#KNN

import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

file_names = ['../data/train_test_sample_' + str(i) + '.csv' for i in range(1000,25001,2000)]
grid_ranges = {
    'dataset': file_names,
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0],
    'k': [1, 2, 3, 5, 7, 9, 10, 15, 20, 50, 100, 250, 500],
    'alpha': [0],
    'threshold_frecuency_low': [0.05],
    'threshold_frecuency_high': [0.99],
}
grid = list(build_grid(grid_ranges))
res_6 = run_grid_in_parallel(grid)

 Point 169/169 done         

In [30]:
res_6

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
0,0.6096,../data/train_test_sample_1000.csv,0.455357,0.323040,0.771267,1.78221,0.0,855.0,121.0,100.0,0.0,0.99,0.05,1116.0,408.0
1,0.5864,../data/train_test_sample_1000.csv,0.451750,0.337292,0.683788,1.78067,0.0,837.0,197.0,20.0,0.0,0.99,0.05,1040.0,426.0
2,0.5844,../data/train_test_sample_1000.csv,0.521419,0.448139,0.623348,1.80168,0.0,697.0,342.0,5.0,0.0,0.99,0.05,895.0,566.0
3,0.5920,../data/train_test_sample_1000.csv,0.511962,0.423595,0.646917,1.81217,0.0,728.0,292.0,9.0,0.0,0.99,0.05,945.0,535.0
4,0.5544,../data/train_test_sample_1000.csv,0.356069,0.243864,0.659529,1.81915,0.0,955.0,159.0,2.0,0.0,0.99,0.05,1078.0,308.0
5,0.5768,../data/train_test_sample_1000.csv,0.519091,0.452098,0.609392,1.83706,0.0,692.0,366.0,3.0,0.0,0.99,0.05,871.0,571.0
6,0.5632,../data/train_test_sample_1000.csv,0.521053,0.470309,0.584071,1.83777,0.0,669.0,423.0,1.0,0.0,0.99,0.05,814.0,594.0
7,0.5828,../data/train_test_sample_1000.csv,0.507787,0.425970,0.628505,1.83631,0.0,725.0,318.0,7.0,0.0,0.99,0.05,919.0,538.0
8,0.5848,../data/train_test_sample_1000.csv,0.495136,0.403009,0.641866,1.84645,0.0,754.0,284.0,15.0,0.0,0.99,0.05,953.0,509.0
9,0.5748,../data/train_test_sample_1000.csv,0.348253,0.224861,0.771739,1.85083,0.0,979.0,84.0,250.0,0.0,0.99,0.05,1153.0,284.0


In [31]:
res_6.to_csv('KNN_TrainTestSize_and_K.csv')