In [1]:
import pandas as pd
import numpy as np
import subprocess
from itertools import chain, product

In [2]:
PARAMS_TO_FLAGS = {
    'dataset': '-i',
    'output_file': '-o',
    'mode': '-m',
    'k': '-k',
    'alpha': '-a',
    'threshold_frecuency_low': '-f_low',
    'threshold_frecuency_high': '-f_high',
}

def build_grid(ranges):
    """Build grid based on ranges."""
    def as_list(elmt):
        """Convert elmt to list if elmt is not list."""
        if isinstance(elmt, list):
            return elmt
        else:
            return [elmt]

    grid = product(*(as_list(i) for i in ranges.values()))
    for p in grid:
        yield (dict(zip(ranges.keys(), p)))

In [3]:
grid_ranges = {
    'dataset': ['../data/train_test_sample_1000.csv'],
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0, 1],
    'k': [1, 2, 3, 4, 5, 10, 20, 30, 50],
    'alpha': [0, 1, 5, 10, 15, 20, 25, 30, 50],
    'threshold_frecuency_low': [0.01],
    'threshold_frecuency_high': [0.99],
}

# Un generator(iterador) de los puntos de la grilla
grid = list(build_grid(ranges=grid_ranges))
print(grid[0])
print(grid[1])
print(grid[2])

{'dataset': '../data/train_test_sample_1000.csv', 'output_file': '../../datos/out.csv', 'mode': 0, 'k': 1, 'alpha': 0, 'threshold_frecuency_low': 0.01, 'threshold_frecuency_high': 0.99}
{'dataset': '../data/train_test_sample_1000.csv', 'output_file': '../../datos/out.csv', 'mode': 0, 'k': 1, 'alpha': 1, 'threshold_frecuency_low': 0.01, 'threshold_frecuency_high': 0.99}
{'dataset': '../data/train_test_sample_1000.csv', 'output_file': '../../datos/out.csv', 'mode': 0, 'k': 1, 'alpha': 5, 'threshold_frecuency_low': 0.01, 'threshold_frecuency_high': 0.99}


In [4]:
def parse_results(output, niters):
    N, time, eigen_val, *eigen_vec = list(map(float, output.decode().split('\n')[:-1]))
    return {
        'N': N,
        'time': time,
        'niters': niters,
        'eigen_val': eigen_val,
        'eigen_vec': np.array(eigen_vec),
    }


def output_to_dict(output):
    # Parsear la salida a un diccionario
    lines = output.decode().split('\n')
    d = dict(tuple(l.split(': ')) for l in lines if ': ' in l)
    for k, v in d.items():
        if k != 'Dataset File':
            d[k] = float(v)
    return d

def run(exc_path, params):
    args = tuple(chain(*((PARAMS_TO_FLAGS[p], str(v)) for p, v in params.items())))
    args = [exc_path, *args]
    output = subprocess.check_output(args)
    return output_to_dict(output)

In [5]:
EXEC_PATH = '../executables/tp2'
param_set = {
    'dataset': '../data/train_test_sample_1000.csv', 
    'output_file': '../../datos/out.csv', 
    'mode': 0, 
    'k': 1, 
    'alpha': 0, 
    'threshold_frecuency_low': 0.01, 
    'threshold_frecuency_high': 0.99
}
out = run(EXEC_PATH, param_set)
out

{'Dataset File': '../data/train_test_sample_1000.csv',
 'mode': 0.0,
 'k': 1.0,
 'alpha': 0.0,
 'threshold_frecuency_low': 0.01,
 'threshold_frecuency_high': 0.99,
 'tp': 327.0,
 'fp': 228.0,
 'tn': 537.0,
 'fn': 408.0,
 'Accuracy': 0.576,
 'Precision': 0.444898,
 'Recall': 0.589189,
 'F1': 0.506977,
 'Time': 2.7115}

In [21]:
# Correrlo para una grilla chica para ver que pasa con los thresholds
grid_ranges = {
    'dataset': ['../data/train_test_sample_1000.csv'],
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0],
    'k': [5],
    'alpha': [0],
    'threshold_frecuency_low': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25],
    'threshold_frecuency_high': [0.8, 0.85, 0.9, 0.95, 0.99],
}
results = []
grid = list(build_grid(grid_ranges))
for i, point in enumerate(grid):
    print('\r', f'Running for point {i+1}/{len(grid)}', end='         ')
    out = run(EXEC_PATH, point)
    results.append(out)
results_df = pd.DataFrame(results)

 Running for point 30/30         

In [22]:
results_df.head()

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
0,0.574,../data/train_test_sample_1000.csv,0.492616,0.409343,0.618421,4.41691,0.0,746.0,319.0,5.0,0.0,0.8,0.01,918.0,517.0
1,0.5704,../data/train_test_sample_1000.csv,0.487595,0.404592,0.613445,4.42123,0.0,752.0,322.0,5.0,0.0,0.85,0.01,915.0,511.0
2,0.5692,../data/train_test_sample_1000.csv,0.488361,0.406968,0.610451,4.41246,0.0,749.0,328.0,5.0,0.0,0.9,0.01,909.0,514.0
3,0.57,../data/train_test_sample_1000.csv,0.477902,0.389549,0.61809,4.38617,0.0,771.0,304.0,5.0,0.0,0.95,0.01,933.0,492.0
4,0.5816,../data/train_test_sample_1000.csv,0.503795,0.420428,0.628402,4.37539,0.0,732.0,314.0,5.0,0.0,0.99,0.01,923.0,531.0


In [8]:
# results_df.sort_values('Accuracy')

In [9]:
# pareceria ser que los valores de los thresholds en 0.2 y 0.95 son los mejores.
# Igual es un conjunto de datos bastante chico habria que probar con mas datos

In [23]:
import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

grid_ranges = {
    'dataset': ['../data/train_test_sample_1000.csv'],
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0],
    'k': [5],
    'alpha': [0],
    'threshold_frecuency_low': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5],
    'threshold_frecuency_high': [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99],
}
grid = list(build_grid(grid_ranges))
res = run_grid_in_parallel(grid)

 Point 63/63 done         

In [24]:
res.sort_values('Accuracy', ascending=False).head()

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
30,0.5872,../data/train_test_sample_1000.csv,0.530055,0.460808,0.623794,0.50937,0.0,681.0,351.0,5.0,0.0,0.99,0.2,886.0,582.0
13,0.5856,../data/train_test_sample_1000.csv,0.52302,0.449723,0.624862,0.844421,0.0,695.0,341.0,5.0,0.0,0.99,0.1,896.0,568.0
20,0.5844,../data/train_test_sample_1000.csv,0.525354,0.455265,0.62095,0.679705,0.0,688.0,351.0,5.0,0.0,0.99,0.15,886.0,575.0
8,0.5844,../data/train_test_sample_1000.csv,0.521419,0.448139,0.623348,1.70399,0.0,697.0,342.0,5.0,0.0,0.99,0.05,895.0,566.0
59,0.5816,../data/train_test_sample_1000.csv,0.503795,0.420428,0.628402,6.06411,0.0,732.0,314.0,5.0,0.0,0.99,0.01,923.0,531.0


In [6]:
def score_metrics(scores, metrics=['Accuracy', 'F1','Precision', 'Recall'], top=5):
    for metric in metrics:
        for t in range(top):
            scores.sort_values(metric, ascending=False).iloc[t]
        
        
    


In [18]:
import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

grid_ranges = {
    'dataset': ['../data/train_test_sample_1000.csv'],
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [1],
    'k': [1, 2, 3, 5, 7, 9, 10, 15, 20],
    'alpha': [25, 50, 75, 100, 150],
    'threshold_frecuency_low': [0.05],
    'threshold_frecuency_high': [0.99],
}
grid = list(build_grid(grid_ranges))
res_2 = run_grid_in_parallel(grid)

 Point 45/45 done         

In [19]:
res_2.sort_values('Precision', ascending=False)

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
11,0.57,../data/train_test_sample_1000.csv,0.539615,0.498812,0.587687,9.90495,100.0,633.0,442.0,1.0,1.0,0.99,0.05,795.0,630.0
0,0.5464,../data/train_test_sample_1000.csv,0.524329,0.494854,0.557538,9.65571,25.0,638.0,496.0,1.0,1.0,0.99,0.05,741.0,625.0
7,0.568,../data/train_test_sample_1000.csv,0.534884,0.491686,0.586402,9.80549,75.0,642.0,438.0,1.0,1.0,0.99,0.05,799.0,621.0
15,0.5652,../data/train_test_sample_1000.csv,0.530859,0.486936,0.583491,10.0974,150.0,648.0,439.0,1.0,1.0,0.99,0.05,798.0,615.0
4,0.5584,../data/train_test_sample_1000.csv,0.526587,0.486144,0.574369,9.7764,50.0,649.0,455.0,1.0,1.0,0.99,0.05,782.0,614.0
8,0.58,../data/train_test_sample_1000.csv,0.535398,0.479018,0.60682,9.82377,75.0,658.0,392.0,3.0,1.0,0.99,0.05,845.0,605.0
3,0.5732,../data/train_test_sample_1000.csv,0.530163,0.476643,0.597222,9.75154,50.0,661.0,406.0,3.0,1.0,0.99,0.05,831.0,602.0
10,0.5904,../data/train_test_sample_1000.csv,0.537906,0.471892,0.625393,9.83525,75.0,667.0,357.0,5.0,1.0,0.99,0.05,880.0,596.0
12,0.5708,../data/train_test_sample_1000.csv,0.524169,0.467933,0.595766,9.92552,100.0,672.0,401.0,3.0,1.0,0.99,0.05,836.0,591.0
1,0.5516,../data/train_test_sample_1000.csv,0.513244,0.467933,0.568269,9.66071,25.0,672.0,449.0,3.0,1.0,0.99,0.05,788.0,591.0


In [11]:
#Experimentamos variando el tamaño del train test
#KNN + PCA
import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

file_names = ['../data/train_test_sample_' + str(i) + '.csv' for i in range(1000,25001,1000)]
grid_ranges = {
    'dataset': file_names,
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [1],
    'k': [15],
    'alpha': [50],
    'threshold_frecuency_low': [0.05],
    'threshold_frecuency_high': [0.1],
}
grid = list(build_grid(grid_ranges))
res_3 = run_grid_in_parallel(grid)

 Point 25/25 done         

In [12]:
res_3

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
0,0.636643,../data/train_test_sample_1000.csv,0.651912,0.679557,0.626429,3.85764,50.0,2403.0,3039.0,15.0,1.0,0.1,0.05,4439.0,5096.0
1,0.654447,../data/train_test_sample_2000.csv,0.656944,0.660314,0.653608,5.90536,50.0,2549.0,2626.0,15.0,1.0,0.1,0.05,4846.0,4955.0
2,0.648932,../data/train_test_sample_3000.csv,0.665437,0.70004,0.634093,8.28207,50.0,2241.0,3018.0,15.0,1.0,0.1,0.05,4491.0,5230.0
3,0.678614,../data/train_test_sample_4000.csv,0.691287,0.716602,0.667699,10.5815,50.0,2132.0,2683.0,15.0,1.0,0.1,0.05,4776.0,5391.0
4,0.670516,../data/train_test_sample_5000.csv,0.6697,0.666223,0.673214,12.8507,50.0,2507.0,2429.0,15.0,1.0,0.1,0.05,5041.0,5004.0
5,0.672807,../data/train_test_sample_6000.csv,0.66337,0.652438,0.674675,15.4584,50.0,2573.0,2329.0,15.0,1.0,0.1,0.05,5250.0,4830.0
6,0.675613,../data/train_test_sample_7000.csv,0.678148,0.683536,0.672844,18.4795,50.0,2370.0,2489.0,15.0,1.0,0.1,0.05,5001.0,5119.0
7,0.681885,../data/train_test_sample_8000.csv,0.67472,0.661891,0.688057,23.2202,50.0,2525.0,2241.0,15.0,1.0,0.1,0.05,5273.0,4943.0
8,0.683972,../data/train_test_sample_9000.csv,0.687516,0.694204,0.680957,28.312,50.0,2295.0,2441.0,15.0,1.0,0.1,0.05,5040.0,5210.0
9,0.672762,../data/train_test_sample_10000.csv,0.658351,0.638206,0.67981,32.8631,50.0,2678.0,2225.0,15.0,1.0,0.1,0.05,5356.0,4724.0


In [23]:
res_3.to_csv('KNN_PCA_TrainTestSize.csv')

In [14]:
#Experimentamos variando el tamaño del train test
#KNN
import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

file_names = ['../data/train_test_sample_' + str(i) + '.csv' for i in range(1000,25001,1000)]
grid_ranges = {
    'dataset': file_names,
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0],
    'k': [15],
    'alpha': [0],
    'threshold_frecuency_low': [0.05],
    'threshold_frecuency_high': [0.1],
}
grid = list(build_grid(grid_ranges))
res_4 = run_grid_in_parallel(grid)

 Point 25/25 done         

In [15]:
res_4

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
0,0.631301,../data/train_test_sample_1000.csv,0.585995,0.521136,0.669293,5.13904,0.0,3591.0,1931.0,15.0,0.0,0.1,0.05,5547.0,3908.0
1,0.643229,../data/train_test_sample_2000.csv,0.615473,0.569829,0.669066,13.1824,0.0,3228.0,2115.0,15.0,0.0,0.1,0.05,5357.0,4276.0
2,0.638117,../data/train_test_sample_3000.csv,0.666009,0.723464,0.617009,22.2432,0.0,2066.0,3355.0,15.0,0.0,0.1,0.05,4154.0,5405.0
3,0.669804,../data/train_test_sample_4000.csv,0.673185,0.677256,0.669162,29.406,0.0,2428.0,2519.0,15.0,0.0,0.1,0.05,4940.0,5095.0
4,0.659235,../data/train_test_sample_5000.csv,0.677164,0.712821,0.644905,37.7483,0.0,2157.0,2948.0,15.0,0.0,0.1,0.05,4522.0,5354.0
5,0.6664,../data/train_test_sample_6000.csv,0.649263,0.624882,0.675624,45.2552,0.0,2777.0,2221.0,15.0,0.0,0.1,0.05,5358.0,4626.0
6,0.660992,../data/train_test_sample_7000.csv,0.642092,0.608225,0.679952,55.2213,0.0,2934.0,2144.0,15.0,0.0,0.1,0.05,5346.0,4555.0
7,0.669003,../data/train_test_sample_8000.csv,0.666846,0.664569,0.669138,60.5835,0.0,2505.0,2454.0,15.0,0.0,0.1,0.05,5060.0,4963.0
8,0.665688,../data/train_test_sample_9000.csv,0.642857,0.600799,0.691246,68.587,0.0,2996.0,2014.0,15.0,0.0,0.1,0.05,5467.0,4509.0
9,0.66175,../data/train_test_sample_10000.csv,0.665964,0.682518,0.650193,76.9219,0.0,2350.0,2718.0,15.0,0.0,0.1,0.05,4863.0,5052.0


In [24]:
res_4.to_csv('KNN_TrainTestSize.csv')

In [None]:
#Como se relaciona k con el tamaño del training set

In [17]:
#Experimentamos variando el tamaño del train test y k
#KNN + PCA

import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

file_names = ['../data/train_test_sample_' + str(i) + '.csv' for i in range(1000,25001,2000)]
grid_ranges = {
    'dataset': file_names,
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [1],
    'k': [1, 2, 3, 5, 7, 9, 10, 15, 20, 50, 100, 250, 500],
    'alpha': [50],
    'threshold_frecuency_low': [0.05],
    'threshold_frecuency_high': [0.1],
}
grid = list(build_grid(grid_ranges))
res_5 = run_grid_in_parallel(grid)

 Point 169/169 done         

In [18]:
res_5

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
0,0.574748,../data/train_test_sample_1000.csv,0.582224,0.591812,0.572941,3.83370,50.0,3061.0,3308.0,1.0,1.0,0.1,0.05,4170.0,4438.0
1,0.641984,../data/train_test_sample_1000.csv,0.631781,0.613415,0.651281,3.84337,50.0,2899.0,2463.0,20.0,1.0,0.1,0.05,5015.0,4600.0
2,0.594779,../data/train_test_sample_1000.csv,0.607210,0.625550,0.589914,3.83406,50.0,2808.0,3261.0,3.0,1.0,0.1,0.05,4217.0,4691.0
3,0.672631,../data/train_test_sample_1000.csv,0.683575,0.706227,0.662331,3.85183,50.0,2203.0,2700.0,100.0,1.0,0.1,0.05,4778.0,5296.0
4,0.677572,../data/train_test_sample_1000.csv,0.705818,0.772503,0.649731,3.87407,50.0,1706.0,3123.0,250.0,1.0,0.1,0.05,4355.0,5793.0
5,0.616345,../data/train_test_sample_1000.csv,0.634478,0.665022,0.606617,3.87274,50.0,2512.0,3234.0,7.0,1.0,0.1,0.05,4244.0,4987.0
6,0.636643,../data/train_test_sample_1000.csv,0.651912,0.679557,0.626429,3.85728,50.0,2403.0,3039.0,15.0,1.0,0.1,0.05,4439.0,5096.0
7,0.632169,../data/train_test_sample_1000.csv,0.498589,0.365249,0.785264,3.91090,50.0,4760.0,749.0,500.0,1.0,0.1,0.05,6729.0,2739.0
8,0.628230,../data/train_test_sample_1000.csv,0.611065,0.583278,0.641631,3.90041,50.0,3125.0,2443.0,10.0,1.0,0.1,0.05,5035.0,4374.0
9,0.570007,../data/train_test_sample_1000.csv,0.462437,0.369383,0.618166,3.87129,50.0,4729.0,1711.0,2.0,1.0,0.1,0.05,5767.0,2770.0


In [25]:
res_5.to_csv('KNN_PCA_TrainTestSize_and_K.csv')

In [20]:
#Experimentamos variando el tamaño del train test y k
#KNN

import sys
from multiprocessing import Pool

def run_grid_in_parallel(grid, n_proc=18):
    """Run a grid on n_proc workers"""
    done, results = 0, []
    
    def call_back(out):
        nonlocal done
        nonlocal results
        results.append(out)
        done += 1
        print('\r', f'Point {done}/{len(grid)} done', end='         ')
        sys.stdout.flush()

    with Pool(n_proc) as pool:
        for point in grid:
            pool.apply_async(run, args=(EXEC_PATH, point, ), callback=call_back)
        pool.close()
        pool.join()
    return pd.DataFrame(results)

file_names = ['../data/train_test_sample_' + str(i) + '.csv' for i in range(1000,25001,2000)]
grid_ranges = {
    'dataset': file_names,
    'output_file': ['../../datos/out.csv'], # Todavia no hace nada
    'mode': [0],
    'k': [1, 2, 3, 5, 7, 9, 10, 15, 20, 50, 100, 250, 500],
    'alpha': [0],
    'threshold_frecuency_low': [0.05],
    'threshold_frecuency_high': [0.1],
}
grid = list(build_grid(grid_ranges))
res_6 = run_grid_in_parallel(grid)

 Point 169/169 done         

In [21]:
res_6

Unnamed: 0,Accuracy,Dataset File,F1,Precision,Recall,Time,alpha,fn,fp,k,mode,threshold_frecuency_high,threshold_frecuency_low,tn,tp
0,0.565400,../data/train_test_sample_1000.csv,0.429885,0.327244,0.626340,5.23086,0.0,5045.0,1464.0,2.0,0.0,0.1,0.05,6014.0,2454.0
1,0.657475,../data/train_test_sample_1000.csv,0.673123,0.704361,0.644539,5.22908,0.0,2217.0,2913.0,100.0,0.0,0.1,0.05,4565.0,5282.0
2,0.619617,../data/train_test_sample_1000.csv,0.592577,0.552474,0.638957,5.33733,0.0,3356.0,2341.0,7.0,0.0,0.1,0.05,5137.0,4143.0
3,0.608533,../data/train_test_sample_1000.csv,0.591058,0.565009,0.619626,5.38600,0.0,3262.0,2601.0,5.0,0.0,0.1,0.05,4877.0,4237.0
4,0.591841,../data/train_test_sample_1000.csv,0.581788,0.567009,0.597359,5.38469,0.0,3247.0,2866.0,3.0,0.0,0.1,0.05,4612.0,4252.0
5,0.620151,../data/train_test_sample_1000.csv,0.521973,0.414189,0.705588,5.39023,0.0,4393.0,1296.0,20.0,0.0,0.1,0.05,6182.0,3106.0
6,0.631301,../data/train_test_sample_1000.csv,0.585995,0.521136,0.669293,5.38581,0.0,3591.0,1931.0,15.0,0.0,0.1,0.05,5547.0,3908.0
7,0.611671,../data/train_test_sample_1000.csv,0.523201,0.425523,0.679081,5.41744,0.0,4308.0,1508.0,10.0,0.0,0.1,0.05,5970.0,3191.0
8,0.630834,../data/train_test_sample_1000.csv,0.539978,0.432724,0.717920,5.37294,0.0,4254.0,1275.0,50.0,0.0,0.1,0.05,6203.0,3245.0
9,0.669760,../data/train_test_sample_1000.csv,0.619011,0.535805,0.732811,5.44633,0.0,3481.0,1465.0,500.0,0.0,0.1,0.05,6013.0,4018.0


In [26]:
res_6.to_csv('KNN_TrainTestSize_and_K.csv')