# knn 2.0
Se prueba un nuevo método
## Se cargan los datos
Cargar solo los RA y DEC de las coordenadas

In [1]:
import pickle
import numpy as np
from astropy.table import Table, vstack
import matplotlib.pyplot as plt
flag = 0
for i in range(0,24):
    num = "0"+str(i) if i<10 else str(i)
    temp = pickle.load( open( "RA_DEC/"+num+".p", "rb" ) )
    if flag==0:
        t = temp
        flag=1
    else:
        t = vstack([t, temp])
t['RA'] = t['RA']*15
t = t.to_pandas()

Cargar datos con parámetros Q1 y Q2 prim, y C1 y C2

In [2]:
flag = 0
for i in range(0,24):
    num = "0"+str(i) if i<10 else str(i)
    temp = pickle.load(open( "data_tables/"+num+".p", "rb" ))
    if flag==0:
        t = temp
        flag=1
    else:
        t = vstack([t, temp])
t['RA'] = t['RA']*15
t = t.to_pandas()

Función para calcular las distancias

In [3]:
def compute_squared_EDM_method5(X):
    V = spt.distance.pdist(X, 'sqeuclidean')
    return spt.distance.squareform(V)

Función para arreglar límites de 0 y 20 horas.

In [4]:
def fix_xlimits_method(df, r):
    x_0inf = min(df['RA'])
    x_nsup = max(df['RA'])
    cond_a =(df['RA']>= x_0inf)&(df['RA']<x_0inf+r)
    cond_b =(df['RA']> x_nsup-r)&(df['RA']<=x_nsup)
    mask = np.where(cond_a)
    temp = df.loc[mask[0]]
    mask = np.where(cond_b)
    temp['RA'] = temp['RA']+360
    df = df.append(temp, ignore_index=True)
    temp = df.loc[mask[0]]
    temp['RA'] = temp['RA']-360
    df = df.append(temp, ignore_index=True)
    return df

## Algoritmo
Se tienen los siguientes pasos:
### Inicialización de variables globales
Se utilizan a lo largo de toda la ejecución y no cambian.

In [5]:
size = 3
max_size = 6
k = 10
data = fix_xlimits_method(t, max_size)
d = 1.5

### 1.- Crear ventana alrededor del punto de interés
Si no existe una cantidad de k vecinos cercanos aumentar hasta que se cumpla con la cantidad mínima, o simplemente se llegue al máximo de ventana. Define el espacio de trabajo para calcular las distancias.

In [6]:
def new_window(point):
    size_temp = size
    cond_x = (data['RA']>= point[0]-size_temp)&(data['RA']<point[0]+size_temp)
    cond_y = (data['DEC']>= point[1]-size_temp)&(data['DEC']< point[1]+size_temp)
    mask = np.where(cond_x&cond_y)
    data_aux = data[['RA','DEC']].loc[mask]
    while((len(data_aux)-1 < k) and (size<max_size)):
        size_temp = 2*size_temp
        cond_x = (data['RA']>= point[0]-size_temp)&(data['RA']<point[0]+size_temp)
        cond_y = (data['DEC']>= point[1]-size_temp)&(data['DEC']< point[1]+size_temp)
        mask = np.where(cond_x&cond_y)
        data_aux = data[['RA','DEC']].loc[mask]
    return data_aux

## 2.- Calcular distancias solo para el punto de interés
Se procede a obtener las distancias del punto hacia el resto

In [7]:
from scipy.spatial import distance

def distancias(point, window):
    if len(np.shape(point)) == 1:
        point = np.expand_dims(point,axis=1)
    D = distance.cdist(point.T, window, 'euclidean')
    window['distance'] = D.T
    return window

### 3.- Filtrar por distancia los vecinos o por k
Función que descarta los datos de los vecinos superiores a un k fijado

In [8]:
def k_near_filter(point, neighbors):
    neighbors = neighbors.sort_values(by=['distance'])
    n = len(neighbors)-1
    if (n>=k):
        neighbors = neighbors.iloc[1:k+1,:]
    else:
        neighbors = neighbors.iloc[1:k+1,:]
    return neighbors

Función que descarta los datos de los vecinos superiores a un radio "d"

In [9]:
def d_near_filter(point, neighbors, d):
    neighbors = neighbors.sort_values(by=['distance'])
    mask = np.where(neighbors['distance'] <= d)
    return neighbors.loc[mask]

### 4.- Calcular los $\bar{Q}$

In [24]:
def Q_calculator(point, neighbors):
    cond_x = (data['RA']>= neighbors['RA'].min())&(data['RA']<=neighbors['RA'].max())
    cond_y = (data['DEC']>= neighbors['DEC'].min())&(data['DEC']<= neighbors['DEC'].max())
    mask = np.where(cond_x&cond_y)
    data_aux = data.loc[mask]
    n = len(data_aux)
    Q1 = point[2]*n/(n-1) - data_aux['Q1prim'].mean()*n/(n-1)
    Q2 = point[4]*n/(n-1) - data_aux['Q2prim'].mean()*n/(n-1)
    point = np.append(point,Q1)
    point = np.append(point,Q2)
    
    return point

### 5.- Función que junta del paso 1 al 4

In [28]:
from multiprocessing import Pool, TimeoutError
import time

def Q_features_knn(point):
    checkpoint = "iniciado Q_features_knn"
    try:
        neighbors = new_window(point)
        checkpoint = "new_window"
        neighbors_distances = distancias(point[0:2], neighbors)
        checkpoint = "distancias"
        neighbors_near_filtered = k_near_filter(point, neighbors_distances)
        checkpoint = "k_near_filter"
        point = Q_calculator(point, neighbors_near_filtered)
        checkpoint = "Q_calculator"
        return point
    except:
        print("Falla al intentar analizar punto "+str(point)+", luego de "+checkpoint)


## Ejecución utilizando paralelismo

In [29]:
def thread_analyze():
    print("Ejecutando en paralelo...")
    iterables = data.iloc[:,:].values
    start = time.process_time()
    pool = Pool(processes=6)
    result = pool.map(Q_features_knn, iterables)
    end = time.process_time()
    wait = end - start
    print('Ejecución en paralelo demoró: '+str(wait))
    #pickle.dump(result, open( "knn_data/parallel_r_"+str(r)+"_data.p", "wb" ))
    pool.terminate()
    return result

In [31]:
resultado = thread_analyze()

Ejecutando en paralelo...
Ejecución en paralelo demoró: 1.041662672000001
