Código tomado y adaptado.

Título: Pulsar Candidates Classification

Autor: Yixuan Zhou

Fecha: 12/01/2018

Fuente: https://github.com/yixuanzhou/Pulsar-Candidates-Calssification

In [1]:
import pandas as pd
import os
from sklearn import model_selection, metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import numpy as np

In [2]:
def upsampling(X, y, ratio = 1.0, random_state = 123):
    """Oversample the positive class of a binary data set by replication.

    Rows labelled 1 are repeated -- whole copies first, then a leading slice
    of the positives -- until there are ``int(0.5 + ratio * n_negatives)`` of
    them. Rows labelled 0 are kept as they are, and the combined set is
    shuffled.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,) or (n_samples, 1)
        Binary labels (0 = negative, 1 = positive). Both flat arrays and
        column vectors are accepted.
    ratio : float, default 1.0
        Desired number of positives expressed as a fraction of the number of
        negatives.
    random_state : int, default 123
        Seed forwarded to ``shuffle``.

    Returns
    -------
    tuple
        ``(X, y)`` exactly as passed in when there are no positives to
        replicate or the positives already reach the target count; otherwise
        the shuffled, upsampled feature matrix together with a 1-D label
        array.
    """
    # Flatten so that (n,) and (n, 1) label layouts behave identically and
    # every count below is a plain Python int rather than a 1-element array.
    labels = np.asarray(y).ravel()
    pos_idx = np.where(labels == 1)[0]
    neg_idx = np.where(labels == 0)[0]
    pos_num = len(pos_idx)
    target_num = int(0.5 + ratio * (len(labels) - pos_num))

    # Nothing to do: either there is no positive row to copy (which would
    # otherwise mean dividing by zero) or the target is already met.
    if pos_num == 0 or pos_num >= target_num:
        return X, y

    features = np.asarray(X)
    X_pos = features[pos_idx]
    full_copies, remainder = divmod(target_num, pos_num)
    X_pos = np.concatenate([X_pos] * full_copies + [X_pos[:remainder]])
    X_neg = features[neg_idx]
    X, y = np.concatenate([X_pos, X_neg]), np.array([1] * len(X_pos) + [0] * len(X_neg))

    return shuffle(X, y, random_state = random_state)

In [3]:
def test_score(model, X, y):
    """Score a fitted classifier on ``(X, y)`` using its hard predictions.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X : array-like
        Samples to predict on.
    y : array-like
        True labels for ``X``.

    Returns
    -------
    tuple
        ``(recall, precision, roc_auc, f1)``. Note that the ROC AUC is
        computed from the predicted labels returned by ``predict``, not from
        probabilities or decision scores.
    """
    predictions = model.predict(X)
    # Order matters: callers index into the result positionally.
    scorers = (recall_score, precision_score, roc_auc_score, f1_score)
    return tuple(scorer(y, predictions) for scorer in scorers)

In [4]:
# Load the data set: every column except the last is a feature, the last
# column is the label (kept as an (n, 1) column vector).
df = pd.read_csv("../HTRU_2.csv", header=None)
X, y = df.iloc[:, :-1], df.iloc[:, -1:].values

# Stratified 70/30 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Standardise the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
normalizer = StandardScaler()
X_train = normalizer.fit_transform(X_train)
X_test = normalizer.transform(X_test)

# Replicate positive training rows up to one fifth of the negatives.
X_train, y_train = upsampling(X_train, y_train, ratio=1/5)

# Fit the k-nearest-neighbours baseline (k = 5).
knn = KNeighborsClassifier(n_neighbors=5)
clf_knn = knn.fit(X_train, y_train)

# Cross validation: five random 70/30 shuffles, one score array per metric.
# NOTE(review): this is run on the *test* split, and cross_val_score refits a
# clone of the estimator on each shuffle, so these numbers do not describe the
# model fitted above. `res` is also not read in the cells visible here.
# Confirm whether this is intended.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=95)
res = {}

for scoring in ('f1', 'roc_auc', 'precision', 'recall'):
    res[scoring] = cross_val_score(
        clf_knn, X_test, y_test,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
    )

In [5]:
# Report the KNN baseline: its hyper-parameter and the F1 score on both splits.
print('Línea base - KNN')
# The previous literal used doubled single quotes, which Python reads as
# adjacent-string concatenation: the quotes disappeared and the brace was
# closed with ')', so it printed "{k: 5)". Print a well-formed mapping and
# take k from the fitted model so the report cannot drift from it.
print("\t-Mejores hiper-parámetros: {'k': %d}" % clf_knn.n_neighbors)
# test_score returns (recall, precision, roc_auc, f1); index 3 is the F1 score.
print('\t-Puntaje f1 en entrenamiento: %.4f' % test_score(clf_knn, X_train, y_train)[3])
print('\t-Puntaje f1 en pruebas: %.4f' % test_score(clf_knn, X_test, y_test)[3])

Línea base - KNN
	-Mejores hiper-parámetros: {k: 5)
	-Puntaje f1 en entrenamiento: 0.9253
	-Puntaje f1 en pruebas: 0.8787
