## About

In this notebook we prepare a simple solution for the [Kaggle challenge on Higgs detection](https://inclass.kaggle.com/c/mlhep-2016-higgs-detection).

# KNN algorithm

In [6]:
# import libraries
%matplotlib inline
import matplotlib.pyplot as plt
import pandas
import numpy
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

####### import data ############
#!cd datasets; wget -O public_train_10000.root -nc --no-check-certificate https://2016.mlhep.yandex.net/data/higgs/public_train_10000.root
#!cd datasets; wget -O public_train_100000.root -nc --no-check-certificate https://2016.mlhep.yandex.net/data/higgs/public_train_100000.root
#!cd datasets; wget -O public_test.root -nc --no-check-certificate https://2016.mlhep.yandex.net/data/higgs/public_test.root
import root_numpy
data = pandas.DataFrame(root_numpy.root2array('datasets/public_train_10000.root'))
data1 = pandas.DataFrame(root_numpy.root2array('datasets/public_train_100000.root'))
test = pandas.DataFrame(root_numpy.root2array('datasets/public_test.root'))

######### definition features #############
# Build the feature lists in the DataFrame's own column order. Going through
# `list(set(...))` yields an arbitrary order that can differ between kernel
# restarts (string hashing may be randomised), which makes the column order
# handed to a model non-reproducible.
non_feature_columns = {'event_id', 'target'}
All_features = [column for column in data.columns if column not in non_feature_columns]
high_level_features = ['m_jj', 'm_jjj', 'm_jlv', 'm_wwbb', 'm_bb', 'm_wbb', 'm_lv']
low_level_features = [column for column in All_features if column not in high_level_features]

# Hold out part of `data` for validation: 66% goes to training, the rest to
# validation. `random_state` pins the shuffle so the split is repeatable.
training_data, validation_data = train_test_split(data, random_state=11, train_size=0.66)

##########choose what features for training##########
# Every later fit/predict call should index with `training_features` so the
# model always sees the same columns.
training_features = high_level_features


from sklearn.neighbors import KNeighborsClassifier

######## Change knn parameters ############
# `fit` returns the estimator itself, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=8).fit(
    training_data[training_features],
    training_data.target,
)

# Per-event class probabilities on the held-out validation sample.
proba = knn.predict_proba(validation_data[training_features])

# ROC AUC is computed from the second probability column (the score for
# class 1, assuming a 0/1 target). Left as the last expression so the
# notebook displays the value.
roc_auc_score(y_true=validation_data.target, y_score=proba[:, 1])

0.72059171991203608

## Prepare submission to Kaggle

In [70]:
# predict test sample
# The classifier must receive exactly the columns it was fitted on, i.e.
# `training_features`. Indexing with `low_level_features` hands it a different
# set of columns than it was trained with, which either raises a dimension
# mismatch or produces meaningless scores.
kaggle_proba = knn.predict_proba(test[training_features])[:, 1]
# Event identifiers, kept aligned row-by-row with `kaggle_proba`.
kaggle_ids = test.event_id

In [25]:
from IPython.display import FileLink
def create_solution(ids, proba, filename='baseline.csv'):
    """Write a submission CSV under ``datasets/`` and return a download link.

    The file has two columns: ``event_id`` (taken from `ids`) and
    ``prediction`` (taken from `proba`). The DataFrame index is not written.

    Returns an IPython ``FileLink`` pointing at the written file.
    """
    target_path = 'datasets/{}'.format(filename)
    submission = pandas.DataFrame({'event_id': ids, 'prediction': proba})
    submission.to_csv(target_path, index=False)
    return FileLink(target_path)
    
# Write the submission with the default file name; the returned link is the cell output.
create_solution(ids=kaggle_ids, proba=kaggle_proba)