In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# system calls
import os

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# xgboost
import xgboost as xgb

# write to file
import csv

import os
import csv
import math



In [2]:
def create_solution_dictionary(solution):
    """Read a solution CSV file into a dictionary keyed by EventId.

    Solution file headers: EventId, Label, Weight. The first row is treated
    as the header and skipped.

    Args:
        solution: path of the solution CSV file.

    Returns:
        dict mapping EventId -> (label, weight). All three values are kept as
        the strings read from the file. When an EventId appears more than
        once, the first occurrence wins. An empty file yields an empty dict.
    """
    solnDict = {}
    # newline='' is what the csv module asks for when it is handed a file object.
    with open(solution, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to index.
                continue
            # setdefault keeps the first entry seen for a given EventId.
            solnDict.setdefault(row[0], (row[1], row[2]))
    return solnDict

        
def check_submission(submission, Nelements):
    """Validate the RankOrder column of a submission CSV file.

    The file must have a header row followed by rows whose second column is
    the RankOrder. The column is valid when:
        1. All numbers are in [1, Nelements]
        2. All numbers are unique
    i.e. when it is a permutation of 1..Nelements.

    Args:
        submission: path of the submission CSV file.
        Nelements: number of events the submission is expected to rank.

    Returns:
        True when the RankOrder column is valid.

    Raises:
        ValueError: if a rank is duplicated, is not an integer, or the ranks
            are not exactly the numbers 1..Nelements.
    """
    raw_ranks = []
    with open(submission, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # header
        for row in reader:
            if row:  # csv.reader yields [] for blank lines
                raw_ranks.append(row[1])

    # csv.reader produces strings; they have to become integers before they
    # can be compared with range(), otherwise no value ever matches.
    try:
        ranks = [int(value) for value in raw_ranks]
    except ValueError:
        raise ValueError('RankOrder column must contain all numbers from [1..NTestSset]') from None

    unique_ranks = set(ranks)
    if len(unique_ranks) != len(ranks):
        raise ValueError('RankOrder column must contain unique values')
    if unique_ranks != set(range(1, Nelements + 1)):
        raise ValueError('RankOrder column must contain all numbers from [1..NTestSset]')
    return True


def AMS(s, b):
    """ Approximate Median Significance defined as:
        AMS = sqrt(
                2 { (s + b + b_r) log[1 + (s/(b+b_r))] - s}
              )
    where b_r = 10, b = background, s = signal, log is natural logarithm

    Args:
        s: summed weight of the signal events.
        b: summed weight of the background events.

    Returns:
        The AMS value as a float.

    Raises:
        ValueError: if the radicand is negative.
    """
    br = 10.0
    radicand = 2 * ((s + b + br) * math.log(1.0 + s / (b + br)) - s)
    if radicand < 0:
        # Raise instead of calling exit(): exit() would stop the interpreter
        # (or the notebook kernel) from inside a helper function.
        raise ValueError('radicand is negative')
    return math.sqrt(radicand)


def AMS_metric(solution, submission):
    """Print the AMS metric value to screen.

    Solution File header: EventId, Class, Weight
    Submission File header: EventId, RankOrder, Class

    Only submission rows predicted as 's' are scored: the weight of each such
    event is added to the signal total when its true class is 's' and to the
    background total when its true class is 'b'.

    Args:
        solution: path of the solution CSV file.
        submission: path of the submission CSV file.

    Returns:
        None. The totals and the AMS value are printed.

    Raises:
        ValueError: if the submission's RankOrder column is invalid
            (see check_submission) or the AMS radicand is negative.
    """
    # solutionDict: key=eventId, value=(class, weight)
    solutionDict = create_solution_dictionary(solution)

    numEvents = len(solutionDict)

    # check_submission raises on an invalid file, so reaching the next line
    # means the submission passed validation.
    check_submission(submission, numEvents)

    signal = 0.0
    background = 0.0
    with open(submission, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # header row
        for row in reader:
            if not row or row[2] != 's':
                # only events predicted to be signal are scored
                continue
            true_class, weight = solutionDict[row[0]]
            if true_class == 's':
                signal += float(weight)
            elif true_class == 'b':
                background += float(weight)

    print('signal = {0}, background = {1}'.format(signal, background))
    print('AMS = ' + str(AMS(signal, background)))


def WriteSolutionAndSubmission(solution_df, predicciones):
    """Write ./output/solution.csv and ./output/submission.csv.

    Args:
        solution_df: data frame with EventId, Label and Weight columns
            (the correct results).
        predicciones: the predicted class for each row of solution_df.

    The solution file gets the columns EventId, Class (copied from Label) and
    Weight. The submission file gets EventId, RankOrder and Class, where
    RankOrder is 1..len(solution_df) in row order so that it satisfies
    check_submission.
    """
    # Make sure the target directory exists before pandas writes into it.
    os.makedirs('./output', exist_ok=True)

    solution = pd.DataFrame()
    solution["EventId"] = solution_df["EventId"]
    solution["Class"] = solution_df["Label"]
    solution["Weight"] = solution_df["Weight"]
    # to_csv opens and closes the file itself; no separate open() is needed.
    solution.to_csv('./output/solution.csv', index=False)

    # Build the submission file for the same events.
    submission = pd.DataFrame()
    submission["EventId"] = solution_df["EventId"]
    # Rank orders are 1-based: check_submission requires the numbers 1..N.
    submission["RankOrder"] = range(1, len(solution_df) + 1)
    submission["Class"] = predicciones
    submission.to_csv('./output/submission.csv', index=False)

def WriteSubmission(eventsId, rankOrders, predictions):
    """Write a submission file to ./output/submission.csv.

    Args:
        eventsId: the event identifiers, one per row.
        rankOrders: the rank order assigned to each event.
        predictions: the predicted class of each event.

    The file has the columns EventId, RankOrder and Class and no index column.
    """
    # Make sure the target directory exists before pandas writes into it.
    os.makedirs('./output', exist_ok=True)

    submission = pd.DataFrame()
    submission["EventId"] = eventsId
    submission["RankOrder"] = rankOrders
    submission["Class"] = predictions
    # to_csv opens and closes the file itself; a separate open() would only
    # leave an unclosed handle behind.
    submission.to_csv('./output/submission.csv', index=False)

In [3]:
# Check the current working directory
# os.getcwd()

In [4]:
# Read the input files: training set, test set and the sample random submission.
train_df, test_df, random_subm = (
    pd.read_csv(csv_path)
    for csv_path in (
        './input/training.csv',
        './input/test.csv',
        './input/random_submission.csv',
    )
)
# Both data sets together, in [train, test] order.
combine = [train_df, test_df]

In [5]:
# Explore the training set:
#train_df.head()

In [6]:
# plot pairs
#sns.pairplot(train, hue="Label", vars=["PRI_jet_all_pt", "PRI_jet_subleading_phi"])

In [7]:
# Candidate models:
#GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_split=1e-07, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
# min_impurity_split and presort are not passed: both were removed from
# scikit-learn, so naming them raises TypeError on current releases.
# NOTE(review): the values dropped here (1e-07 and False) look like the defaults
# of the release this notebook was written against — confirm against that version.
dtc = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, class_weight=None)
#dtc

In [8]:
# Feature columns the models can be trained on: every column of the training
# set except the identifier, the event weight and the target label.
features = [
    column
    for column in train_df.columns.values
    if column not in ("EventId", "Weight", "Label")
]

In [9]:
# Create a depth-2 decision tree and train it on the training set
# (fit returns the fitted estimator itself).
dtc = DecisionTreeClassifier(max_depth=2).fit(train_df[features], train_df["Label"])
# Score it on the same training set it was fitted on.
dtc.score(train_df[features], train_df["Label"])

0.76818399999999998

In [10]:
# Predict the label of every event in the training set and in the test set.
predicciones_train, predicciones_test = (
    dtc.predict(frame[features]) for frame in (train_df, test_df)
)

In [11]:
# Try out the AMS function. The test set is meant for the submission and has
# no true labels, so the metric is exercised on the training set instead.
WriteSolutionAndSubmission(solution_df=train_df, predicciones=predicciones_train)

In [12]:
# Score the files written above; AMS_metric prints the totals and the AMS value.
AMS_metric(
    solution='./output/solution.csv',
    submission='./output/submission.csv',
)

signal = 467.706639594385, background = 55132.24700197646
AMS = 1.9889281715125862


In [17]:
# Ten depth-2 XGBoost classifiers that differ only in the number of boosting
# rounds (1 to 10), with L2 regularisation switched off (reg_lambda=0).
gb = [
    xgb.XGBClassifier(max_depth=2, n_estimators=n_rounds, reg_lambda=0)
    for n_rounds in range(1, 11)
]

# Train each of them on the full training set.
for cla in gb:
    cla.fit(train_df[features], train_df["Label"])

In [18]:
# Score every boosted model on the training set.
for cla in gb:
    prediciones = cla.predict(train_df[features])
    WriteSolutionAndSubmission(train_df, prediciones)
    # AMS_metric prints its own results and returns nothing, so it is called
    # directly; wrapping it in print() only added a stray "None" line.
    AMS_metric('./output/solution.csv', './output/submission.csv')

signal = 467.706639594385, background = 55132.24700197646
AMS = 1.9889281715125862
None
signal = 438.0950675643611, background = 48172.195270770935
AMS = 1.992823897685768
None
signal = 429.5431480429489, background = 33997.22033283691
AMS = 2.324399614959424
None
signal = 434.5183979251309, background = 45371.12215331826
AMS = 2.0364779083037714
None
signal = 447.4114688497362, background = 47475.311742328355
AMS = 2.0499704600147686
None
signal = 423.7517742346038, background = 41935.954240446124
AMS = 2.065559497305451
None
signal = 433.9196281729831, background = 43508.44134334942
AMS = 2.076602132612116
None
signal = 439.6935353572142, background = 45610.02502876734
AMS = 2.0553080212050747
None
signal = 409.8696953236512, background = 40953.662116096355
AMS = 2.0217366709053137
None
signal = 435.49961035203825, background = 34820.89254455443
AMS = 2.3286499079273093
None


In [32]:
# Predict the test set with the third boosted model (n_estimators=3) and
# write the submission file.
predictions = gb[2].predict(test_df[features])
# Rank orders are 1-based; the count comes from the data instead of a
# hard-coded 550000, so the cell keeps working if the test set changes.
WriteSubmission(test_df["EventId"], range(1, len(test_df) + 1), predictions)
check_submission('./output/submission.csv', len(test_df))

True

In [29]:
# Number of events in the test set.
test_df["EventId"].size

550000