In [None]:
%run nbloader.py
import seaborn
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d.art3d import Line3DCollection
from matplotlib import pylab as plt
import root_numpy
import pandas as pd
import math
import numpy as np


# Geometry constants.
# NOTE(review): presumably the brick extent along each axis and a safety
# margin, in the same length unit as the track coordinates -- confirm.
# None of these names is referenced by the cells visible in this notebook.
BRICK_X = 124000
BRICK_Y = 99000
BRICK_Z = 75000
SAFE_M = 10000
# NOTE(review): presumably the z step between consecutive plates -- confirm.
dZ = 205

In [None]:
# Project-local readers for the two input samples.
from read_opera_bg import load_bg
from read_opera_mc import load_mc
# NOTE(review): `step` presumably subsamples the input (every 50th entry for
# the background, every entry for the MC) -- confirm against the loaders.
pbg = load_bg(step=50)
pmc = load_mc(step=1)

In [None]:
def alpha_creator(dmix, pmc, id):
    """Add an ``alpha`` column: the opening angle of each base track as seen
    from the electron.

    ``alpha`` is the angle between the electron direction ``(ele_sx, ele_sy, 1)``
    and the vector pointing from the electron position to the base-track
    position ``(b_x, b_y, b_z)``.

    Parameters
    ----------
    dmix : pandas.DataFrame
        Base tracks; must contain the columns ``b_x``, ``b_y`` and ``b_z``.
        The frame is modified in place (``alpha`` is added or overwritten).
    pmc : pandas.DataFrame
        Table whose row ``id`` provides ``ele_x``, ``ele_y``, ``ele_z``,
        ``ele_sx`` and ``ele_sy``.
    id : int
        Positional (``iloc``) index of the electron row in ``pmc``.

    Returns
    -------
    pandas.DataFrame
        The same ``dmix`` object, with ``alpha`` in radians in ``[0, pi]``.
        A base track located exactly at the electron position gets NaN.
    """
    electron = pmc.iloc[id]
    origin = np.array([electron['ele_x'], electron['ele_y'], electron['ele_z']],
                      dtype=float)
    # Only the direction matters for an angle, so the slopes are used directly:
    # any common scale factor on this vector cancels in the cosine.
    direction = np.array([electron['ele_sx'], electron['ele_sy'], 1.0], dtype=float)

    # Explicit float conversion: the columns may arrive with object dtype.
    offsets = dmix[['b_x', 'b_y', 'b_z']].values.astype(float) - origin
    offset_norms = np.sqrt((offsets ** 2).sum(axis=1))
    direction_norm = np.sqrt((direction ** 2).sum())

    # A zero-length offset gives 0/0 -> NaN; keep that quiet and propagate it.
    with np.errstate(divide='ignore', invalid='ignore'):
        cosine = offsets.dot(direction) / (offset_norms * direction_norm)

    # Rounding can push the cosine marginally outside [-1, 1], which would be a
    # domain error for arccos; clip before inverting.
    dmix['alpha'] = np.arccos(np.clip(cosine, -1.0, 1.0))

    return dmix

def feature_creator(dmix, pmc, id):
    """Add the per-track features ``alpha_2``, ``IP``, ``d_sx``, ``d_sy`` and
    ``zb`` relative to one electron.

    Added columns
    -------------
    alpha_2 : angle (radians) between the base-track direction
        ``(b_sx, b_sy, 1)`` and the electron direction ``(ele_sx, ele_sy, 1)``.
    IP : perpendicular distance from the electron position to the straight
        line through the base track along its direction, divided by ``zb``.
        Where ``zb`` is zero this is inf (or NaN for 0/0).
    d_sx, d_sy : electron slope minus base-track slope.
    zb : ``b_z - ele_z``.

    Parameters
    ----------
    dmix : pandas.DataFrame
        Base tracks with columns ``b_x``, ``b_y``, ``b_z``, ``b_sx``, ``b_sy``.
        The frame is modified in place.
    pmc : pandas.DataFrame
        Table whose row ``id`` provides ``ele_x``, ``ele_y``, ``ele_z``,
        ``ele_sx`` and ``ele_sy``.
    id : int
        Positional (``iloc``) index of the electron row in ``pmc``.

    Returns
    -------
    pandas.DataFrame
        The same ``dmix`` object with the feature columns added.
    """
    electron = pmc.iloc[id]
    origin = np.array([electron['ele_x'], electron['ele_y'], electron['ele_z']],
                      dtype=float)
    ele_sx = float(electron['ele_sx'])
    ele_sy = float(electron['ele_sy'])
    # Directions are taken as (sx, sy, 1); both the angle and the point-to-line
    # distance are independent of the length of the direction vector.
    ele_dir = np.array([ele_sx, ele_sy, 1.0])

    # Explicit float conversion: the columns may arrive with object dtype.
    positions = dmix[['b_x', 'b_y', 'b_z']].values.astype(float)
    b_sx = dmix['b_sx'].values.astype(float)
    b_sy = dmix['b_sy'].values.astype(float)
    track_dirs = np.column_stack([b_sx, b_sy, np.ones(len(b_sx))])
    track_norms = np.sqrt((track_dirs ** 2).sum(axis=1))

    cosine = track_dirs.dot(ele_dir) / (track_norms * np.sqrt((ele_dir ** 2).sum()))
    # Rounding can push the cosine marginally outside [-1, 1]; clip before
    # inverting so parallel tracks give 0 instead of a domain error.
    dmix['alpha_2'] = np.arccos(np.clip(cosine, -1.0, 1.0))

    zb = positions[:, 2] - origin[2]

    # Distance from a point to a line: |(origin - p) x d| / |d|.
    cross = np.cross(origin - positions, track_dirs)
    distance = np.sqrt((cross ** 2).sum(axis=1)) / track_norms
    with np.errstate(divide='ignore', invalid='ignore'):
        dmix['IP'] = distance / zb

    dmix['d_sx'] = ele_sx - b_sx
    dmix['d_sy'] = ele_sy - b_sy
    dmix['zb'] = zb

    return dmix
                  


def combine_mc_bg(pmc, pbg, cone_angle = 0.05, begin = 0, end = 50):
    """Mix background base tracks with the base tracks of each MC event and
    keep, per event, only the tracks inside a cone around the electron.

    For every MC row ``i`` in ``range(begin, end)`` the background tracks
    (``signal == 0``) and that event's tracks (``signal == 1``) are stacked,
    tagged with the event id, and filtered to ``alpha < cone_angle`` (see
    ``alpha_creator``). Events whose cone contains no background track are
    dropped; for the others the features from ``feature_creator`` are added.

    Parameters
    ----------
    pmc : pandas.DataFrame
        MC table. Each row must provide ``BT_X``, ``BT_Y``, ``BT_Z``,
        ``BT_SX``, ``BT_SY``, ``Event_id`` and the ``ele_*`` fields read by
        ``alpha_creator`` / ``feature_creator``.
    pbg : mapping of column name to array-like
        Background sample with the keys ``s.eX``, ``s.eY``, ``s.eZ``,
        ``s.eTX`` and ``s.eTY``.
    cone_angle : float
        Upper bound (exclusive) on ``alpha``, in radians.
    begin, end : int
        Half-open range of positional row indices of ``pmc`` to process.

    Returns
    -------
    pandas.DataFrame
        Selected tracks of all kept events, most recently processed event
        first; an empty frame if no event is kept.
    """
    track_columns = ['b_x', 'b_y', 'b_z', 'b_sx', 'b_sy']

    background = pd.DataFrame([
                pbg['s.eX'],
                pbg['s.eY'],
                pbg['s.eZ'],
                pbg['s.eTX'],
                pbg['s.eTY']],
                index=track_columns).T
    background['signal'] = 0

    kept_events = []

    for i in range(begin, end):
        event = pmc.iloc[i]
        shower = pd.DataFrame([
                event['BT_X'],
                event['BT_Y'],
                # Snap z down to a multiple of 1293.
                # NOTE(review): presumably the plate pitch, to align MC tracks
                # with the background z grid -- confirm.
                event['BT_Z'] - event['BT_Z'] % 1293,
                event['BT_SX'],
                event['BT_SY']],
                index=track_columns).T
        shower['signal'] = 1

        data = pd.concat([background, shower])
        data['event'] = event['Event_id']
        data = alpha_creator(data, pmc, i)
        # Keep only tracks inside the cone. The copy makes the selection an
        # independent frame, so adding feature columns below does not write
        # into a slice of the concatenated frame.
        data = data.loc[data['alpha'] < cone_angle].copy()

        # Skip events whose cone contains no background track at all; the
        # features are only computed for the events that are kept.
        if (data['signal'] == 0).any():
            kept_events.append(feature_creator(data, pmc, i))

    if not kept_events:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop. The list is
    # reversed so the newest event comes first, as when each event was
    # prepended to the running result.
    return pd.concat(kept_events[::-1])

In [None]:
# Mixed sample for the first MC event only (rows [0, 1) of pmc).
dmix = combine_mc_bg(pmc, pbg, cone_angle=0.05, begin=0, end=1)
# Everything that is not an identifier, the label or a raw track coordinate is
# a model feature. Sorted so that the feature (column) order is deterministic:
# set iteration order is unspecified and the trained models depend on it.
features = sorted(set(dmix.columns) - {'event', 'signal', 'b_x', 'b_y', 'b_z', 'b_sx', 'b_sy'})

In [None]:
%matplotlib inline
hist_params = {'normed': True, 'bins': 60, 'alpha': 0.6}

plt.figure(figsize=(10, 8))
for n, feature in enumerate(features):
    plt.subplot(len(features) // 2, 3, n+1)
    min_value, max_value = np.percentile(dmix[feature], [1, 99])
    plt.hist(dmix.ix[dmix.signal.values == 0, feature].values, range=(min_value, max_value), 
             label='class 0', **hist_params)
    plt.hist(dmix.ix[dmix.signal.values == 1, feature].values, range=(min_value, max_value), 
             label='class 1', **hist_params)
    plt.legend(loc='best')
    plt.title(feature)

In [None]:
# Strip plot of zb: the y coordinate is pure Gaussian jitter used to spread
# the points vertically; colour encodes the signal flag.
plt.figure(figsize=[16, 8])
plt.title("zb")

zb_values = dmix.zb
jitter = np.random.normal(0, 1, len(zb_values))
plt.scatter(zb_values, jitter, c=dmix.signal, alpha=0.5)

plt.colorbar()

In [None]:
# train_test_split lives in sklearn.model_selection in current scikit-learn;
# fall back to the legacy sklearn.cross_validation location on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

# 90% of the rows for training, 10% held out for validation (fixed seed).
training_data, validation_data = train_test_split(dmix, random_state=11, train_size=0.9)

from sklearn.ensemble import GradientBoostingClassifier as gbm
# Seed the classifier as well, so that re-running the cell reproduces the model.
gradientBoosting = gbm(n_estimators = 200, max_depth = 5, random_state = 11)
# Builtin bool instead of the np.bool alias, which newer NumPy no longer has.
gradientBoosting.fit(training_data[features].astype(np.float64), training_data.signal.astype(bool))

In [None]:
# Class probabilities on the held-out rows; column 1 is the signal class.
gbm_proba = gradientBoosting.predict_proba(validation_data[features].astype(np.float64))
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(roc_auc_score(validation_data.signal, gbm_proba[:,1]))

In [None]:
from sklearn.metrics import roc_curve
from sklearn.ensemble import RandomForestClassifier as rf
# train_test_split lives in sklearn.model_selection in current scikit-learn;
# fall back to the legacy sklearn.cross_validation location on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier as gbm

# Per cone angle: ROC curve coordinates and the fitted model.
fpr = dict()
tpr = dict()
models = dict()

# NOTE: this loop rebinds the notebook-level names dmix, features,
# training_data, validation_data, gradientBoosting and gbm_proba; after the
# cell has run they refer to the last cone angle.
for alpha in [0.06, 0.075, 0.09]:
    # Default begin/end: MC rows [0, 50).
    dmix = combine_mc_bg(pmc, pbg, alpha)
    # Sorted so the column order seen by each model is deterministic.
    features = sorted(set(dmix.columns) - {'event', 'signal', 'b_x', 'b_y', 'b_z', 'b_sx', 'b_sy'})
    training_data, validation_data = train_test_split(dmix, random_state=11, train_size=0.9)

    # Seeded so that re-running the scan reproduces the models.
    gradientBoosting = gbm(n_estimators = 200, max_depth = 5, random_state = 11)
    # Builtin bool instead of the np.bool alias, which newer NumPy no longer has.
    gradientBoosting.fit(training_data[features].astype(np.float64), training_data.signal.astype(bool))
    gbm_proba = gradientBoosting.predict_proba(validation_data[features].astype(np.float64))
    # Named target instead of `_`, which IPython uses for the last cell output.
    fpr[alpha], tpr[alpha], thresholds = roc_curve(validation_data.signal, gbm_proba[:,1])
    models["model{0}".format(alpha)] = gradientBoosting

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(str(alpha) + ": " + str(roc_auc_score(validation_data.signal, gbm_proba[:,1])))

In [None]:
%matplotlib inline
for alpha in list([0.06, 0.075, 0.09]):
    plt.plot(fpr[alpha],tpr[alpha])
plt.show()

In [None]:
from sklearn.externals import joblib

# Persist each trained model under its own file name.
for model_key, file_name in [('model0.06', 'model06.pkl'),
                             ('model0.075', 'model75.pkl'),
                             ('model0.09', 'model9.pkl')]:
    joblib.dump(models[model_key], file_name)