In [1]:
# все библиотеки

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from itertools import product
%matplotlib inline
import seaborn
plt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
# считывание данных

df_pion = pd.read_csv('data/v1_pion_train.csv')
df_kaon = pd.read_csv('data/v1_kaon_train.csv')
df_electron = pd.read_csv('data/v1_electron_train.csv')
df_ghost = pd.read_csv('data/v1_ghost_train.csv')
df_muon = pd.read_csv('data/v1_muon_train.csv')
df_proton = pd.read_csv('data/v1_proton_train.csv')

In [3]:
x_cols = ['TrackP', 'TrackEta', 'NumLongTracks']
y_cols = ['RichDLLbt', 'RichDLLk', 'RichDLLmu', 'RichDLLp', 'RichDLLe']

In [23]:
def create_masks(count, features, bins):
    masks = {}
    for col in features.columns:
        mask = np.zeros((len(features[col]), count), dtype=np.bool)
        mask[:, 0], mask[:, -1] = features[col] <= bins[col][0], features[col] > bins[col][-1]
        for i in range(count - 2):
            mask[:,i+1] = (bins[col][i] < features[col]) & (features[col] <= bins[col][i+1])
        masks[col] = mask
    return masks

In [24]:
class Model:
    def train(self, X, Y, n_bins=5):
        self.means = dict.fromkeys(Y.columns, np.zeros((n_bins, n_bins, n_bins)))
        self.stds = dict.fromkeys(Y.columns, np.zeros((n_bins, n_bins, n_bins)))
        self.bins = {}
        for col in X.columns:
            self.bins[col] = np.percentile(X[col], 100*np.linspace(1./n_bins, (n_bins-1)/n_bins,n_bins-1))
        self.masks = create_masks(n_bins, X, self.bins)
        
        for i in range(n_bins):
            for j in range(n_bins):
                for k in range(n_bins):
                    cols = X.columns
                    f_masks = np.logical_and(self.masks[cols[0]][:,i], 
                                             self.masks[cols[1]][:,j],
                                             self.masks[cols[2]][:,k]) 
                    for col in Y.columns:
                        self.means[col][i,j,k] = np.mean(Y[col][f_masks])
                        self.stds[col][i,j,k] = np.std (Y[col][f_masks])
        

    def predict(self, X):
        pred = pd.DataFrame()
        count = np.zeros((self.means['RichDLLk'].shape), dtype=int)
        n_bins = count.shape[0]
        pred_masks = create_masks(n_bins, X, self.bins)

        for i in range(n_bins):
            for j in range(n_bins):
                for k in range(n_bins):
                    count[i,j,k] = np.count_nonzero(np.logical_and(masks[cols[0]][:,i],
                                                                   masks[cols[1]][:,j],
                                                                   masks[cols[2]][:,k]))
        for col in self.means.keys():
            samples = np.array([])
            for i in range(n_bins):
                for j in range(n_bins):
                    for k in range(n_bins):
                        cols = X.columns
                        samples = np.append(samples,
                                         np.random.normal(
                                            loc=self.means[col][i,j,k],
                                            scale=self.stds[col][i,j,k],
                                            size=count[i,j,k]
                                         )
                                        )
            pred[col] = samples
        return pred

In [25]:
from sklearn.model_selection import KFold
from scipy.stats import ks_2samp

In [26]:
#https://github.com/SiLiKhon/RICH_GAN_misc/blob/master/coopetition/scoring_program/score.py

def score_func(sample1, sample2):
    score = 0
    cols = sample1.columns
    w_normal = np.random.normal(size=(100, len(cols)))
    reference = sample1.copy()[cols].values
    prediction = sample2.copy()[cols].values
    for k in range(100):
        score = max(score,
                    ks_2samp(
                        np.sum(w_normal[k] * reference, axis=1), 
                        np.sum(w_normal[k] * prediction, axis=1)
                    )[0]
                   )
    return score

In [27]:
kf = KFold(n_splits=5)

model_scores = []
best_scores  = []

for train_index, test_index in kf.split(df_pion[x_cols]):
    X_train = df_pion[x_cols].iloc[train_index]
    Y_train = df_pion[y_cols].iloc[train_index]
    X_test  = df_pion[x_cols].iloc[test_index ]
    Y_test  = df_pion[y_cols].iloc[test_index ]
  
    model = Model()
    model.train(X_train.copy(), Y_train.copy())
    Y_pred = model.predict(X_test.copy())
  
    model_scores.append(score_func(Y_test, Y_pred))
    best_scores.append(score_func(Y_test, Y_train))

NameError: name 'masks' is not defined

In [None]:
for col in Y_pred.columns:
    plt.subplots(figsize=(20, 10))
    plt.grid()
    _, bins, _ = plt.hist(Y_test[col], bins=100 , label='test'      )
    _, _   , _ = plt.hist(Y_pred[col], bins=bins, label='prediction', alpha=0.7)
    plt.legend(loc='best')
    plt.xlim(left=-200)
    plt.xlabel(col)
    plt.show();

In [31]:
pd.cut(df_kaon.TrackPt, 3, retbins=True)[1]

array([-6.09064270e+01,  2.68801740e+04,  5.37406729e+04,  8.06011719e+04])

In [37]:
np.array(df_kaon.iloc[:,0])

array([ 733.72082519,  766.3895874 ,  454.28909302, ...,  998.46643066,
        191.11610413, 1095.25842285])

In [None]:
def train(trX, trY, col, target bins=5, n_splits=5)
    lr = LinearRegression(normalize=True)
    _, bound_bins = pd.cut(trX, bins, retbins=True)
    curX, curY = trX[col], trY[target]
    cur_train = pd.concat([curX, curY])
    model_scores = []
    best_scores  = []
    for i in range(1, bound_bins.shape[0]):
        kf = KFold(n_splits=n_splits)
        train_bin = cur_train.loc[(cur_train[col] > bins[i-1]) & (cur_train[col] <= bins[i])]
        for train_index, test_index in kf.split(cur_train):
            b_trX, b_trY = train_bin[col].iloc[train_index], train_bin[target].iloc[train_index]
            b_teX, b_teY = train_bin[col].iloc[test_index], train_bin[target].iloc[test_index]
            lr.fit(b_trX, b_trY)
            model_scores.append(score_func(Y_test, Y_pred))
            best_scores .append(score_func(Y_test, Y_train))
            

def test(teX, teY):

In [None]:
trX, teX, trY, teY = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
    for col in x_cols:
        for target in y_cols:

## GAN

In [None]:
class GAN():
    def __init__(self):
        self.RichDLL_count = 3
        self.features = 5
        optimizer = Adam(0.0002, 0.5)
        
        self.generator = self.build_generator()
        self.discriminator = self.build_discriminator(loss='binary_crossentropy',
                                                      optimizer=optimizer,
                                                      metrics=['accuracy'])
        noise = Input(shape=(self.latent_dim,))
        
        def build_generator(self):
            model = Sequential()
            model.add(Dense(128, input_dim=self.latent_dim))
            model.add(LeakyReLU(alpha=0.1))
            model.add(BatchNormalization(momentum=0.8))
            model.add(Dense(3, activation='tanh'))
            
            model.summary()

            noise = Input(shape=(self.features,))
            RichDLLs = model(noise)
            return Model(noise, RichDLLs)
        
        def build_discriminator(self):
            model = Sequential()
            model.add(Dense(128))
            model.add(LeakyReLU(alpha=0.1))
            model.add(Dense(1, activation='sigmoid'))
            
            model.summary()

            RichDLLs = Input(shape=self.RichDLL_count)
            validity = model(RichDLLs)
            return Model(RichDLLs, validity)
        
        def train(self, epochs, batch_size=128, sample_interval=50):
            #Load the dataset
            for epoch in range(epochs):
                gen_RichDLLs = self.generator.predict(noise)
if __name__ == '__main__':
    gan = GAN()
    gan.train(epochs=30000, batch_size=32, sample_interval=200)

## Cramer GAN