### Load and manage data - Basics

In [None]:
def get_data(file):
    """Load one of the known datasets from the ``Data_updated`` folder.

    Parameters
    ----------
    file : str
        Dataset key: ``'simulation_bkg'`` (background.txt),
        ``'simulation_signal'`` (MC_signal.txt) or ``'data'``
        (data_lhcb.txt).

    Returns
    -------
    pandas.DataFrame
        The comma-separated file, read with its first column as the index.

    Raises
    ------
    ValueError
        If ``file`` is not one of the three known keys. ``ValueError`` is a
        subclass of ``Exception``, so callers catching ``Exception`` still work.
    """
    data_folder = 'Data_updated'
    file_names = {
        "simulation_bkg": "background.txt",
        "simulation_signal": "MC_signal.txt",
        "data": "data_lhcb.txt",
    }
    # The isinstance guard keeps unhashable inputs from raising TypeError
    # in the dictionary lookup instead of the documented ValueError.
    if not isinstance(file, str) or file not in file_names:
        raise ValueError("file should be 'simulation_bkg', 'simulation_signal' or 'data'.")
    file_name = os.path.join(data_folder, file_names[file])
    return pd.read_csv(file_name, index_col=0, sep=",")

In [None]:
def check_for_none(data, file):
    """Print a warning when ``data`` contains at least one null value.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.
    file : str
        Name shown in the warning message.

    Returns
    -------
    None
    """
    # One boolean per column: True where the column holds a null value.
    columns_with_nulls = data.isnull().any()
    if np.sum(columns_with_nulls) != 0:
        print('File', file, 'has some None values.')

In [None]:
def load_all_datas(labels=True, check_none=True):
    """Load the background simulation, signal simulation and data tables.

    Parameters
    ----------
    labels : bool, default True
        When true, a ``'Label'`` column is added to the two simulated
        tables: 0 for ``'simulation_bkg'`` and 1 for ``'simulation_signal'``.
        The ``'data'`` table is never labelled.
    check_none : bool, default True
        When true, every loaded table is passed to ``check_for_none``, which
        prints a message if null values are present. Previously this flag
        was ignored: the check always ran with labels and never without.

    Returns
    -------
    list of pandas.DataFrame
        ``[simulation_bkg, simulation_signal, data]`` in that order.
    """
    label_values = {'simulation_bkg': 0, 'simulation_signal': 1}
    datas = []
    for file in ('simulation_bkg', 'simulation_signal', 'data'):
        data = get_data(file)
        if check_none:
            check_for_none(data, file)
        if labels and file in label_values:
            data['Label'] = label_values[file]
        datas.append(data)
    return datas

In [None]:
def get_complementary_cut(data):
    """Select the rows passing the DeltaM cut or lying in the mass window.

    A row is kept when ``DeltaM_F < 360`` or when ``Lambda_b0_MM_F`` lies
    strictly between 5550 and 5680.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with ``'DeltaM_F'`` and ``'Lambda_b0_MM_F'`` columns.

    Returns
    -------
    pandas.DataFrame
        The selected rows, with their original index.
    """
    lambda_b_mass = data["Lambda_b0_MM_F"]
    low_delta_m = data["DeltaM_F"] < 360
    in_mass_window = (lambda_b_mass > 5550) & (lambda_b_mass < 5680)
    return data[low_delta_m | in_mass_window]

### Load and manage data - Advanced

In [None]:
def clean_datas(data, only_useful=False, inplace=False, only_unbiased=False):
    """Drop predefined groups of columns from ``data``.

    The three ``*_BKGCAT_F`` columns are always dropped. Columns listed in
    a group but absent from ``data`` are ignored.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to clean.
    only_useful : bool, default False
        Also drop the ProbNNpi and ENDVERTEX_CHI2 columns listed below.
    inplace : bool, default False
        Modify ``data`` itself and return ``None`` instead of a new table.
    only_unbiased : bool, default False
        Also drop the mass-related columns listed below.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned copy, or ``None`` when ``inplace`` is true.
    """
    candidates = ["Lambda_b0_BKGCAT_F", "lcstar_BKGCAT_F", "Lambda_c_BKGCAT_F"]
    if only_useful:
        candidates.extend([
            "tau_pion0_ProbNNpi_F", "tau_pion1_ProbNNpi_F", "tau_pion2_ProbNNpi_F",
            "lcstar_pim_ProbNNpi_F", "lcstar_pip_ProbNNpi_F", "Lambda_b0_ENDVERTEX_CHI2_F",
            "Lambda_c_ENDVERTEX_CHI2_F", "lcstar_ENDVERTEX_CHI2_F",
        ])
    if only_unbiased:
        candidates.extend(["lcstar_MM_F", "Lambda_b0_MM_F", "DeltaM_F", "pair_lcstar_F"])

    # Only ask pandas to drop columns that actually exist in this table.
    to_drop = set(data.columns).intersection(candidates)

    # All warnings raised by the drop are silenced
    # (presumably pandas copy warnings on in-place drops - confirm).
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        if not inplace:
            return data.drop(columns=to_drop)
        data.drop(columns=to_drop, inplace=True)
    return None

In [None]:
def get_useful_features(datas):
    """Apply ``clean_datas(..., only_useful=True)`` to every table.

    Parameters
    ----------
    datas : iterable of pandas.DataFrame
        Tables to clean; they are not modified.

    Returns
    -------
    list of pandas.DataFrame
        One cleaned copy per input table, in the same order.
    """
    return [clean_datas(table, only_useful=True, inplace=False) for table in datas]

In [None]:
class correlation_cleaner():
    """Drop one column out of every strongly correlated pair of columns.

    ``fit`` computes the correlation matrix of a table. For every pair of
    columns ``(i, j)`` with ``i < j`` whose absolute correlation exceeds the
    threshold, it marks the earlier column ``i`` for removal.

    Parameters
    ----------
    keep : object, optional
        When not ``None``, the threshold is relative: 0.9 times the largest
        absolute off-diagonal correlation. The value itself is not used
        otherwise.
    above : float, default 0.9
        Absolute correlation threshold used when ``keep`` is ``None``.

    Attributes
    ----------
    discard : set
        Columns selected by the most recent ``fit``.
    discarded : set
        Union of the columns selected by every ``fit`` so far.
    """

    def __init__(self, keep=None, above=0.9):
        self.keep = keep
        self.above = above
        self.discard = set()
        self.discarded = set()

    def fit(self, data):
        """Select the columns of ``data`` to discard and store them on ``self``."""
        corr = data.corr().fillna(0)
        # Take names from the correlation matrix itself so they stay aligned
        # with its rows even if corr() did not keep every column of `data`.
        col_names = corr.columns
        # np.abs returns a fresh array, so zeroing the diagonal is safe.
        abs_corr = np.abs(corr.to_numpy())
        col_discard = set()
        if abs_corr.size:
            # Self-correlation must not count towards the maximum.
            np.fill_diagonal(abs_corr, 0)
            if self.keep is not None:
                threshold = 0.9 * abs_corr.max()
            else:
                threshold = self.above
            # Strict upper triangle: row i is flagged when some j > i is
            # correlated with it above the threshold.
            flagged = np.triu(abs_corr > threshold, k=1).any(axis=1)
            col_discard = set(col_names[flagged])
        self.discard = col_discard
        self.discarded = self.discarded.union(col_discard)

    def transform(self, data, inplace=False):
        """Drop every column discarded by any previous ``fit``.

        Returns the reduced copy, or ``None`` when ``inplace`` is true.
        """
        if not inplace:
            return data.drop(columns=self.discarded)
        data.drop(columns=self.discarded, inplace=True)
        return None

    def fit_transform(self, data, inplace_=False, show_corr_mat=False):
        """Fit on ``data`` and drop the columns selected by this fit only.

        When ``show_corr_mat`` is true the correlation matrix is also
        plotted with ``show_corr``. Returns the reduced copy, or ``None``
        when ``inplace_`` is true.
        """
        self.fit(data)
        if show_corr_mat:
            show_corr(data, small=True)
        if not inplace_:
            return data.drop(columns=self.discard)
        data.drop(columns=self.discard, inplace=True)
        return None

In [None]:
def get_uncorrelated_features(datas, cleaner):
    """Run ``cleaner.transform`` on every table without modifying the inputs.

    Parameters
    ----------
    datas : iterable
        Tables to transform.
    cleaner : object
        Fitted object exposing ``transform(data, inplace=False)``.

    Returns
    -------
    list
        The transformed tables, in input order.
    """
    return [cleaner.transform(table, inplace=False) for table in datas]

In [None]:
def take_test_set(datas_or, write_on_file=True, train_size=0.7):
    """Shuffle the simulated samples and split off a balanced test set.

    Parameters
    ----------
    datas_or : sequence of pandas.DataFrame
        ``datas_or[0]`` is the background sample and ``datas_or[1]`` the
        signal sample. The last column of each is used as the target
        (assumed to be the ``'Label'`` column - confirm with the caller).
    write_on_file : bool, default True
        When true, write the test features and targets to
        ``Data_test/x_test.csv`` and ``Data_test/y_test.csv``.
    train_size : float, default 0.7
        Fraction of each shuffled sample kept for training.

    Returns
    -------
    data_train : list of pandas.DataFrame
        ``[background_train, signal_train]``.
    x_test : pandas.DataFrame
        Test features (every column but the last).
    y_test : pandas.Series
        Test targets (the last column).
    """
    data_sim_sign = datas_or[1].sample(frac=1).reset_index(drop=True)
    data_bkg = datas_or[0].sample(frac=1).reset_index(drop=True)

    n_sig_train = int(train_size * len(data_sim_sign))
    n_bkg_train = int(train_size * len(data_bkg))
    data_train = [data_bkg.iloc[:n_bkg_train], data_sim_sign.iloc[:n_sig_train]]

    data_sig_test = data_sim_sign.iloc[n_sig_train:]
    # Keep at most as many background rows as signal rows in the test set.
    data_bkg_test = data_bkg.iloc[n_bkg_train:].iloc[:len(data_sig_test)]

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    data_test = pd.concat([data_sig_test, data_bkg_test], ignore_index=True)
    data_test = data_test.sample(frac=1).reset_index(drop=True)

    x_test = data_test.iloc[:, :-1]
    y_test = data_test.iloc[:, -1]
    if write_on_file:
        x_test.to_csv('Data_test/x_test.csv', mode='w+')
        y_test.to_csv('Data_test/y_test.csv', mode='w+')
    return data_train, x_test, y_test

### Show data

In [None]:
def show_datas(datas, density=True, cuts=False, choose_logs=False):
    """Draw overlaid histograms of every column for up to three tables.

    Columns are taken from ``datas[0]`` (``'Label'`` excluded) and drawn on
    a two-column grid of subplots, one subplot per column.

    Parameters
    ----------
    datas : sequence of pandas.DataFrame
        Tables to compare; drawn in red, green and blue in that order.
    density : bool, default True
        Forwarded to ``hist`` as ``density``.
    cuts : bool, default False
        Use the hard-coded 25-entry list of per-column histogram ranges.
    choose_logs : bool, default False
        Use the hard-coded 25-entry list of per-column log-scale flags.

    Notes
    -----
    The hard-coded lists are positional. Because the lists are zipped with
    the column names, only the first ``min(len(columns), 25)`` columns are
    drawn when either option is enabled.
    """
    col_names = datas[0].columns.tolist()
    if "Label" in col_names:
        col_names.remove("Label")

    plt.figure(figsize=(25, 100))

    if cuts:
        # Positional ranges for a 25-column layout; None means automatic.
        ranges = [None] * 25
        ranges[2] = [3000, 8000]
        ranges[4] = [-15, 25]
        ranges[6] = [-1.5, 1.5]
        ranges[7] = [-2, 2]
        ranges[8] = [0, 10]
    else:
        ranges = [None] * len(col_names)

    if choose_logs:
        # Positions (in a 25-column layout) drawn with a log y-axis.
        log_positions = {0, 1, 2, 3, 12, 13, 15, 16, 17, 18, 19, 20, 21}
        logs = [position in log_positions for position in range(25)]
    else:
        logs = [False] * len(col_names)

    colors = ["red", "green", "blue"]
    n_rows = math.ceil(len(col_names) / 2)
    for ind, (col, range_, log) in enumerate(zip(col_names, ranges, logs), start=1):
        ax = plt.subplot(n_rows, 2, ind)
        ax.set_title(col)
        for data, color in zip(datas, colors):
            ax.hist(data[col], color=color, density=density, alpha=0.5, bins=60, log=log, range=range_)

In [None]:
def show_corr(data, small=False, norm=False, delete_variance=True):
    """Plot the correlation matrix of ``data`` as a red/blue image.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose column correlations are shown.
    small : bool, default False
        Use a 3x3 figure instead of 5x5.
    norm : bool, default False
        Fix the colour scale to ``[-1, 1]`` instead of letting matplotlib
        scale it to the data.
    delete_variance : bool, default True
        Set the diagonal to 0 before plotting.
    """
    dim = 3 if small else 5
    plt.figure(figsize=(dim, dim))
    # Undefined correlations (NaN) are shown as 0. The explicit copy makes
    # sure the array is writable and independent of the DataFrame.
    corr_mat = data.corr().fillna(0).to_numpy(copy=True)
    if delete_variance:
        # Sized by the matrix itself rather than by data.shape[1], so it
        # cannot run out of bounds if corr() kept fewer columns.
        np.fill_diagonal(corr_mat, 0)
    if norm:
        plt.imshow(corr_mat, vmin=-1, vmax=1, cmap='bwr')
    else:
        plt.imshow(corr_mat, cmap='bwr')
    plt.colorbar()

### Common functions for both algorithms

In [None]:
def get_training_datas(datas, max_samples):
    """Build a shuffled training set from all signal and a background subset.

    Parameters
    ----------
    datas : sequence of pandas.DataFrame
        ``datas[0]`` is the background sample, ``datas[1]`` the signal
        sample. The last column is used as the target.
    max_samples : int
        Maximum number of (randomly chosen) background rows to keep.

    Returns
    -------
    x : pandas.DataFrame
        Every column but the last.
    y : pandas.Series
        The last column.
    """
    data_sig = datas[1]
    data_bkg = datas[0].sample(frac=1).reset_index(drop=True)[:max_samples]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    data_train = pd.concat([data_sig, data_bkg], ignore_index=True)
    data_train = data_train.sample(frac=1).reset_index(drop=True)
    return data_train.iloc[:, :-1], data_train.iloc[:, -1]

In [None]:
def train_test_split_mod(datas, max_samples=1e10):
    """Cap the sample sizes, merge signal and background, and split them.

    Parameters
    ----------
    datas : sequence of pandas.DataFrame
        ``datas[0]`` is the background sample, ``datas[1]`` the signal
        sample. The last column is used as the target.
    max_samples : int or float, default 1e10
        Any sample with more than 3000 rows is shuffled and truncated to
        this many rows. The value is converted to ``int`` before slicing.

    Returns
    -------
    x_train, y_train, x_test, y_test
        Stratified split of the merged table; every returned object is
        re-indexed from 1.
    """
    # A float (such as the default 1e10) is not a valid slice bound.
    limit = None if max_samples is None else int(max_samples)
    data_sig = datas[1]
    data_bkg = datas[0]
    if data_sig.shape[0] > 3000:
        data_sig = data_sig.sample(frac=1).reset_index(drop=True)[:limit]
    if data_bkg.shape[0] > 3000:
        data_bkg = data_bkg.sample(frac=1).reset_index(drop=True)[:limit]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    data_tot = pd.concat([data_sig, data_bkg], ignore_index=True)

    features = data_tot.iloc[:, :-1]
    target = data_tot.iloc[:, -1]
    # NOTE(review): test_size=0.7 leaves only 30% of the rows for training,
    # while take_test_set uses a 0.7 *train* fraction - confirm this is intended.
    x_train, x_test, y_train, y_test = train_test_split(features, target,
                                                        test_size=0.7, shuffle=True,
                                                        stratify=target)
    for part in (x_train, y_train, x_test, y_test):
        part.index = np.arange(1, len(part) + 1)
    return x_train, y_train, x_test, y_test

In [None]:
def optimize(x_train, x_test, optimized=False, useful=False):
    """Return the "useful features" versions of the train and test tables.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Tables passed through ``get_useful_features``.
    optimized : bool, default False
        Not used by this function.
    useful : bool, default False
        When true, return the cleaned tables; otherwise return ``None``.

    Returns
    -------
    tuple of pandas.DataFrame or None
        ``(x_train_useful, x_test_useful)`` when ``useful`` is true.
    """
    cleaned_train, cleaned_test = get_useful_features([x_train, x_test])
    if not useful:
        return None
    return cleaned_train, cleaned_test

In [None]:
def prepare_data(datas, useful=False, optimized=False,
                 return_all_datas_optimized=False, return_all_datas_useful=False):
    """Clean four tables and return either the tables or a train/test split.

    Parameters
    ----------
    datas : sequence of four pandas.DataFrame
        Unpacked as background simulation, signal simulation, LHCb data and
        complementary data, in that order.
    useful : bool, default False
        Return a split of the "useful features" simulated tables
        (``max_samples=1300``).
    optimized : bool, default False
        Not used by this function.
    return_all_datas_optimized : bool, default False
        Return the four tables after correlated columns have been removed.
    return_all_datas_useful : bool, default False
        Return the four "useful features" tables. Checked first.

    Returns
    -------
    list of pandas.DataFrame or tuple
        The four tables, or the ``train_test_split_mod`` result
        ``(x_train, y_train, x_test, y_test)``.
    """
    bkg_usf, sig_usf, lhcb_usf, compl_usf = get_useful_features(datas)
    useful_tables = [bkg_usf, sig_usf, lhcb_usf, compl_usf]

    if return_all_datas_useful:
        return useful_tables
    if useful:
        return train_test_split_mod([bkg_usf, sig_usf], max_samples=1300)

    # The columns to discard are chosen on the LHCb data table only, then
    # the same columns are removed from every table.
    cleaner = correlation_cleaner(above=0.8)
    cleaner.fit(lhcb_usf)
    bkg_opt, sig_opt, lhcb_opt, compl_opt = get_uncorrelated_features(useful_tables, cleaner)

    if return_all_datas_optimized:
        return [bkg_opt, sig_opt, lhcb_opt, compl_opt]
    return train_test_split_mod([bkg_opt, sig_opt], max_samples=1500)

In [None]:
def show_result(y_pred_prob, y_test, title=None, small=False, nn=False):
    """Plot the predicted-score distributions split by true label.

    Two normalised histograms are drawn on a new figure: first the scores
    of the rows whose true label is 1, then those whose true label is 0.

    Parameters
    ----------
    y_pred_prob : array-like
        Predicted scores. Used as-is when ``nn`` is true; otherwise the
        first entry of each row is taken (presumably the first class
        column of a ``predict_proba`` output - confirm).
    y_test : pandas.Series
        True labels, aligned by position with ``y_pred_prob``.
    title : str, optional
        Figure title.
    small : bool, default False
        Use a 3x3 figure instead of 5x5.
    nn : bool, default False
        Selects how ``y_pred_prob`` is read (see above).
    """
    scores = y_pred_prob if nn else [prob[0] for prob in y_pred_prob]
    y_pred_prob_hist = pd.DataFrame(scores, columns=["Label"])
    y_test_hist = pd.DataFrame(y_test.values, columns=["Label"])

    plt.figure(figsize=(3, 3) if small else (5, 5))
    for true_label in (1, 0):
        selected = y_pred_prob_hist[(y_test_hist == true_label).values]["Label"]
        plt.hist(selected, density=True, alpha=0.5, log=False)
    plt.title(title)
    return

In [None]:
def give_result(clf, x_test, y_test, nn=False):
    """Print the accuracy of ``clf`` on the test set and plot its scores.

    Parameters
    ----------
    clf : estimator
        Fitted object exposing ``predict`` and ``predict_proba``.
    x_test : array-like
        Test features.
    y_test : pandas.Series
        True labels.
    nn : bool, default False
        Forwarded to ``show_result`` as its ``nn`` flag.
    """
    y_pred = clf.predict(x_test)
    print(y_pred.shape)
    print(y_test.shape)
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: \t', accuracy )
    y_pred_prob = clf.predict_proba(x_test)
    # Pass nn by keyword: positionally it would land in show_result's
    # `title` parameter and the flag would be silently ignored.
    show_result(y_pred_prob, y_test, nn=nn)
    return

### Best number of samples from the background

In [None]:
def get_plots_fraction_tot_samples(data1, data2, samples):
    """Train a random forest for several sample caps and plot each result.

    For every value in ``samples`` the two tables are split with
    ``train_test_split_mod(..., max_samples=value)``, the same forest object
    is refitted on the training part, and its predicted probabilities on
    the test part are plotted with ``show_result``.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        Passed to ``train_test_split_mod`` as ``[data1, data2]``.
    samples : iterable of int
        Values used for ``max_samples``.
    """
    forest = RandomForestClassifier(n_estimators=300, max_features=3, bootstrap=True, n_jobs=-1)
    plt.figure(figsize=(25, 25))
    for cap in samples:
        split = train_test_split_mod([data1, data2], max_samples=cap)
        x_train_opt, y_train_opt, x_test_opt, y_test_opt = split
        forest.fit(x_train_opt, y_train_opt)
        probabilities = forest.predict_proba(x_test_opt)
        show_result(probabilities, y_test_opt,
                    title=str(cap) + " samples of the background", small=True)