In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import time
from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the 'Data_for_UCI_named' sheet of the workbook into a DataFrame.
df = pd.read_excel(
    "Устойчивост эл_сети.xlsx",
    sheet_name='Data_for_UCI_named',
    engine='openpyxl',
)

In [3]:
# Display the last rows of the loaded frame (pandas shows five by default).
df.tail()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
9995,2.930406,9.487627,2.376523,6.187797,3.343416,-0.658054,-1.449106,-1.236256,0.601709,0.779642,0.813512,0.608385,0.023892,unstable
9996,3.392299,1.274827,2.954947,6.894759,4.349512,-1.663661,-0.952437,-1.733414,0.502079,0.567242,0.28588,0.36612,-0.025803,stable
9997,2.364034,2.84203,8.776391,1.008906,4.299976,-1.380719,-0.943884,-1.975373,0.487838,0.986505,0.149286,0.145984,-0.03181,stable
9998,9.631511,3.994398,2.757071,7.821347,2.514755,-0.96633,-0.649915,-0.89851,0.365246,0.587558,0.889118,0.818391,0.037789,unstable
9999,6.530527,6.78179,4.349695,8.673138,3.492807,-1.390285,-1.532193,-0.570329,0.073056,0.505441,0.378761,0.942631,0.045263,unstable


In [4]:
# Encode the text label as an integer target: unstable -> 0, stable -> 1.
d = {'unstable': 0, 'stable': 1}
# Values that are not keys of `d` are passed through unchanged, so running
# this cell a second time on the already-encoded column is a no-op. A plain
# `.map(d)` would turn every 0/1 into NaN on a re-run.
df['stabf'] = df['stabf'].map(lambda label: d.get(label, label))

In [5]:
# Display the first rows to check that `stabf` now holds the numeric codes.
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,0
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,1
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,0
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,0
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,0


In [6]:
# Feature matrix: every column except the two stability columns.
# `stab` is dropped together with the label `stabf`, so it is not a feature.
X = df.drop(columns=['stab', 'stabf']).values
# Target vector: the encoded `stabf` column.
y = df['stabf'].values

In [7]:
# Min-max scale every feature column to [0, 1], writing the result back
# into X in place (same effect as the per-column loop it replaces).
# NOTE(review): the statistics are taken over the whole data set, before the
# train/test split in the next cell, so the test rows influence the scaling.
col_min = X.min(axis=0)
col_range = X.max(axis=0) - col_min
# A constant column has zero range; dividing by 1 instead maps it to 0.0
# rather than producing 0/0 = NaN.
col_range[col_range == 0] = 1
X[:] = (X - col_min) / col_range

In [8]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [9]:
# Scratch check: `1 - labels` swaps 0 and 1 in a binary array. This is the same
# flip that Seniority_committee.compute_train_loss_class_0 applies to y_train.
1 - np.array([0, 1, 0, 0, 1])

array([1, 0, 1, 1, 0])

In [None]:
class Seniority_committee:
    """
    Binary classifier based on the seniority committee method.

    The committee is an ordered list of hyperplanes through the origin (the
    class adds no intercept; append a constant column to X if one is needed).
    Each hyperplane votes for one class on the observations lying on its
    positive side (``X @ w > 0``). ``fit_predict`` builds up to three
    hyperplanes; after each one is accepted, the observations it covers are
    removed from the working training set.

    Attributes:
        X_train, y_train: working training data. None until ``fit_predict``
            is called; shrinks as hyperplanes are accepted.
        L: penalty applied by the loss to every covered observation of the
            "wrong" class (a covered observation of the voted class counts -1).
        N: minimum number of observations a hyperplane must cover to be accepted.
        weights_hp: weight vectors of the accepted hyperplanes, in order.
        classes_hp: class (0 or 1) each accepted hyperplane votes for,
            aligned with ``weights_hp``.
    """

    # fit_predict starts every search with L = 2 ** _MAX_LOG2_L and halves L
    # until a hyperplane covers at least N rows; it gives up at 2 ** _MIN_LOG2_L.
    _MAX_LOG2_L = 10
    _MIN_LOG2_L = -10

    def __init__(self, N):
        """
        :param N: minimum number of observations that must lie above a
                  hyperplane for it to be accepted
        """
        self.X_train = None
        self.y_train = None
        self.L = -1
        self.N = N
        self.weights_hp = []
        self.classes_hp = []

    @staticmethod
    def probability(X, w):
        """
        Clipped linear score, used as an estimate of P(y=1|x).

        :param X: feature matrix, shape [n_samples, n_features]
        :param w: weight vector, shape [n_features]
        :returns: ``X @ w`` clipped to the interval [0, 1], one value per row
        """
        return np.clip(np.dot(X, w), 0, 1)

    def compute_loss(self, X, y, w):
        """
        Loss of a hyperplane that votes for the rows labelled 1 in ``y``.

        Every row contributes ``(L - (L + 1) * y) * p`` where ``p`` is the
        clipped score from ``probability``: rows with y == 1 contribute -p,
        rows with y == 0 contribute +L * p.

        :param X: feature matrix
        :param y: binary target vector (1 marks the class being voted for)
        :param w: weight vector
        :returns: value of the loss function
        """
        scores = self.probability(X, w)
        return np.sum((self.L - (self.L + 1) * y) * scores)

    def _require_fitted(self, message):
        """Raise ``Exception(message)`` when no training data has been stored."""
        if self.X_train is None or self.y_train is None:
            raise Exception(message)

    def compute_train_loss_class_0(self, w):
        """
        Objective to minimise when the committee member votes for class 0.

        The training labels are flipped (``1 - y_train``) so that class 0
        becomes the rewarded class in ``compute_loss``.

        :param w: weight vector
        :returns: value of the loss function on the training sample
        :raises Exception: if no training data has been stored
        """
        self._require_fitted('Model is not fitted')
        return self.compute_loss(self.X_train, 1 - self.y_train, w)

    def compute_train_loss_class_1(self, w):
        """
        Objective to minimise when the committee member votes for class 1.

        :param w: weight vector
        :returns: value of the loss function on the training sample
        :raises Exception: if no training data has been stored
        """
        self._require_fitted('Model is not fitted')
        return self.compute_loss(self.X_train, self.y_train, w)

    @staticmethod
    def _nelder_mead_options(disp, adaptive, maxiter, xatol):
        """Build the Nelder-Mead options dict, leaving out entries that are None."""
        # An explicit None must not reach the solver: it compares the simplex
        # size against xatol, so None has to mean "use SciPy's default".
        candidates = {'disp': disp, 'adaptive': adaptive, 'maxiter': maxiter, 'xatol': xatol}
        return {key: value for key, value in candidates.items() if value is not None}

    def make_hyperplane(self, class_num, X_train, number_of_hyperplane, c=0.1, cycle_range=100, disp=False,
                        adaptive=True, maxiter=None, xatol=None):
        """
        Fit one committee hyperplane by random-restart Nelder-Mead minimisation.

        The loss is always evaluated on ``self.X_train`` / ``self.y_train``;
        the ``X_train`` argument only supplies the number of features.
        Hyperplanes 1 and 2 return the best of ``cycle_range`` restarts.
        Hyperplane 3 additionally re-optimises every restart that reached a
        negative loss (three more Nelder-Mead runs each) and returns the best
        refined result; if no restart reached a negative loss it falls back to
        the best restart.

        :param class_num: class this committee member votes for: 0 or 1
        :param X_train: training feature matrix (used for its column count)
        :param number_of_hyperplane: ordinal number of the hyperplane: 1, 2 or 3
        :param c: scale of the random initial guess, drawn from (-c/2, c/2)
        :param cycle_range: number of random restarts
        Nelder-Mead options (None means "use the SciPy default"):
            :param disp: bool: print convergence messages
            :param adaptive: bool: adapt the algorithm parameters to the
                             dimensionality of the problem
            :param maxiter: maximum number of iterations per run
            :param xatol: absolute error in the optimum between iterations
                          that is acceptable for convergence
        :returns: weight vector of the best hyperplane found
        :raises Exception: if the model has no training data, or
                           ``number_of_hyperplane`` / ``class_num`` is invalid
        :raises ValueError: if ``cycle_range`` produces no optimisation run
        """
        self._require_fitted('Using make_hyperplane method before fitting')
        if number_of_hyperplane not in [1, 2, 3]:
            raise Exception('You can only make hyperplane number 1, 2 or 3')
        if class_num not in [0, 1]:
            raise Exception('Only binary classification is available, class_num should be 0 or 1')

        objective = self.compute_train_loss_class_1 if class_num == 1 else self.compute_train_loss_class_0
        options = self._nelder_mead_options(disp, adaptive, maxiter, xatol)
        n_features = X_train.shape[1]
        start_time = time.time()

        # Stage 1 (all hyperplanes): independent restarts from small random weights.
        runs = []
        for step in range(cycle_range):
            # x0 must be one-dimensional for scipy.optimize.minimize.
            start_w = (np.random.rand(n_features) - 0.5) * c
            res = minimize(objective, x0=start_w, method='Nelder-Mead', options=options)
            runs.append((step, res.fun, res.x))
        if not runs:
            raise ValueError('cycle_range must be a positive integer')

        if number_of_hyperplane in (1, 2):
            best_step, best_loss, best_w = min(runs, key=lambda run: run[1])
            print('Time taken for optimization: {0}'.format(time.time() - start_time))
            print('The best result was on the step {0}'.format(best_step))
            print('The minimum of the loss function: {0}'.format(best_loss))
            return best_w

        # Stage 2 (hyperplane 3 only): polish every restart that reached a
        # negative loss, first with a coarse tolerance, then twice with defaults.
        refined = []
        for _, fun, start_w_new in runs:
            if not fun < 0:
                continue
            res = minimize(objective, x0=start_w_new, method='Nelder-Mead',
                           options={'disp': False, 'adaptive': True, 'xatol': 1})
            for _ in range(2):
                res = minimize(objective, x0=res.x, method='Nelder-Mead',
                               options={'disp': False, 'adaptive': True})
            refined.append((res.fun, res.x))
        if not refined:
            # No restart reached a negative loss: use the stage-1 results
            # instead of failing on an empty selection.
            refined = [(fun, w) for _, fun, w in runs]

        best_loss, best_w = min(refined, key=lambda item: item[0])
        print('Time taken for optimization: {0}'.format(time.time() - start_time))
        print('The minimum of the loss function: {0}'.format(best_loss))
        return best_w

    @staticmethod
    def cutter(X, w):
        """
        Binarise the linear score of every observation.

        :param X: feature matrix
        :param w: weight vector of a hyperplane
        :returns: array with 1.0 where ``X @ w > 0`` and 0.0 elsewhere
        """
        return np.sign(np.maximum(np.dot(X, w), 0))

    # TODO: prediction on the training sample is not implemented yet.
    def fit_predict(self, X, y, optim_params=None):
        """
        Fit the committee on the training sample.

        For each of up to three hyperplanes the penalty ``L`` starts at
        ``2 ** 10`` and is halved until the class-1 or the class-0 candidate
        covers at least ``N`` remaining rows. The candidate covering more rows
        is accepted (class 0 wins ties), stored in ``weights_hp`` /
        ``classes_hp``, and the rows it covers are removed from
        ``X_train`` / ``y_train``. Fitting stops early when fewer than ``N``
        rows remain or no penalty down to ``2 ** -10`` yields a large enough
        group.

        :param X: feature matrix
        :param y: vector of true binary targets (0 / 1)
        :param optim_params: dict of keyword arguments forwarded to
            ``make_hyperplane`` (``c``, ``cycle_range``, ``disp``,
            ``adaptive``, ``maxiter``, ``xatol``); missing keys keep the
            defaults ``c=0.1, cycle_range=100, disp=False, adaptive=True,
            maxiter=None, xatol=None``
        :returns: None. The fitted committee is available through
            ``weights_hp`` and ``classes_hp``.
        """
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)
        # Start from an empty committee so that refitting does not accumulate.
        self.weights_hp = []
        self.classes_hp = []

        params = {'c': 0.1, 'cycle_range': 100, 'disp': False,
                  'adaptive': True, 'maxiter': None, 'xatol': None}
        params.update(optim_params or {})

        for hp_num in range(1, 4):
            if self.X_train.shape[0] < self.N:
                # No hyperplane can cover N rows any more.
                break

            accepted = False
            for k in range(self._MAX_LOG2_L, self._MIN_LOG2_L - 1, -1):
                self.L = 2 ** k
                hp_weights_class_1 = self.make_hyperplane(1, self.X_train, hp_num, **params)
                hp_weights_class_0 = self.make_hyperplane(0, self.X_train, hp_num, **params)
                covered_1 = self.cutter(self.X_train, hp_weights_class_1) == 1
                covered_0 = self.cutter(self.X_train, hp_weights_class_0) == 1
                n_covered_1 = int(np.sum(covered_1))
                n_covered_0 = int(np.sum(covered_0))

                if n_covered_1 < self.N and n_covered_0 < self.N:
                    # Both groups are too small: relax the penalty and retry.
                    continue

                if n_covered_1 > n_covered_0:
                    voted_class, weights, covered = 1, hp_weights_class_1, covered_1
                else:
                    voted_class, weights, covered = 0, hp_weights_class_0, covered_0

                self.weights_hp.append(weights)
                self.classes_hp.append(voted_class)
                # Later committee members only see what this one left uncovered.
                self.X_train = self.X_train[~covered]
                self.y_train = self.y_train[~covered]
                accepted = True
                break

            if not accepted:
                break
        
        
    

In [1]:
# Scratch check: a freshly created dict compares equal to an empty dict.
weights_hp = {}
weights_hp == dict()

True