Краткое описание данных:

    Date - Дата наблюдений
    Location - Название локации, в которой расположена метеорологическая станция
    MinTemp - Минимальная температура в градусах цельсия
    MaxTemp - Максимальная температура в градусах цельсия
    Rainfall - Количество осадков, зафиксированных за день в мм
    Evaporation - Так называемое "pan evaporation" класса А (мм) за 24 часа до 9 утра
    Sunshine - Число солнечных часов за день
    WindGustDir - направление самого сильного порыва ветра за последние 24 часа
    WindGustSpeed - скорость (км / ч) самого сильного порыва ветра за последние 24 часа
    WindDir9am - направление ветра в 9 утра

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import seaborn as sns
import time
from sklearn.datasets import make_classification
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib notebook

### Загрузка данных

In [2]:
X = pd.read_csv('weather.csv')
X.columns

Index(['Unnamed: 0', 'Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall',
       'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am',
       'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
       'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

In [3]:
X

Unnamed: 0.1,Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,145454,2017-06-20,Uluru,3.5,21.8,0.0,,,E,31.0,...,59.0,27.0,1024.7,1021.2,,,9.4,20.9,No,No
142189,145455,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No
142190,145456,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No
142191,145457,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No


In [4]:
# Pull the target column out of the feature frame and encode it as
# -1 ("No") / +1 ("Yes").
y = X.pop('RainTomorrow').replace({'No': -1, 'Yes': 1})

In [5]:
y.sample(5)

71231   -1
5477    -1
32996   -1
66049   -1
54667   -1
Name: RainTomorrow, dtype: int64

## Работа с признаками

удаление лишних и изменение признаков

In [6]:
# Drop the unnamed helper column that came with the CSV.
del X['Unnamed: 0']

# Parse the date column once and derive the calendar features from it
# (the original parsed the same column three times).
dates = pd.to_datetime(X.Date)
data = pd.DataFrame({
    'day': dates.dt.day,
    'year': dates.dt.year,
    'month': dates.dt.month})
X = X.join(data)
del X['Date']

# Encode today's rain flag as 0/1; values other than 'No'/'Yes' are untouched.
X.RainToday = X.RainToday.replace({'No': 0, 'Yes': 1})
X.columns

Index(['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'day', 'year', 'month'],
      dtype='object')

Выделение числовых данных в датасете

In [7]:
# Numeric columns: every missing value is replaced by its column mean.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split done later -- confirm this is acceptable.
numeric_data = X.select_dtypes(include=[np.number])
numeric_features = numeric_data.columns
numeric_data_mean = numeric_data.mean()

X = X.fillna(value=numeric_data_mean)
X[numeric_features].sample(5)

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,day,year,month
129284,8.9,13.3,2.6,8.6,10.0,87.0,30.0,31.0,32.0,47.0,993.3,998.0,1.0,6.0,11.1,9.8,1.0,26,2015,11
130852,2.1,16.1,0.0,5.469824,7.624853,39.0,4.0,22.0,84.0,70.0,1011.4,1007.0,4.437189,4.503167,9.2,14.1,0.0,17,2011,9
23458,22.5,27.3,0.0,6.6,7.624853,48.0,24.0,22.0,74.0,78.0,1014.6,1012.9,8.0,8.0,25.4,24.3,0.0,8,2017,2
86377,23.5,29.5,1.0,3.8,5.5,54.0,39.0,37.0,69.0,62.0,1013.8,1011.0,5.0,7.0,27.3,28.0,0.0,1,2013,5
65401,2.1,15.1,0.0,1.8,8.1,72.0,39.0,31.0,81.0,52.0,1027.5,1024.5,1.0,3.0,9.0,14.9,0.0,17,2016,7


 Выделение категориальных данных в датасете

In [8]:
# Object-typed columns are the categorical features; missing entries get an
# explicit "NotGiven" category.
categorical = X.select_dtypes(include=["object"]).columns.tolist()
X[categorical] = X[categorical].fillna("NotGiven")
X[categorical].sample(5)

Unnamed: 0,Location,WindGustDir,WindDir9am,WindDir3pm
85769,Cairns,SE,S,SE
34106,SydneyAirport,WSW,N,N
108169,Albany,NotGiven,NotGiven,N
106186,Albany,NotGiven,NNW,NW
20929,NorfolkIsland,SE,SE,SE


Масштабирование числовых признаков и one-hot-кодирование категориальных признаков

In [9]:
# One-hot encode the categorical columns and standardise the numeric ones.
# sparse_threshold=0 makes the transformer always return a dense array, so
# the result no longer depends on how sparse the stacked output happens to
# be (calling .toarray() on an already dense result would fail).
preprocessor = ColumnTransformer(
    [
        ('ohe', OneHotEncoder(handle_unknown="ignore"), categorical),
        ('scaling', StandardScaler(), numeric_features),
    ],
    sparse_threshold=0,
)
print("Size before OneHot:", X.shape)
X = pd.DataFrame(preprocessor.fit_transform(X))
print("Size after OneHot:", X.shape)

Size before OneHot: (142193, 24)
Size after OneHot: (142193, 120)


upsampling data

In [10]:
y.value_counts()

-1    110316
 1     31877
Name: RainTomorrow, dtype: int64

In [11]:
# Integer oversampling factor for the positive class:
# floor(#negative / #positive) + 1.
n_negative = len(X.loc[y == -1])
n_positive = len(X.loc[y == 1])
rat = n_negative // n_positive + 1

In [12]:
# Attach the target so that features and labels are resampled together.
X = pd.concat([X,y], axis=1)
df_1 = X.loc[y == 1]
#X = pd.concat([X.loc[y == 0], df_1]).sample(frac=1)
# Repeat every positive row `rat` times.
df_1 = df_1.loc[df_1.index.repeat(rat)]
# Combine with the negative rows and shuffle the result.
# NOTE(review): this oversampling runs before the train/test split performed
# later in the notebook, so copies of one row can land in both parts and
# inflate the test metrics -- consider splitting first and oversampling only
# the training part.
X = pd.concat([X.loc[y == -1], df_1]).sample(frac=1)
y = X.RainTomorrow
del X['RainTomorrow']

y.value_counts()

 1    127508
-1    110316
Name: RainTomorrow, dtype: int64

Добавление единичного столбца для bias

In [13]:
# Append a constant column of ones so the last weight plays the role of the
# intercept.
X_numpy = X.to_numpy()
bias = np.ones(len(X))
X_numpy = np.column_stack((X_numpy, bias))
X = pd.DataFrame(X_numpy)

In [14]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,111,112,113,114,115,116,117,118,119,120
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.733380e-01,-1.508815,-4.219974e-16,0.558160,0.889680,0.000000,-0.990562,-1.085659,-1.284854,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.220966e+00,-0.191912,2.360587e-01,0.233693,0.234729,1.873642,-0.308598,-1.085659,-0.701167,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.654983e+00,0.000000,-4.219974e-16,-0.832415,-0.885964,-0.539050,-1.104223,1.275387,-0.117480,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.434058e-01,1.563959,1.661439e+00,0.079184,-0.638538,1.873642,-1.445205,-0.692151,-1.576698,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.443002e+00,-0.191912,7.111855e-01,-1.002374,-1.555469,1.873642,-0.763241,-0.692151,0.758051,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.851707e-01,0.247056,-1.189322e+00,-1.048727,-1.191608,1.873642,-0.081277,0.488372,-0.117480,1.0
237820,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.595118e+00,-1.508815,-1.664449e+00,1.825128,2.199581,-0.539050,0.146044,-1.479166,1.341738,1.0
237821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.701449e-14,0.000000,-4.219974e-16,0.280045,0.074630,-0.539050,-1.445205,0.488372,1.341738,1.0
237822,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-5.325246e-01,0.000000,-4.219974e-16,-0.585202,-0.885964,1.873642,-0.422259,0.094864,0.466207,1.0


### Разделение датасета на обучающую и тестовую выборки

In [15]:
# Keep 75% of the rows for training; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=42,
)

### Реализация логистической регрессии
__Логистическая регрессия__

$$p(y = +1 \mid x) = a(x, \theta) = \sigma(\langle \theta, x \rangle) = \frac{1}{1 + \exp(-\langle \theta, x \rangle)}$$

In [16]:
def probability(theta, X):
    """Return the sigmoid of the linear score ``<X, theta>``.

    Args:
        theta: weight vector.
        X: feature matrix (or a single feature vector) compatible with
            ``np.dot(X, theta)``.

    Returns:
        ``1 / (1 + exp(-<X, theta>))`` evaluated element-wise.
    """
    scores = np.dot(X, theta)
    return 1 / (1 + np.exp(-scores))

Функция предсказания метки класса: получает на вход веса $\theta$ и матрицу признаков $X$, вычисляет вероятности принадлежности к классу $+1$ и по порогу выдает метки классов $y \in \{-1, +1\}$

In [17]:
def binary_class_prediction(theta, X, threshold =.5):
    """Predict class labels in {-1, +1} from the model probabilities.

    Args:
        theta: weight vector passed on to ``probability``.
        X: feature matrix passed on to ``probability``.
        threshold: probabilities strictly below this value are labelled -1,
            everything else +1.

    Returns:
        Integer array of -1 / +1 labels with the same shape as the
        probabilities.
    """
    prob = probability(theta, X)
    # Vectorised thresholding: avoids the per-element Python call made by
    # np.vectorize and also works for empty input.
    return np.where(prob < threshold, -1, 1)

__Функционал качества логистической регрессии__

Запишем правдоподобие выборки для меток класса $y \in \{+1, -1\}$ 

$$Likelihood(a, X^\ell) = \prod_{i = 1}^{\ell} a(x_i,\theta)^{[y_i = +1]} (1 - a(x_i, \theta))^{[y_i = -1]} → \operatorname*{max}_{\theta}$$ 

Прологарифмируем правдоподобие выборки и перейдем к задаче минимизации:

$$Q(a, X^\ell) = -\sum_{i = 1}^{\ell} \left(
        [y_i = +1] \log a(x_i, \theta)
        +
        [y_i = -1] \log (1 - a(x_i, \theta)) \right) \to \operatorname*{min}_{\theta}$$ 
        
Подставим $a(x, \theta)$ в функционал качества:

$$ Q(a, X^\ell) = -\sum_{i = 1}^{\ell} \left(
    [y_i = +1]
    \log \frac{1}{1 + \exp(-\langle \theta, x_i \rangle)}
    +
    [y_i = -1]
    \log \frac{\exp(-\langle \theta, x_i \rangle)}{1 + \exp(-\langle \theta, x_i \rangle)}
\right)
=\\
=
-\sum_{i = 1}^{\ell} \left(
    [y_i = +1]
    \log \frac{1}{1 + \exp(-\langle \theta, x_i \rangle)}
    +
    [y_i = -1]
    \log \frac{1}{1 + \exp(\langle \theta, x_i \rangle)}
\right)
=\\
=
\sum_{i = 1}^{\ell}
    \log \left(
        1 + \exp(-y_i \langle \theta, x_i \rangle)
    \right) $$
    

Итоговый оптимизируемый функционал качества (logloss), записанный для меток классов $y \in \{+1, -1\}$ и усредненный по выборке

$$Q(a, X^\ell) = \frac{1}{\ell}\sum_{i = 1}^{\ell}
    \log \left(
        1 + \exp(-y_i \langle \theta, x_i \rangle)
    \right) \to \operatorname*{min}_{\theta}$$

Реализуем его в функции logloss:

In [18]:
def logloss(theta, X, y): 
    """Mean logistic loss for labels in {-1, +1}.

    Computes ``sum_i log(1 + exp(-y_i * <x_i, theta>)) / X.shape[0]``.

    Args:
        theta: weight vector.
        X: feature matrix, one sample per row.
        y: labels in {-1, +1}, one per row of ``X``.

    Returns:
        The loss as a float.
    """
    margins = -y * np.dot(X, theta)
    # logaddexp(0, m) equals log(1 + exp(m)) but does not overflow to inf
    # when m is large.
    return np.sum(np.logaddexp(0, margins)) / X.shape[0]

### Алгоритм оптимизации функционала качества. Стохастический градиентный спуск. Mini-batch.

<b>Вход: </b> Выборка $X^\ell$, темп обучения $h$

<b>Выход: </b> оптимальный вектор весов $\theta$

1.  Инициализировать веса $\theta$
2.  Инициализировать оценку функционала качества: $Q(a, X^\ell)$
3.  <b>Повторять</b>: 

    Выбрать случайным образом подвыборку объектов $X^{batch} =\{x_1, \dots,x_n \}$ из $X^{\ell}$
    
    Рассчитать градиент функционала качества: $\nabla Q(X^{batch}, \theta)$
    
    Обновить веса: $\theta := \theta - h\cdot \nabla Q(X^{batch}, \theta)$
       
    <b>Пока</b> значение $Q$ и/или веса $\theta$ не сойдутся   

Реализуем функцию расчёта градиента функционала качества

$$\frac{\partial Q(a, X^{batch}) }{\partial \theta_j}   = \frac{\partial \frac{1}{n}\sum_{i = 1}^{n}
    \log \left(
        1 + \exp(- y_i \langle \theta, x_i \rangle)
    \right)} {\partial \theta_j}  = \frac{1}{n}\sum_{i = 1}^{n}
     \frac {1}{
        1 + \exp(- y_i \langle \theta, x_i \rangle)} \cdot  \exp(- y_i \langle \theta, x_i \rangle) \cdot (-y_i x_{ij})$$

Реализуйте расчёт градиента в матричном виде:

In [19]:
def gradient(theta, X, y):
    """Gradient of the mean logistic loss with respect to ``theta``.

    Matrix form of
    ``1/n * sum_i sigmoid(-y_i * <x_i, theta>) * (-y_i * x_i)``.

    Args:
        theta: weight vector of length d.
        X: 2-D array of shape (n, d), one sample per row.
        y: 1-D array of n labels in {-1, +1}.

    Returns:
        1-D ``np.ndarray`` of length d.
    """
    margins = y * np.dot(X, theta)
    # sigmoid(-margin) written as exp(-log(1 + exp(margin))): this stays
    # finite for large |margin|, whereas exp(m) / (1 + exp(m)) turns into
    # inf * 0 = nan once exp overflows.
    weights = np.exp(-np.logaddexp(0, margins))
    # One matrix product replaces the per-weight Python loop.
    return np.dot(-y * weights, X) / X.shape[0]

Функция обучения уже реализована

In [20]:
def fit(X, y, batch_size= 256, h = 0.5,  iters = 100, plot= False):
    """Fit logistic-regression weights with mini-batch gradient descent.

    Args:
        X: 2-D NumPy array of features, one sample per row.
        y: 1-D NumPy array of labels aligned with the rows of ``X``.
        batch_size: number of rows drawn (without replacement) per step;
            capped at the number of rows in ``X``.
        h: learning rate.
        iters: number of update steps.
        plot: when true, redraw the weights, the full-data logloss and the
            weight trajectories after every step.

    Returns:
        The weight vector after ``iters`` updates.
    """
    # matrix dimensions
    size, dim = X.shape

    # a batch is sampled without replacement, so it cannot be larger than
    # the dataset itself
    batch_size = min(batch_size, size)

    # random initial weights
    theta = np.random.uniform(size=dim)

    errors = []
    # Store copies: theta is updated in place below, so keeping a reference
    # would silently overwrite the recorded starting point.
    theta_history = [theta.copy()]

    if plot:
        fig = plt.figure(figsize=(7,5))
        ax1 = fig.add_subplot(221)
        ax2 = fig.add_subplot(222)
        ax3 = fig.add_subplot(212)
        fig.suptitle('Gradient descent')

    for step in range(iters):

        # draw a random mini-batch
        batch = np.random.choice(size, batch_size, replace=False)
        X_batch = X[batch]
        y_batch = y[batch]

        # compute the gradient on the mini-batch
        grad = gradient(theta, X_batch, y_batch)

        assert isinstance(grad, np.ndarray), 'неверный тип'
        assert grad.ndim == 1, 'Необходимо вернуть одномерный вектор'
        assert grad.shape[0] == len(theta), 'длина вектора должна быть равной количеству весов'

        # update the weights
        theta -= grad * h

        if plot:
            # The history and the full-data loss are only consumed by the
            # plots, so they are only computed when plotting.
            theta_history.append(theta.copy())
            errors.append(logloss(theta, X, y))

            ax1.clear()
            ax1.scatter(range(dim), theta, label='Gradient solution')
            ax1.legend(loc="upper left")
            ax1.set_title('theta')
            ax1.set_ylabel(r'$\bar \beta$')
            ax1.set_xlabel('weight ID')

            # Clear before redrawing so lines do not pile up step after step.
            ax2.clear()
            ax2.plot(range(step + 1), errors, 'g-')
            ax2.set_title('logloss')
            ax2.set_xlabel('itarations')

            ax3.clear()
            ax3.plot(np.vstack(theta_history))
            ax3.set_title('update theta')
            ax3.set_ylabel('value')
            ax3.set_xlabel('itarations')
            fig.canvas.draw()

    return theta

In [21]:
optimal_theta = fit(x_train.to_numpy(),y_train.to_numpy())

In [22]:
print(optimal_theta)

[ 2.74858223e-01  8.91967822e-02  1.32118614e-01  3.52817265e-01
  5.79286737e-01  3.08059316e-01  4.63600828e-01  6.07060117e-01
  4.01071995e-01  5.30755682e-02  7.99801840e-01  3.11799748e-01
  5.54977202e-01  6.49315595e-01  3.42458476e-02  3.53167390e-01
  2.61696577e-01  5.48291385e-01  6.12537007e-01  1.83507504e-01
  7.00569915e-01  8.80766908e-01  5.94743746e-01  7.35344506e-01
  5.19297959e-01  6.14605161e-01  2.89467431e-01  1.86665626e-01
  3.67767368e-01  3.83710517e-01  5.07831220e-01  4.52847449e-01
  6.02687941e-01  7.21244796e-01  8.66381535e-01  6.34743370e-01
  6.24088617e-01  2.43801205e-01  6.08404678e-01  5.88066729e-01
  5.07368737e-01  5.58840709e-02  3.80098629e-02  2.35650112e-01
  4.86484723e-01  4.62954669e-01  8.91582777e-01  6.43447192e-01
  7.00902265e-01  3.93686520e-01  1.30014467e-01  1.52725237e-02
  6.85971295e-01 -9.28161160e-02  4.56340150e-01  3.56561559e-01
  5.84794407e-01  1.92588531e-01  5.19812530e-01  3.11556679e-01
  3.70775757e-02  1.26769

In [23]:
y_train_pred = binary_class_prediction(optimal_theta, x_train)
y_pred = binary_class_prediction(optimal_theta, x_test)

In [24]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

          -1       0.75      0.76      0.75     27599
           1       0.79      0.78      0.78     31857

    accuracy                           0.77     59456
   macro avg       0.77      0.77      0.77     59456
weighted avg       0.77      0.77      0.77     59456



In [25]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

          -1       0.75      0.77      0.76     82717
           1       0.80      0.78      0.79     95651

    accuracy                           0.77    178368
   macro avg       0.77      0.77      0.77    178368
weighted avg       0.78      0.77      0.77    178368



# Реализация логистической регрессии при помощи PyTorch

In [26]:
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
from torch import optim

In [27]:
torch.__version__

'1.12.1+cpu'

![EX1_IMG1.jpg](attachment:EX1_IMG1.jpg)

### Автоматическое дифференцирование

![EX1_IMG2.jpg](attachment:EX1_IMG2.jpg)

In [28]:
# Wrap each label Series into a one-column frame (column named after the
# Series).
y_train = y_train.to_frame()
y_test = y_test.to_frame()

In [29]:
# Relabel the negative class from -1 to 0 so the targets are in {0, 1}.
y_train = y_train['RainTomorrow'].replace(to_replace=-1, value=0)
y_test = y_test['RainTomorrow'].replace(to_replace=-1, value=0)

In [30]:
class LogisticRegressionTorch(nn.Module):
    """Logistic regression: one linear layer followed by a sigmoid."""

    def __init__(self, input_size, output_size):
        super().__init__()
        # Affine map from the input features to the output scores.
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, X):
        """Return the sigmoid of the linear scores for the batch ``X``."""
        scores = self.linear(X)
        return torch.sigmoid(scores)

In [31]:
class Dataset(torch.utils.data.Dataset):
    """
    Map-style dataset over a feature matrix ``x`` and a target vector ``y``.
    """
    def __init__(self, x, y):
        # Kept as given; a row is converted to a tensor only when accessed.
        self.x, self.y = x, y

    def __len__(self):
        """Number of samples, i.e. ``len(x)``."""
        return len(self.x)

    def row(self):
        """Size of the second dimension of ``x`` (values per sample)."""
        return self.x.shape[1]

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with keys "sample" and "target"."""
        features = torch.tensor(self.x[idx, :], dtype=torch.float)
        label = self.y[idx]
        return {"sample": features, "target": label}

In [32]:
X_train = Dataset(x_train.to_numpy(dtype = "float32"), y_train.to_numpy(dtype = "float32"))

In [33]:
# One output unit over all feature columns of the training frame.
n_features = x_train.shape[1]
model = LogisticRegressionTorch(n_features, 1)

# loss function: binary cross-entropy
loss = nn.BCELoss()
# optimisation algorithm: stochastic gradient descent
optimizer = optim.SGD(model.parameters(), lr=0.5)

def fit(train, epochs = 10, mini_batch = 128):
    """Train the module-level ``model`` on ``train``.

    Uses the module-level ``loss`` and ``optimizer`` objects.

    Args:
        train: dataset whose items are dicts with "sample" and "target".
        epochs: number of passes over the dataset.
        mini_batch: batch size handed to the DataLoader.
    """
    # The loader is not shuffled, so one instance can be re-iterated on
    # every epoch instead of being rebuilt.
    train_dataloader = torch.utils.data.DataLoader(train, batch_size= mini_batch)
    for epoch in range(epochs):
        for batch in train_dataloader:
            # Flatten (batch, 1) predictions to (batch,) to match the
            # targets; reshape replaces the deprecated Tensor.resize.
            y_pred = model(batch['sample']).reshape(-1)
            output = loss(y_pred, batch['target'])
            # compute gradients
            output.backward()
            # update parameters, then reset the gradients for the next batch
            optimizer.step()
            optimizer.zero_grad()

In [34]:
fit(X_train)

In [35]:
model.eval()
# Inference only: skip autograd bookkeeping for the whole test matrix.
with torch.no_grad():
    y_pred_test = model(torch.tensor(x_test.to_numpy(dtype= "float32")))

In [36]:
def binary_class_predict(y, threshold =.5):
    """Turn scores into 0/1 labels.

    Args:
        y: array-like of scores.
        threshold: scores strictly below this value map to 0, all others
            to 1.

    Returns:
        Integer array of 0/1 labels with the same shape as ``y``.
    """
    # Vectorised comparison: no per-element Python call, and empty input is
    # handled (np.vectorize raises on size-0 arrays).
    return np.where(np.asarray(y) < threshold, 0, 1)

In [37]:
# Flatten the model output to one score per row. reshape(-1) replaces the
# element-wise copy plus in-place ndarray.resize, which would silently
# truncate or zero-pad on a length mismatch.
y_pred = y_pred_test.detach().numpy().reshape(-1)
y_pred = binary_class_predict(y_pred)

In [38]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.78      0.77     27599
           1       0.80      0.79      0.80     31857

    accuracy                           0.78     59456
   macro avg       0.78      0.78      0.78     59456
weighted avg       0.78      0.78      0.78     59456



In [39]:
x_tensor = torch.tensor(x_train.to_numpy(), dtype = torch.float32)
y_tensor = torch.tensor(y_train.to_numpy(), dtype = torch.float32)

In [40]:
X_train = Dataset(x_train.to_numpy(dtype = "float32"), y_train.to_numpy(dtype = "float32"))

In [41]:
# Training
def fit(train : Dataset , epochs : int = 10, mini_batch : int = 128 , lr : float = 0.1) -> torch.Tensor :
    """Fit logistic-regression weights with hand-written gradient descent.

    Args:
        train: dataset exposing ``row()`` (number of weights to create) and
            yielding dicts with "sample" and "target".
        epochs: number of passes over the dataset.
        mini_batch: batch size handed to the DataLoader.
        lr: learning rate (a plain float).

    Returns:
        The learned weight vector, detached from the autograd graph.
    """
    weight = torch.tensor(np.random.uniform(size= train.row()) , dtype = torch.float32 , requires_grad = True)
    # The loader is not shuffled, so one instance serves every epoch.
    train_dataloader = torch.utils.data.DataLoader(train, batch_size = mini_batch)
    for epoch in range(epochs):
        for batch in train_dataloader:
            # forward (torch.sigmoid replaces the deprecated F.sigmoid)
            predictions = torch.sigmoid(batch['sample'] @ weight)
            batch_loss = torch.nn.functional.binary_cross_entropy(predictions, batch['target'])
            # backward: compute the gradient
            batch_loss.backward()
            # The update itself must not be recorded by autograd.
            with torch.no_grad():
                weight -= weight.grad * lr
                # reset the gradient for the next batch
                weight.grad.zero_()
    return weight.detach()
# Inference
def predict(X_test : torch.Tensor , weight : torch.Tensor, threshold : float = 0.5):
    """Predict 0/1 labels for the rows of ``X_test``.

    Args:
        X_test: feature tensor; ``X_test @ weight`` must be defined.
        weight: weight vector.
        threshold: rows whose sigmoid score is strictly greater than this
            value are labelled 1, all others 0.

    Returns:
        int64 tensor of 0/1 labels.
    """
    # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid.
    y_pred = torch.sigmoid(X_test @ weight)
    return (y_pred > threshold).to(torch.int64)

In [42]:
w = fit(X_train)

In [43]:
w

tensor([ 1.0613,  0.0960,  0.8682,  0.3124,  0.6723,  0.1340,  0.5835,  0.8587,
         0.4007,  0.5555,  0.7101,  0.4078,  0.5678, -0.2824,  0.1886, -0.0390,
        -0.0145,  0.2080,  0.4278, -0.0568,  0.4594,  0.4063,  0.4921, -0.3827,
         0.1378,  0.3434, -0.0067,  0.1904,  0.4964,  0.7734,  0.8745,  0.9582,
         0.7368,  0.5062,  0.5538,  0.1299,  0.8800,  0.4727,  0.3560, -0.3922,
         0.7131,  0.3319,  0.6894,  0.5797,  0.2061,  0.8306,  0.9495, -0.3023,
         0.3099,  0.3371,  0.2537,  0.4512,  0.4006,  0.2237,  0.2018,  0.4242,
         0.3668,  0.5784,  0.4903,  0.4848,  0.4464,  0.4042,  0.4067,  0.2891,
         0.2836,  0.3366,  0.2617,  0.5083,  0.1853,  0.5354,  0.5468,  0.6893,
         0.4089,  0.4417,  0.3030,  0.1010,  0.1823,  0.0734,  0.3073,  0.4385,
         0.4320,  0.5080,  0.4210,  0.2078,  0.2731,  0.1571,  0.5667,  0.1429,
         0.4142,  0.6892,  0.6664,  0.7895,  0.3177,  0.2135,  0.1898,  0.2397,
         0.1377,  0.3000,  0.4919,  0.23

In [44]:
y_pred = predict(torch.tensor(x_test.to_numpy(dtype= "float32")),w)

In [45]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.77      0.77     27599
           1       0.80      0.80      0.80     31857

    accuracy                           0.79     59456
   macro avg       0.79      0.79      0.79     59456
weighted avg       0.79      0.79      0.79     59456



# Логистическая регрессия при помощи sklearn.linear_model.LogisticRegression

In [46]:
from sklearn.linear_model import LogisticRegression

In [47]:
# Baseline: scikit-learn logistic regression with default settings, fitted on
# the training part and scored on the test part of the same split.
logreg = LogisticRegression()
logreg.fit(x_train,y_train)
y_pred = logreg.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.77      0.77      0.77     27599
           1       0.80      0.81      0.80     31857

    accuracy                           0.79     59456
   macro avg       0.79      0.79      0.79     59456
weighted avg       0.79      0.79      0.79     59456



# Метод ближайших соседей при помощи sklearn.neighbors

In [48]:
from sklearn.neighbors import KNeighborsClassifier

In [49]:
# 3-nearest-neighbours classifier on the same split.
# NOTE(review): positive rows were duplicated before the train/test split, so
# a test row may have an exact copy in the training set; the scores printed
# here are presumably optimistic for a neighbour-based model -- confirm by
# oversampling only after splitting.
Knn = KNeighborsClassifier(n_neighbors = 3)
Knn.fit(x_train, y_train)
y_pred = Knn.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.76      0.84     27599
           1       0.82      0.96      0.89     31857

    accuracy                           0.87     59456
   macro avg       0.88      0.86      0.86     59456
weighted avg       0.88      0.87      0.87     59456



# Байесовский классификатор

In [50]:
from sklearn.naive_bayes import GaussianNB