In [254]:
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tqdm import tqdm
from typing import *
from dataclasses import dataclass, asdict, field

plotly_margin = dict(l=0, r=0, t=0, b=0)

Обзор датасета

In [255]:
df_in = pd.read_csv("data.csv",
                    usecols=['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked'])
df_in

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.2500,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.9250,,S
3,1,1,female,35.0,1,0,53.1000,C123,S
4,0,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,,S
887,1,1,female,19.0,0,0,30.0000,B42,S
888,0,3,female,,1,2,23.4500,,S
889,1,1,male,26.0,0,0,30.0000,C148,C


Работа с отсутствующими

In [256]:
df_in.isna().any()

Survived    False
Pclass      False
Sex         False
Age          True
SibSp       False
Parch       False
Fare        False
Cabin        True
Embarked     True
dtype: bool

In [257]:
df_in['Age'] = df_in['Age'].fillna(df_in['Age'].median())

In [258]:
labeled_cat_cols = ['Sex', 'Cabin', 'Embarked']
cat_cols = [*labeled_cat_cols, 'Pclass', 'Parch']
for i in labeled_cat_cols:
    df_in[i] = df_in[i].fillna(df_in[i].mode()[0])
    df_in[i] = df_in[i].astype('category').cat.codes

In [259]:
df_in

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.2500,47,2
1,1,1,0,38.0,1,0,71.2833,81,0
2,1,3,0,26.0,0,0,7.9250,47,2
3,1,1,0,35.0,1,0,53.1000,55,2
4,0,3,1,35.0,0,0,8.0500,47,2
...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,47,2
887,1,1,0,19.0,0,0,30.0000,30,2
888,0,3,0,28.0,1,2,23.4500,47,2
889,1,1,1,26.0,0,0,30.0000,60,0


In [260]:
df_in.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.361582,0.523008,0.381594,32.204208,53.639731,1.536476
std,0.486592,0.836071,0.47799,13.019697,1.102743,0.806057,49.693429,23.568293,0.791503
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,7.9104,47.0,1.0
50%,0.0,3.0,1.0,28.0,0.0,0.0,14.4542,47.0,2.0
75%,1.0,3.0,1.0,35.0,1.0,0.0,31.0,47.0,2.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,146.0,2.0


Проверим автокорреляции

In [261]:
corr = df_in.drop('Survived', axis=1).corr()
corr[corr == 1] = 0
px.imshow(corr, color_continuous_scale="Picnic")

In [262]:
df_in = df_in.drop(['Fare', 'Parch'], axis=1)

Нормализуем данные

In [263]:
df_n = (df_in - df_in.min()) / (df_in.max() - df_in.min())

Разделим тренировочную и тестовую выборку

In [264]:
train_size = 0.8

df_train = df_n.sample(frac=train_size, random_state=42)
df_test = df_n.drop(df_train.index)

tgt = 'Survived'
x_train, y_train = df_train.drop(tgt, axis=1), df_train[tgt]
x_test, y_test = df_test.drop(tgt, axis=1), df_test[tgt]

print(df_train.shape, df_test.shape)

(713, 7) (178, 7)


Метрики

In [265]:
def stat(y, y_pred):
    m = [
        [np.sum((y == r) & (y_pred == p)) for r in range(2)]
        for p in range(2)
    ]  # m[pred][true]
    return dict(
        tp=m[1][1],
        tn=m[0][0],
        fp=m[1][0],
        fn=m[0][1],
        n=y.shape[0],
        mat=m
    )


def accuracy(stats: dict):
    return (stats['tp'] + stats['tn']) / stats['n']


def precision(stats: dict):
    if (q := (stats['tp'] + stats['fp'])) == 0: return 0
    return stats['tp'] / q


def recall(stats: dict):
    if (q := (stats['tp'] + stats['fn'])) == 0: return 0
    return stats['tp'] / q


def f1(stats: dict):
    p, r = precision(stats), recall(stats)
    if (q := (p + r)) == 0: return 0
    return 2 * p * r / q


def TPR(stats: dict):
    return stats['tp'] / (stats['tp'] + stats['fn'])


def FPR(stats: dict):
    return stats['fp'] / (stats['fp'] + stats['tn'])

In [266]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def log_loss(y_true, y_pred, eps=1e-11):
    y_pred = np.clip(y_pred, eps, 1 - eps)
    return np.mean(-y_true * np.log(y_pred) - (1 - y_true) * np.log(1 - y_pred))

Логистическая регрессия

In [273]:
@dataclass
class LRParams:
    rate: float = 0.01
    epochs: int = 100
    method: int = 0

    def as_np(self):
        return np.array([self.rate, self.epochs, self.method])

    @staticmethod
    def from_np(data):
        return LRParams(data[0], int(data[1]), int(data[2]))


class LogisticRegression:
    def __init__(self, config: Optional[LRParams] = None):
        self.config = config if config else LRParams()
        self.__methods = (self.__step_gd, self.__step_newton)
        self.__w = None

    def fit(self, x, y):
        step = self.__methods[self.config.method]

        x = np.hstack((np.ones((x.shape[0], 1)), x)) # bias
        self.__w = np.zeros(x.shape[1])

        train_loss_hist = []
        for _ in range(self.config.epochs):
            y_pred = sigmoid(np.dot(x, self.__w))
            step(x, y, y_pred)
            train_loss_hist.append(log_loss(y, y_pred))
                        
        return train_loss_hist

    def __step_gd(self, x, y, y_pred):
        n = x.shape[0]
        grad = np.dot(x.T, (y_pred - y)) / n    
        self.__w -= self.config.rate * grad

    def __step_newton(self, x, y, y_pred):
        n = x.shape[0]
        grad = np.dot(x.T, (y_pred - y)) / n
        hess = np.dot(x.T * y_pred * (1 - y_pred), x)
        delta = np.linalg.solve(hess, grad)
        self.__w -= delta

    def predict(self, x):
        x = np.hstack((np.ones((x.shape[0], 1)), x))
        out = sigmoid(np.dot(x, self.__w))
        return out.round().astype(int), out


In [274]:
def runner(params, gui=False):
    reg = LogisticRegression(params)
    loss_hist = reg.fit(x_train, y_train)
    if gui:
        px.line(loss_hist).show()

    y_pred, y_prob = reg.predict(x_test)
    stats = stat(y_test, y_pred)
    test_loss = log_loss(y_test, y_pred)

    return dict(
        accuracy=accuracy(stats),
        precision=precision(stats),
        recall=recall(stats),
        f1=f1(stats),
        train_loss=loss_hist[-1],
        test_loss=test_loss
    )

In [275]:
def comb(list2d):
    return np.array(np.meshgrid(*list2d)).T.reshape(-1, len(list2d))

In [276]:
var_rate = np.array([1, 0.1, 0.01, 0.001])
var_epoch = np.array([100, 500, 1000])
var_method = np.array([0, 1])
vars = comb([var_rate, var_epoch, var_method])

vars_out = []
for i in tqdm(vars):
    conf = LRParams.from_np(i)
    result = runner(conf)
    vars_out.append({**asdict(conf), **result})

results = pd.DataFrame(vars_out)
results

100%|██████████| 24/24 [00:18<00:00,  1.30it/s]


Unnamed: 0,rate,epochs,method,accuracy,precision,recall,f1,train_loss,test_loss
0,1.0,100,0,0.814607,0.75,0.738462,0.744186,0.465608,4.695721
1,1.0,500,0,0.803371,0.720588,0.753846,0.736842,0.445186,4.98031
2,1.0,1000,0,0.792135,0.705882,0.738462,0.721805,0.441738,5.2649
3,0.1,100,0,0.735955,0.95,0.292308,0.447059,0.558892,6.687845
4,0.1,500,0,0.808989,0.754098,0.707692,0.730159,0.481415,4.838016
5,0.1,1000,0,0.814607,0.75,0.738462,0.744186,0.465537,4.695721
6,0.01,100,0,0.634831,0.0,0.0,0.0,0.641314,9.249148
7,0.01,500,0,0.662921,1.0,0.076923,0.142857,0.59082,8.537675
8,0.01,1000,0,0.735955,0.95,0.292308,0.447059,0.558506,6.687845
9,0.001,100,0,0.634831,0.0,0.0,0.0,0.684946,9.249148


In [278]:
runner(LRParams(1, 100, 0), True)

{'accuracy': np.float64(0.8146067415730337),
 'precision': np.float64(0.75),
 'recall': np.float64(0.7384615384615385),
 'f1': np.float64(0.7441860465116279),
 'train_loss': np.float64(0.4656083252097935),
 'test_loss': np.float64(4.6957212777215895)}