## 1. train_test_split

Нужно написать свою версию функции train_test_split :)

In [771]:
import random

def my_train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly split ``X`` and ``y`` into aligned train and test parts.

    Parameters
    ----------
    X, y : indexable collections of equal length
        Both must support indexing with a list of integer positions
        (NumPy arrays do).
    test_size : float, default 0.2
        Fraction of samples that goes to the test part. The test part holds
        ``int(test_size * len(X))`` samples, i.e. the product is truncated.
    random_state : int or None, default None
        Seed for a private ``random.Random`` generator, which makes the split
        reproducible. With ``None`` the module-level ``random`` generator is
        used, so a prior ``random.seed(...)`` call still controls the result.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. Train samples keep their
        original relative order; test samples come in sampled order.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, or if ``test_size`` yields a
        sample count that ``random.sample`` rejects (negative or larger than
        the population).
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            "X and y must have the same length, got {} and {}".format(
                n_samples, len(y)
            )
        )

    indices = list(range(n_samples))

    # Use the shared generator unless the caller asked for a seeded one.
    rng = random if random_state is None else random.Random(random_state)
    test_indices = rng.sample(indices, int(test_size * n_samples))

    # Set lookup keeps the train/test partition linear instead of quadratic.
    test_lookup = set(test_indices)
    train_indices = [idx for idx in indices if idx not in test_lookup]

    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    return X_train, X_test, y_train, y_test

In [772]:
# Sanity check: with 5 samples and test_size=0.4 the split must be 3 / 2.
import numpy as np

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [2, 6]])
y = np.array([1, 0, 0, 0, 1])

X_train, X_test, y_train, y_test = my_train_test_split(X, y, test_size=0.4)

# Every returned part is compared against its expected shape.
assert all(
    part.shape == expected
    for part, expected in (
        (X_train, (3, 2)),
        (X_test, (2, 2)),
        (y_train, (3,)),
        (y_test, (2,)),
    )
)


##  2. mae, mse, mape 

In [773]:
def my_mae_score(y_true, y_pred):
    """Return the mean absolute error between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Ground-truth values and predicted values; shapes must be
        broadcast-compatible.

    Returns
    -------
    numpy.float64
        Mean of ``|y_true - y_pred|`` over all elements.
    """
    # Cast to float so plain lists are accepted and unsigned-integer inputs
    # cannot wrap around when subtracted.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mae = np.mean(np.abs(y_true - y_pred))
    return mae


In [774]:
def my_mse_score(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Ground-truth values and predicted values; shapes must be
        broadcast-compatible.

    Returns
    -------
    numpy.float64
        Mean of ``(y_true - y_pred) ** 2`` over all elements.
    """
    # Cast to float so plain lists are accepted and small integer dtypes
    # cannot overflow or wrap around in the difference or its square.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mse = np.mean(np.square(y_true - y_pred))
    return mse


In [775]:
def my_mape_score(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction (not in %).

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Ground-truth values and predicted values; shapes must be
        broadcast-compatible.

    Returns
    -------
    numpy.float64
        Mean of ``|y_true - y_pred| / max(|y_true|, eps)`` where ``eps`` is
        the float64 machine epsilon.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Clamp the denominator so zero targets give a finite (possibly huge)
    # value instead of inf/nan; this is the convention used by
    # sklearn.metrics.mean_absolute_percentage_error.
    eps = np.finfo(np.float64).eps
    denominator = np.maximum(np.abs(y_true), eps)
    return np.mean(np.abs(y_true - y_pred) / denominator)

In [776]:
# Sanity check: the hand-written metrics must agree with scikit-learn.
from sklearn import metrics

y_real = np.array([1, 2, 3, 4, 6])
y_pred = np.array([1, 3, 2, 4, 5])

mae = my_mae_score(y_real, y_pred)
mse = my_mse_score(y_real, y_pred)
mape = my_mape_score(y_real, y_pred)

# Maximum tolerated absolute deviation from the reference implementation.
eps = 0.0001
assert all(
    np.abs(own_value - reference(y_real, y_pred)) < eps
    for own_value, reference in (
        (mae, metrics.mean_absolute_error),
        (mse, metrics.mean_squared_error),
        (mape, metrics.mean_absolute_percentage_error),
    )
)


## 3. confusion matrix

In [777]:
def my_confusion_matrix(y_true, y_pred):
    """Count the four outcomes of a binary classifier.

    Positions are taken from ``range(len(y_true))`` and both sequences are
    indexed with them. Label 0 is the negative class and label 1 the positive
    class; pairs containing any other label are not counted.

    Returns
    -------
    list
        ``[tn, fp, fn, tp]``.
    """
    true_neg = false_pos = false_neg = true_pos = 0

    for position in range(len(y_true)):
        actual, predicted = y_true[position], y_pred[position]

        if actual == 0:
            # Real negative: either correctly rejected or a false alarm.
            if predicted == 0:
                true_neg += 1
            elif predicted == 1:
                false_pos += 1
        elif actual == 1:
            # Real positive: either missed or correctly detected.
            if predicted == 0:
                false_neg += 1
            elif predicted == 1:
                true_pos += 1

    return [true_neg, false_pos, false_neg, true_pos]

In [778]:
# Sanity check: ten labelled samples with a known confusion matrix.
y_true = [0, 1, 0, 1, 0, 1, 0, 0, 1, 1]
y_pred = [0, 1, 1, 1, 0, 0, 1, 0, 1, 1]

tn, fp, fn, tp = my_confusion_matrix(y_true, y_pred)

# Expected counts: 3 true negatives, 2 false positives,
# 1 false negative, 4 true positives.
assert (tn, fp, fn, tp) == (3, 2, 1, 4)


## 4. Кросс-валидация

4.1 Напишите свою версию K-Fold кросс-валидации


In [779]:
def my_kfold(X, n_splits=5, random_state=None):
    """Build shuffled K-fold train/validation index splits.

    The positions ``0 .. len(X) - 1`` are shuffled and cut into ``n_splits``
    consecutive chunks. Each chunk serves once as the validation part while
    the remaining positions form the training part. When ``len(X)`` is not
    divisible by ``n_splits`` the first ``len(X) % n_splits`` folds get one
    extra sample.

    Parameters
    ----------
    X : sized collection
        Only ``len(X)`` is used.
    n_splits : int, default 5
        Number of folds; must satisfy ``2 <= n_splits <= len(X)`` so that no
        fold ends up with an empty training or validation part.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. With ``None`` the
        global NumPy generator is used, so a prior ``np.random.seed(...)``
        call still controls the shuffle.

    Returns
    -------
    list
        ``[[train_indices_1, val_indices_1], [train_indices_2, val_indices_2], ...]``
        with one pair of integer index arrays per fold.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 2 or larger than ``len(X)``.
    """
    n_samples = len(X)
    if not 2 <= n_splits <= n_samples:
        raise ValueError(
            "n_splits={} must be between 2 and the number of samples ({})".format(
                n_splits, n_samples
            )
        )

    # Use the shared generator unless the caller asked for a seeded one.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    indices = np.arange(n_samples)
    rng.shuffle(indices)

    # Spread the remainder over the first folds so sizes differ by at most 1.
    fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)
    fold_sizes[:n_samples % n_splits] += 1

    current = 0
    list_of_indices = []
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        val_indices = indices[start:stop]
        # Everything outside the validation window is used for training.
        train_indices = np.concatenate((indices[:start], indices[stop:]))
        list_of_indices.append([train_indices, val_indices])
        current = stop
    return list_of_indices

4.2 Попробуем применить нашу кросс-валидацию.

Давайте прогоним ее на какой-нибудь задаче




In [780]:
import sklearn.datasets 

# Load the diabetes dataset as a (features, targets) pair.
X, y = sklearn.datasets.load_diabetes(return_X_y=True)
# Last expression of the cell: display both shapes as the cell output.
X.shape, y.shape

((442, 10), (442,))

In [781]:
import sklearn.datasets
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
# 5-fold cross-validation of a plain linear regression on the diabetes data.
X, y = sklearn.datasets.load_diabetes(return_X_y=True)

kf = KFold(n_splits=5)

# One list of per-fold scores for every metric.
mse_scores, mae_scores, mape_scores, r2_scores = [], [], [], []

for train_index, test_index in kf.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Fit a fresh model on the training folds only.
    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Score the held-out fold.
    mse_scores += [mean_squared_error(y_test, y_pred)]
    mae_scores += [mean_absolute_error(y_test, y_pred)]
    mape_scores += [mean_absolute_percentage_error(y_test, y_pred)]
    r2_scores += [r2_score(y_test, y_pred)]

# Report every metric averaged over the folds.
print("Средний MSE:", np.mean(mse_scores))
print("Средний MAE:", np.mean(mae_scores))
print("Средний MAPE:", np.mean(mape_scores))
print("Средний R-squared:", np.mean(r2_scores))

Средний MSE: 2993.081310469331
Средний MAE: 44.27649923321497
Средний MAPE: 0.3948603157216558
Средний R-squared: 0.48231643590864215


In [786]:
import sklearn.datasets
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Baseline for comparison: ignore the features and always predict the mean
# of the training targets, evaluated with the same 5-fold scheme.
X, y = sklearn.datasets.load_diabetes(return_X_y=True)

kf = KFold(n_splits=5)
mse_scores_baseline = []

for train_index, test_index in kf.split(X):
    y_train = y[train_index]
    y_test = y[test_index]

    # Predict the mean of y_train for every object of the test fold.
    y_pred = np.full(y_test.shape[0], y_train.mean())

    mse_scores_baseline += [mean_squared_error(y_test, y_pred)]

print("Средний MSE (базовая модель):", np.mean(mse_scores_baseline))

Средний MSE (базовая модель): 5982.413413836098


## Необязательное задание — ROC-AUC

In [787]:
def my_roc_auc_score(y_true, y_pred):
    """Return the area under the ROC curve for binary labels.

    Parameters
    ----------
    y_true : array-like
        Class labels. Label ``1`` is the positive class; every other label
        is treated as negative.
    y_pred : array-like of numbers
        Scores where a larger value means "more likely positive". Must have
        the same shape as ``y_true``.

    Returns
    -------
    numpy.float64
        Trapezoidal area under the (FPR, TPR) curve. ``nan`` when ``y_true``
        contains only one class, because the curve is undefined then.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` have different shapes.
    """
    y_true = np.asarray(y_true)
    scores = np.asarray(y_pred, dtype=float)
    if y_true.shape != scores.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got {} and {}".format(
                y_true.shape, scores.shape
            )
        )

    # Count positives with the same rule that classifies each sample, so
    # label sets other than {0, 1} (e.g. {-1, 1}) are handled consistently.
    is_positive = y_true == 1
    n_positives = int(np.sum(is_positive))
    n_negatives = len(y_true) - n_positives
    if n_positives == 0 or n_negatives == 0:
        return np.float64(np.nan)

    # Walk the samples from the highest score to the lowest.
    order = np.argsort(scores, kind="mergesort")[::-1]
    scores = scores[order]
    is_positive = is_positive[order]

    # Emit one ROC point per distinct score: tied samples share a threshold
    # and must move the curve together, otherwise the result would depend on
    # the arbitrary order of the tied samples.
    last_of_run = np.flatnonzero(np.diff(scores))
    cut_points = np.concatenate((last_of_run, [len(scores) - 1]))

    true_positives = np.cumsum(is_positive)[cut_points]
    false_positives = (cut_points + 1) - true_positives

    # Prepend the (0, 0) origin of the curve.
    tpr = np.concatenate(([0], true_positives)) / n_positives
    fpr = np.concatenate(([0], false_positives)) / n_negatives

    # Trapezoidal rule written out by hand; np.trapz is deprecated in NumPy 2.
    roc_auc = np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0)

    return roc_auc

In [788]:
# Sanity check: the hand-written ROC-AUC must agree with scikit-learn.
from sklearn.metrics import roc_auc_score

y_real = np.array([1, 0, 1, 0, 1])
y_pred = np.array([0.9, 0.1, 0.8, 0.2, 0.7])

score = my_roc_auc_score(y_real, y_pred)

# Allow a small absolute deviation from the reference value.
assert np.abs(roc_auc_score(y_real, y_pred) - score) < 0.0001