In [3]:
from sklearn import datasets
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

# Модель логистической регрессии

### Обучаем модель и смотрим точность

In [11]:
cancer = datasets.load_breast_cancer()
logistic_regression = LogisticRegression()
model = logistic_regression.fit(cancer.data, cancer.target)
print('Accuracy: {:.2f}'.format(model.score(cancer.data, cancer.target)))

Accuracy: 0.95


### Смотрим разные качественные характеристики

In [6]:
predictions = model.predict(cancer.data)
print('Accuracy: {:.2f}'.format(metrics.accuracy_score(cancer.target, predictions)))
print('ROC AUC: {:.2f}'.format(metrics.roc_auc_score(cancer.target, predictions)))
print('F1: {:.2f}'.format(metrics.f1_score(cancer.target, predictions)))

Accuracy: 0.95
ROC AUC: 0.94
F1: 0.96


### Делим выборку и проверяем насколько переобучилась

In [7]:
from sklearn.model_selection import train_test_split

In [10]:
x_train, x_test, y_train, y_test = train_test_split(
cancer.data, cancer.target,
test_size=0.2, random_state=12
)
model = logistic_regression.fit(x_train, y_train)
print('Train accuracy: {:.2f}'.format(model.score(x_train, y_train)))
print('Test accuracy: {:.2f}'.format(model.score(x_test, y_test)))

Train accuracy: 0.95
Test accuracy: 0.93


# Применение различных регуляризаций в регрессии

In [24]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet
boston = datasets.load_boston()
lasso = Lasso() # L1
ridge = Ridge() # L2
elastic = ElasticNet() # L1 + L2
for model in [lasso, ridge, elastic]:
    x_train, x_test, y_train, y_test = train_test_split(\
    cancer.data, cancer.target,
    test_size=0.2
)
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    print(model.__class__)
    print('MSE: {:.2f}\n'.format(metrics.mean_squared_error(y_test, predictions)))

<class 'sklearn.linear_model._coordinate_descent.Lasso'>
MSE: 0.12

<class 'sklearn.linear_model._ridge.Ridge'>
MSE: 0.07

<class 'sklearn.linear_model._coordinate_descent.ElasticNet'>
MSE: 0.09



# Применение различных регуляризаций в регрессии

In [18]:
from sklearn.model_selection import KFold, cross_val_score
iris = datasets.load_iris()
iris.keys()
iris['feature_names']

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

### Делим выборку на 5 частей

In [21]:
logistic_regression = LogisticRegression()
cv = KFold(n_splits=5) # +StratifiedKFold

### Последовательно обучаем и оцениваем модель на каждом из 5 модмножеств данных

In [22]:
for split_idx, (train_idx, test_idx) in enumerate(cv.split(iris.data)):
    x_train, x_test = iris.data[train_idx], iris.data[test_idx]
    y_train, y_test = iris.target[train_idx], iris.target[test_idx]
    logistic_regression.fit(x_train, y_train)
    score = logistic_regression.score(x_test, y_test)
    print('Split {} Score: {:.2f}'.format(split_idx, score))

Split 0 Score: 1.00
Split 1 Score: 1.00
Split 2 Score: 0.87
Split 3 Score: 0.93
Split 4 Score: 0.83


### Встроенная функция обучения на разбитых данных (тоже что сверху, но не руками)

In [23]:
cv_score = cross_val_score(
    logistic_regression, iris.data, iris.target,
    scoring='accuracy'### Делим выборку на 5 частей, cv=cv
)
print('Cross val score: {}'.format(cv_score))
print('Mean cross val score: {:.2f}'.format(cv_score.mean()))

Cross val score: [1.         1.         0.86666667 0.93333333 0.83333333]
Mean cross val score: 0.93
