# Импорт данных

In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets
import math
from sklearn.model_selection import train_test_split

In [2]:
# Load the Iris dataset bundled with scikit-learn; later cells read its
# 'data', 'target', 'feature_names' and target_names fields.
iris = datasets.load_iris()

In [3]:
# Put features and labels in one table. np.c_ appends the label vector as a
# fifth column of the feature matrix, so 'target' is stored as float
# (0.0 / 1.0 / 2.0), as the preview below shows.
df = pd.DataFrame(
    np.c_[iris.data, iris.target],
    columns=[*iris.feature_names, 'target'],
)

In [4]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [5]:
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [6]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [7]:
# Keep only the rows whose label is not 0 (the first entry of
# iris.target_names, 'setosa'), which leaves a two-class problem with
# labels 1.0 and 2.0.
df = df.loc[df['target'] != 0, :]
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
50,7.0,3.2,4.7,1.4,1.0
51,6.4,3.2,4.5,1.5,1.0
52,6.9,3.1,4.9,1.5,1.0
53,5.5,2.3,4.0,1.3,1.0
54,6.5,2.8,4.6,1.5,1.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0


In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
# Fit an encoder on the remaining labels (1.0 and 2.0) so they can be
# re-coded as consecutive integers starting at 0 for the binary model.
le = LabelEncoder()
le.fit(df['target'])

LabelEncoder()

In [10]:
# Encoded labels as a Series. Note that it gets a fresh 0-based index, not the
# 50..149 index of `df`; this is harmless here because both X and y are
# converted to plain NumPy arrays before being used together.
y = pd.Series(data=le.transform(df['target']))
y.head()

0    0
1    0
2    0
3    0
4    0
dtype: int64

In [11]:
# Feature table: every column except the label.
X = df.drop(columns='target')

In [12]:
# Convert the feature table to a plain ndarray for the hand-written model.
# np.asarray accepts both a DataFrame and an ndarray, so re-running this cell
# is safe (X.to_numpy() raises AttributeError once X is already an ndarray).
X = np.asarray(X)

In [13]:
# Convert the labels to a plain ndarray. np.asarray accepts both a Series and
# an ndarray, so re-running this cell is safe (y.to_numpy() raises
# AttributeError once y is already an ndarray).
y = np.asarray(y)

In [14]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Реализация логистической регрессии вместе с градиентным спуском

In [15]:
def log_loss(y_true, y_pred, eps=1e-15):
    """Return the mean binary cross-entropy between labels and probabilities.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 or 1), samples along axis 0.
    y_pred : array-like
        Predicted probabilities of class 1, same shape as ``y_true``.
    eps : float, default 1e-15
        Probabilities are clipped to ``[eps, 1 - eps]`` before taking the
        logarithm, so predictions of exactly 0 or 1 yield a large finite loss
        instead of ``nan``/``inf``. Values already inside that interval are
        left untouched.

    Returns
    -------
    The loss summed over axis 0 and divided by ``len(y_true)``.
    """
    y_true = np.asarray(y_true, dtype=float)
    # Clipping avoids log(0): without it a "perfect" prediction produces
    # 0 * log(0) = nan and a confidently wrong one produces inf.
    y_pred = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
    per_sample = y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
    return -np.sum(per_sample, axis=0) / len(y_true)



In [16]:
class LogReg:
    """Binary logistic regression fitted with full-batch gradient descent.

    The model is ``p(y = 1 | x) = sigmoid(x @ w + b)`` where ``w`` has shape
    ``(n_features, 1)`` and ``b`` is a scalar bias.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
        Sample data. Only ``x.shape[1]`` is used (to size the weights); the
        array itself is kept on ``self.x``.
    random_state : int or None, default None
        Seed for the weight initialisation. With ``None`` the weights are drawn
        from NumPy's global random state, so results vary between runs unless
        that state is seeded by the caller.
    """

    def __init__(self, x, random_state=None):
        self.x = x
        # Number of features; fixes the shape of the weight vector and is used
        # to validate inputs later on.
        self.n = x.shape[1]
        # Initial parameters: tiny random values close to zero.
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        self.w = rng.randn(self.n, 1) * 0.00001
        self.b = rng.randn() * 0.00001

    def sigmoid(self, Z):
        """Return ``1 / (1 + exp(-Z))`` element-wise without overflowing.

        ``logaddexp(0, -Z)`` equals ``log(1 + exp(-Z))`` but is evaluated in a
        numerically stable way, so very negative ``Z`` gives 0.0 instead of an
        overflow warning from ``exp``.
        """
        return np.exp(-np.logaddexp(0.0, np.negative(Z)))

    def _as_samples(self, X):
        """Return ``X`` as a float array of shape (n_samples, n_features)."""
        X = np.asarray(X, dtype=float)
        if X.ndim == 0:
            raise ValueError("X must contain at least one dimension of samples")
        if X.size == 0:
            return X.reshape(0, self.n)
        X = X.reshape(X.shape[0], -1)
        if X.shape[1] != self.n:
            raise ValueError(
                f"expected {self.n} features per sample, got {X.shape[1]}"
            )
        return X

    def fit(self, X, y, lr, iters):
        """Train the model with full-batch gradient descent on the log-loss.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples.
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Binary labels (0 or 1).
        lr : float
            Learning rate (step size).
        iters : int
            Number of gradient steps.

        Raises
        ------
        ValueError
            If ``X`` is empty, has the wrong number of features, or ``y`` does
            not have one label per sample.
        """
        X = self._as_samples(X)
        # Column vector so that it lines up with the (n_samples, 1) activations.
        y = np.asarray(y, dtype=float).reshape(-1, 1)
        m = X.shape[0]
        if m == 0:
            raise ValueError("cannot fit on an empty training set")
        if y.shape[0] != m:
            raise ValueError(
                f"X has {m} samples but y has {y.shape[0]} labels"
            )

        for _ in range(iters):
            # Gradient of the mean log-loss: X^T (a - y) / m for the weights,
            # mean(a - y) for the bias.
            error = self.sigmoid(X @ self.w + self.b) - y
            dw = X.T @ error / m
            db = error.sum() / m

            self.w = self.w - lr * dw
            self.b = self.b - lr * db

    def predict(self, X):
        """Return the predicted probability of class 1 for each sample.

        The result is a 1-D array of shape (n_samples,) holding probabilities,
        not hard labels; threshold it (e.g. at 0.5) to obtain classes.
        """
        X = self._as_samples(X)
        return self.sigmoid(X @ self.w + self.b).ravel()

    def accuracy(self, X, y):
        """Return the fraction of samples classified correctly at threshold 0.5."""
        pred = self.predict(X)
        # Flatten so a column-vector y is compared element-wise instead of
        # being broadcast against pred into an (m, m) matrix.
        y = np.asarray(y).ravel()
        return np.sum((pred >= 0.5) == y) / len(pred)
            
        
        
        

Реализуем обучение модели и посчитаем точность

In [17]:
# Train the hand-written model: learning rate 0.04, 1000 gradient steps over
# the whole training set. The initial weights are random and no seed is set,
# so the accuracy below may differ slightly between runs.
LR = LogReg(X_train)
LR.fit(X_train, y_train, lr = 0.04, iters = 1000)

In [18]:
LR.accuracy(X_test, y_test)

0.9

Проверим реализацию градиентного спуска: передадим функции fit другие гиперпараметры — меньший шаг обучения lr и большее число итераций

In [19]:
# Second run with a 40x smaller learning rate (0.001) and twice as many
# iterations (2000), to see how the hyperparameters affect the result.
LR2 = LogReg(X_train)
LR2.fit(X_train, y_train, lr = 0.001, iters = 2000)

In [20]:
LR2.accuracy(X_test, y_test)

0.55

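Точность изменилась (0.55 вместо 0.9), то есть результат обучения зависит от шага lr и числа итераций: при lr = 0.001 модель за 2000 итераций, по-видимому, ещё не успевает сойтись. Само по себе изменение точности ещё не доказывает корректность градиентного спуска — надёжнее проверить, что log_loss на обучающей выборке убывает от итерации к итерации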

## Scikit-learn

А теперь можно сравнить нашу модель с моделью библиотеки scikit-learn

In [21]:
# Baseline: scikit-learn's LogisticRegression with its default settings,
# fitted on the same train/test split as the hand-written model.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
# Predictions for the test set; the next cell calls logreg.score() on X_test
# directly, so this variable is not used further in the cells shown here.
predictions = logreg.predict(X_test)

In [22]:
logreg.score(X_test, y_test)

0.95

К сожалению, наша модель получилась немного менее точной (0.9 против 0.95; на тестовой выборке из 20 объектов это разница в один объект), однако она работает, что уже хорошо