# Машинное обучение. Лабораторная работа №2.
Выполнила: *Девятерикова Александра Владимировна*  
Группа: *М8О-301Б-18*

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import copy
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the wine-quality table from a CSV file located in the working directory.
data = pd.read_csv('winequality-red.csv')
# Last expression of the cell: its value (the first rows of the table) is shown as cell output.
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


## Настройка датасета

In [3]:
# Feature matrix: every column of the table except 'quality'.
# np.longdouble replaces np.float128 because the float128 alias only exists on platforms
# whose C long double is 128 bits wide (elsewhere the attribute lookup raises AttributeError);
# np.longdouble is always available and is the very same type wherever np.float128 exists.
design_matrix = data.drop(columns=['quality']).to_numpy(dtype=np.longdouble)
# Target vector: the 'quality' column. np.array creates a fresh copy, so modifying
# `target` in place does not write back into `data`.
target = np.array(data['quality'], dtype=np.longdouble)

Разделим вина на два класса по значению целевой переменной `quality`:  
Первый класс (метка 1) $-$ качество вина более 5  
Второй класс (метка 0) $-$ качество вина не более 5

In [4]:
# Percentage of wines that fall into each of the two quality classes.
n_wines = data['quality'].shape[0]
print(len(data[data['quality'] > 5]) / n_wines * 100)
print(len(data[data['quality'] <= 5]) / n_wines * 100)

# Binarise the target in place: 1 where quality is above 5, 0 everywhere else.
is_above_five = target > 5
target[is_above_five] = 1
target[~is_above_five] = 0

53.47091932457786
46.52908067542214


Нормируем входные признаки (каждый столбец делится на свой максимум) и разделим датасет на обучающую и тестовую выборки:

In [5]:
# Scale every feature column by that column's maximum value.
cnt_signs = design_matrix.shape[1]
column_maxima = design_matrix.max(axis=0)

# The scaling factors are kept in a default-dtype array of length cnt_signs.
max_values = np.zeros(cnt_signs)
max_values[:] = column_maxima

# Broadcasting divides each column by its own maximum, in place.
design_matrix /= column_maxima

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
features_train, features_test, target_train, target_test = train_test_split(
    design_matrix, target, test_size=0.2, random_state=24)

## Логистическая регрессия

In [6]:
class MyLogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    step : float
        Learning rate applied to every gradient step.
    n_iter : int
        Number of gradient-descent iterations performed by ``fit``.
    """

    def __init__(self, step=1e-1, n_iter=10000):
        self.step = step
        self.n_iter = n_iter

    def logistic_function(self, z):
        """Return the sigmoid ``1 / (1 + exp(-z))``, element-wise for arrays."""
        return 1 / (1 + np.exp(-z))

    @staticmethod
    def _add_bias_column(X):
        """Return ``X`` (2-D) with a leading column of ones for the intercept."""
        return np.hstack((np.ones((X.shape[0], 1)), X))

    def fit(self, X, Y):
        """Learn the weights from samples ``X`` (n_samples, n_features) and 0/1 labels ``Y``.

        The weights (intercept first) are stored in ``self._weights``.
        """
        Y = np.asarray(Y)
        x = self._add_bias_column(np.asarray(X))
        self._weights = np.zeros(x.shape[1])
        for _ in range(self.n_iter):
            z = np.dot(x, self._weights)
            # Gradient of the mean log-loss with respect to the weights.
            gradient = np.dot(x.T, self.logistic_function(z) - Y) / Y.size
            self._weights -= self.step * gradient

    def predict(self, X):
        """Predict 0/1 labels for one sample (1-D) or a batch of samples (2-D).

        Always returns a 1-D array with one label per sample, so a single
        sample yields an array of length 1.
        """
        # atleast_2d turns a single sample into a one-row batch and leaves a
        # batch untouched, so both call styles share the same code path.
        x = self._add_bias_column(np.atleast_2d(X))
        return self.logistic_function(np.dot(x, self._weights)).round()

    def score(self, X, Y):
        """Return the fraction of samples in ``X`` whose prediction equals ``Y``."""
        Y = np.asarray(Y)
        hits = np.count_nonzero(self.predict(X) == Y)
        return hits / Y.shape[0]

### Проверка точности и сравнение с реализацией sklearn

In [7]:
# Train the hand-written logistic regression (default step and iteration count)
# on the training split.
mlg = MyLogisticRegression()
mlg.fit(features_train, target_train)

In [8]:
# Accuracy, in percent, of the hand-written logistic regression on both splits.
mlg_train_accuracy = mlg.score(features_train, target_train) * 100
mlg_test_accuracy = mlg.score(features_test, target_test) * 100

print('Результат собственной реализации логистической регресси на обучающей выборке: {}%'
      .format(mlg_train_accuracy))
print('Результат собственной реализации логистической регрессии на тестовой выборке: {}%'
      .format(mlg_test_accuracy))

Результат собственной реализации логистической регресси на обучающей выборке: 74.90226739640345%
Результат собственной реализации логистической регрессии на тестовой выборке: 72.1875%


In [9]:
# Reference model: scikit-learn's LogisticRegression with default settings.
sklg = LogisticRegression()
# Last expression of the cell: fit's return value is rendered as the cell output.
sklg.fit(features_train, target_train)

LogisticRegression()

In [10]:
# Accuracy, in percent, of the scikit-learn logistic regression on the test split.
sklg_test_accuracy = sklg.score(features_test, target_test) * 100
print('Результат sklearn реализации логистической регрессии: {}%'
      .format(sklg_test_accuracy))

Результат sklearn реализации логистической регрессии: 72.1875%


##  Дерево решений

In [11]:
class Tree:
    """Binary classification tree grown greedily with the Gini criterion.

    Training data is a 2-D NumPy array in which the last column holds the
    class label and every other column holds a feature.

    Parameters
    ----------
    max_depth : int
        Depth at which both children of a node are forced to be leaves.
    min_size : int
        A partition with at most this many rows becomes a leaf.
    """

    class Node:
        """Tree node: either a leaf holding ``value`` or a test ``row[feature_index] < split_value``."""

        def __init__(self, isLeaf=False, feature_index=None, split_value=None, sons=None, value=None):
            self.isLeaf = isLeaf
            if not isLeaf:
                self.feature_index = feature_index
                self.split_value = split_value
                # (left_rows, right_rows) produced by the split; consumed and
                # cleared by Tree.recursive_split.
                self.sons = sons
                self.left = None
                self.right = None
            else:
                self.value = value

    def __init__(self, max_depth, min_size):
        self.max_depth = max_depth
        self.min_size = min_size

    def make_tree(self, XY):
        """Grow the tree from ``XY`` and store its root in ``self.root``.

        Raises
        ------
        ValueError
            If ``XY`` contains no rows (no split can be chosen).
        """
        if len(XY) == 0:
            raise ValueError('cannot build a decision tree from an empty training set')
        root = self.split(XY)
        self.recursive_split(root, 1)
        self.root = root

    def split(self, XY):
        """Return an inner node holding the lowest-Gini split of ``XY``.

        Every distinct value of every feature column is tried as a threshold;
        ties keep the first candidate found (features in column order, values
        in row order).
        """
        unique_targets = list(set(XY[:, -1]))
        feature_index, split_value, subtrees = None, None, None
        min_gini_id = float('inf')

        for i in range(XY.shape[1] - 1):
            # A repeated threshold yields the same partition and the same Gini
            # score, which can never be strictly below the current minimum, so
            # skipping it does not change the chosen split.
            tried = set()
            for value in XY[:, i]:
                if value in tried:
                    continue
                tried.add(value)
                s_trees = self.__get_subtrees(i, value, XY)
                gini_id = self.__gini_index(s_trees, unique_targets)
                if gini_id < min_gini_id:
                    feature_index, split_value, subtrees = i, value, s_trees
                    min_gini_id = gini_id

        return Tree.Node(feature_index=feature_index, split_value=split_value, sons=subtrees)

    def __get_subtrees(self, index, value, XY):
        """Partition ``XY`` into rows with ``row[index] < value`` and all remaining rows."""
        goes_left = XY[:, index] < value
        return XY[goes_left], XY[~goes_left]

    def __gini_index(self, subtrees, unique_targets):
        """Return the size-weighted Gini impurity of the partitions in ``subtrees``."""
        n_instances_in_subtrees = sum(len(subtree) for subtree in subtrees)
        result = 0.
        for subtree in subtrees:
            n_instances_in_subtree = len(subtree)
            if n_instances_in_subtree == 0:
                continue
            # Extract the label column once instead of once per target.
            labels = subtree[:, -1]
            score = 0.
            for target in unique_targets:
                probability = np.count_nonzero(labels == target) / n_instances_in_subtree
                score += probability ** 2
            result += (1. - score) * (n_instances_in_subtree / n_instances_in_subtrees)
        return result

    def recursive_split(self, node, depth):
        """Attach children to ``node`` (located at ``depth``), recursing until a stop rule fires."""
        left, right = node.sons
        node.sons = None
        # Degenerate split: all rows went to one side, so both branches share one leaf.
        if len(left) == 0 or len(right) == 0:
            node.left = node.right = self.make_leaf(left.tolist() + right.tolist())
            return
        if depth >= self.max_depth:
            node.left, node.right = self.make_leaf(left), self.make_leaf(right)
            return

        if len(left) <= self.min_size:
            node.left = self.make_leaf(left)
        else:
            node.left = self.split(left)
            self.recursive_split(node.left, depth + 1)

        if len(right) <= self.min_size:
            node.right = self.make_leaf(right)
        else:
            node.right = self.split(right)
            self.recursive_split(node.right, depth + 1)

    def make_leaf(self, subtree):
        """Return a leaf predicting the most frequent label among the rows of ``subtree``."""
        targets = [row[-1] for row in subtree]
        # On a tie, max keeps the label that appears first in the rows.
        return Tree.Node(isLeaf=True, value=max(targets, key=targets.count))
                

class MyDecisionTreeClassifier:
    """Classifier wrapper around ``Tree`` that predicts one sample at a time."""

    def __init__(self, max_depth=7, min_size=3):
        self.tree = Tree(max_depth, min_size)

    def fit(self, X, Y):
        """Grow the tree from features ``X`` and labels ``Y``."""
        # The tree expects the label as the last column of each row.
        labelled_rows = np.column_stack((X, Y))
        self.tree.make_tree(labelled_rows)

    def predict(self, x):
        """Return the label predicted for the single feature row ``x``."""
        return self.__predict_help(self.tree.root, x)

    def __predict_help(self, node, row):
        """Walk down from ``node`` following the split tests until a leaf is reached."""
        current = node
        while True:
            if row[current.feature_index] < current.split_value:
                branch = current.left
            else:
                branch = current.right
            if branch.isLeaf:
                return branch.value
            current = branch

    def score(self, X, Y):
        """Return the fraction of rows of ``X`` whose prediction equals the label in ``Y``."""
        total = Y.shape[0]
        hits = sum(1 for i in range(total) if self.predict(X[i]) == Y[i])
        return hits / total

### Проверка точности и сравнение с реализацией sklearn

In [12]:
# Train the hand-written decision tree with its default depth and leaf-size limits.
mdt = MyDecisionTreeClassifier()
mdt.fit(features_train, target_train)

# Accuracy, in percent, on both splits.
mdt_train_accuracy = mdt.score(features_train, target_train) * 100
mdt_test_accuracy = mdt.score(features_test, target_test) * 100

print('Результат собственной реализации дерева решений на обучающей выборке: {}%'
      .format(mdt_train_accuracy))
print('Результат собственной реализации дерева решений на тестовой выборке: {}%'
      .format(mdt_test_accuracy))

Результат собственной реализации дерева решений на обучающей выборке: 85.77013291634088%
Результат собственной реализации дерева решений на тестовой выборке: 71.25%


In [13]:
# Reference model: scikit-learn's decision tree limited to the same depth (7).
dt = DecisionTreeClassifier(max_depth=7)
dt.fit(features_train, target_train)

# Accuracy, in percent, on the test split.
dt_test_accuracy = dt.score(features_test, target_test) * 100
print('Результат sklearn реализации дерева решений: {}%'
      .format(dt_test_accuracy))

Результат sklearn реализации дерева решений: 70.3125%


## Random forest

In [14]:
from sklearn.utils import resample
import random
from collections import Counter

class MyRandomForest:
    """Random forest built from ``MyDecisionTreeClassifier`` trees.

    Every tree is trained on a bootstrap sample of the rows and on a random
    subset of ``int(sqrt(n_features))`` feature columns; predictions are made
    by majority vote.

    Parameters
    ----------
    max_depth : int
        Depth limit passed to every tree.
    n_classifiers : int
        Number of trees in the ensemble.
    """

    def __init__(self, max_depth=7, n_classifiers=30):
        self.max_depth = max_depth
        self.n_classifiers = n_classifiers
        self.classifiers = []
        # features_indexes[i] lists the columns that classifiers[i] was trained on.
        self.features_indexes = []
        self.x, self.y = None, None

    def fit(self, X, Y):
        """Train ``n_classifiers`` trees on features ``X`` and labels ``Y``."""
        self.x, self.y = X, Y
        # Start from an empty ensemble: without this, a second call to fit
        # would keep the previously trained trees and vote with them as well.
        self.classifiers = []
        self.features_indexes = []

        n_features = self.x.shape[1]
        n_select_features = int(np.sqrt(n_features))
        for _ in range(self.n_classifiers):
            # Bootstrap sample with as many rows as the training set.
            x, y = resample(self.x, self.y, n_samples=len(self.y))
            # Distinct feature columns drawn uniformly at random.
            chosen = random.sample(range(n_features), n_select_features)
            tree = MyDecisionTreeClassifier(max_depth=self.max_depth)
            tree.fit(x[:, np.array(chosen)], y)
            self.features_indexes.append(chosen)
            self.classifiers.append(tree)

    def predict(self, X):
        """Return the majority-vote label for the single feature row ``X``.

        Each tree only sees the columns it was trained on. On a tied vote the
        label that was voted first wins.
        """
        votes = [classifier.predict(X[np.array(indexes)])
                 for classifier, indexes in zip(self.classifiers, self.features_indexes)]
        return Counter(votes).most_common(1)[0][0]

    def score(self, X, Y):
        """Return the fraction of rows of ``X`` whose prediction equals the label in ``Y``."""
        total = Y.shape[0]
        hits = sum(1 for i in range(total) if self.predict(X[i]) == Y[i])
        return hits / total

In [15]:
# Train the hand-written random forest: 15 trees, each limited to depth 5.
mrf = MyRandomForest(max_depth=5, n_classifiers=15)
mrf.fit(features_train, target_train)

# Accuracy, in percent, on both splits.
mrf_train_accuracy = mrf.score(features_train, target_train) * 100
mrf_test_accuracy = mrf.score(features_test, target_test) * 100

print('Результат собственной реализации random forest на обучающей выборке: {}%'
      .format(mrf_train_accuracy))
print('Результат собственной реализации random forest на тестовой выборке: {}%'
      .format(mrf_test_accuracy))

Результат собственной реализации random forest на обучающей выборке: 80.68803752931977%
Результат собственной реализации random forest на тестовой выборке: 72.5%


In [16]:
# Reference model: scikit-learn's random forest with the same depth and tree count.
# NOTE: this rebinds the name `dt`, which held the scikit-learn decision tree before.
dt = RandomForestClassifier(max_depth=5, n_estimators=15)
dt.fit(features_train, target_train)

# Accuracy, in percent, on the test split.
sk_forest_test_accuracy = dt.score(features_test, target_test) * 100
print('Результат sklearn реализации random forest: {}%'
      .format(sk_forest_test_accuracy))

Результат sklearn реализации random forest: 74.375%
