# Машинное обучение. ЛР №2
### Журавлёв Константин, М8О-408Б-17

Импорт всех нужных библиотек

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from collections import Counter

##### Собственные классы для различных классификаторов.

In [2]:
class LogReg:
    def __init__(self, lr=0.05, iters=5000):
        self.learningRate = lr
        self.iterations = iters
    
    def intercept(self, X):
        return np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
    
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def lossFunction(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
    
    def fit(self, X, y):
        X = self.intercept(X)
        self.w = np.zeros(X.shape[1])
        
        for i in range(self.iterations):
            z = np.dot(X, self.w)
            h = self.sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / y.size
            self.w -= self.learningRate * gradient
            
        z = np.dot(X, self.w)
        h = self.sigmoid(z)
        loss = self.lossFunction(h, y)
            
    
    def predict_probability(self, X):
        X = self.intercept(X)
        return self.sigmoid(np.dot(X, self.w))
    
    def predict(self, X):
        return self.predict_probability(X).round()
    
    def __str__(self):
        return "LogReg.\n\t" + "LearningRate = " + str(self.learningRate)

In [3]:
class KNN:
    def fit(self, X_train, Y_train):
        self.X_train = X_train
        self.Y_train = Y_train

    def euclideanNorm(self, row, column):
        temp = (row-column)
        return np.sqrt((temp**2).sum())
        
    def predict(self, X_test):
        predictions = []
        for row in X_test:
            index = self.closest(row)
            predictions.append(index)
        return predictions

    def closest(self, row):
        bestDist = self.euclideanNorm(row, self.X_train[0])
        bestIndex = 0
        for i in range(1, len(self.X_train)):
            dist = self.euclideanNorm(row, self.X_train[i])
            if dist < bestDist:
                bestDist = dist
                bestIndex = i
        return self.Y_train[bestIndex]
    def __str__(self):
        return "KNN"

In [4]:
class Node():
    def __init__(self, pclass):
        self.predicted = pclass
        self.featureIdx = 0
        self.border = 0
        self.left = None
        self.right = None

class DecisionTree():
    def __init__(self, maxDepth = 1, rf = False):
        self.maxDepth = maxDepth
        self.rf = rf

    def fit(self, X, y):
        self.nClasses = len(set(y))
        self.features = X.shape[1]
        self.tree = self.UpdateTree(X, y)
    
    def Split(self, X, y):
        m = y.size    
        if m <= 1:
            return None, None
        parent = [np.sum(y == c) for c in range(self.nClasses)]
        bGini = 1.0 - sum((n / m) ** 2 for n in parent)
        bIdx, bThr = None, None
        for idx in range(self.features):
            borders, types = zip(*sorted(zip(X[:, idx], y)))
            left = [0] * self.nClasses
            right = parent.copy()
            for i in range(1, m):
                c = types[i - 1]
                right[c] -= 1
                left[c] += 1
                giniLeft = 1.0 - sum((left[x] / i) ** 2 for x in range(self.nClasses))
                giniRight = 1.0 - sum((right[x] / (m - i)) ** 2 for x in range(self.nClasses))
                gini = (i * giniLeft + (m - i) * giniRight) / m
                if borders[i] == borders[i - 1]:
                    continue
                if gini < bGini:
                    bGini = gini
                    bIdx = idx
                    bThr = (borders[i] + borders[i - 1]) / 2
        return bIdx, bThr

    def UpdateTree(self, X, y, depth = 0):
        samplesPerClass = [np.sum(y == i) for i in range(self.nClasses)]
        predType = np.argmax(samplesPerClass)
        node = Node(pclass = predType)
        if depth < self.maxDepth:      
            idx, thr = self.Split(X, y)
            if idx is not None:
                leftIdx = X[:, idx] < thr
                lx, ly = X[leftIdx], y[leftIdx]
                rx, ry = X[~leftIdx], y[~leftIdx]
                node.featureIdx = idx
                node.border = thr
                node.left = self.UpdateTree(lx, ly, depth + 1)
                node.right = self.UpdateTree(rx, ry, depth + 1)
        return node
    
    def predict(self, X):
        return [self.predictForSingleInput(Input) for Input in X]
    
    def predictForSingleInput(self, X):
        node = self.tree
        while node.left:
            if X[node.featureIdx] < node.border:
                node = node.left
            else:
                node = node.right
        return node.predicted    
    
    def __str__(self):
        return "DecisionTree.\n\tMaxDepth = " + str(self.maxDepth)

In [5]:
class RandomForest():
    def __init__(self, max_depth=5, n_estimators=100, max_features=None):
        self.maxDepth = max_depth
        self.maxFeatures = max_features
        self.nEstimators = n_estimators
        self.forest = [None] * n_estimators

    def fit(self, X, y):
        for i in range(self.nEstimators):
            self.forest[i] = DecisionTree(
                self.maxDepth, rf=True) 
            self.forest[i].fit(X, y)


    def predict(self, X):
        mostCommon = np.zeros(X.shape[0])
        preds = np.zeros((self.nEstimators, X.shape[0]))
        for i in range(self.nEstimators):
            preds[i] = self.forest[i].predict(X)
        for i in range(len(mostCommon)):
            mostCommon[i] = Counter(preds[:, i]).most_common(1)[0][0]
        return mostCommon.astype(int)
    
    def __str__(self):
        return "RandomForest.\n\tMaxDepth = " + str(self.maxDepth) + "\n\tEstimators = " + str(self.nEstimators) 

##### Создадим новую функцию для аггрегации метрик обучения двух методов.

In [6]:
import time
def CalcMetrics(Method1,Method2,X_train,Y_train,X_test,Y_test):
    s11 = time.clock()
    Method1.fit(X_train.values,Y_train.values)
    e11 = time.clock()
    s21 = time.clock()
    Method2.fit(X_train.values,Y_train.values)
    e21 = time.clock()
    
    s12 = time.clock()
    Y_pred1 = Method1.predict(X_test.values)
    Y_tr1 = Method1.predict(X_train.values)
    e12 = time.clock()
    s22 = time.clock()
    Y_pred2 = Method2.predict(X_test.values)
    Y_tr2 = Method2.predict(X_train.values)
    e22 = time.clock()
    
    
    precision1= precision_score(Y_pred1, Y_test, average = 'macro')
    recall1= recall_score(Y_pred1, Y_test, average = 'macro')
    trainAc1= accuracy_score(Y_tr1, Y_train)
    testAc1= accuracy_score(Y_pred1, Y_test)
    
    precision2= precision_score(Y_pred2, Y_test, average = 'macro')
    recall2= recall_score(Y_pred2, Y_test, average = 'macro')
    trainAc2= accuracy_score(Y_tr2, Y_train)
    testAc2= accuracy_score(Y_pred2, Y_test)
    
    print(Method1)
    print('Fit time = ', e11-s11)
    print('Prediction time = ', e12-s12)
    print("precision:", precision1)
    print("recall:", recall1)
    print("train_accuracy:", trainAc1)
    print("test_accuracy:", testAc1)
    
    print(Method2)
    print('Fit time = ', e21-s21)
    print('Prediction time = ', e22-s22)
    print("precision:", precision2)
    print("recall:", recall2)
    print("train_accuracy:", trainAc2)
    print("test_accuracy:", testAc2)

## Тестирование

### Шахматы

Начинаем тестирование на первом датасете - шахматы.
Из-за того, что классифицируем мы только на 2 класса, необходимо убрать из данных все вхождения партий с ничейным исходом, перенумеровать второй исход, а также последний столбец с объектным содержимим.

In [7]:
df = pd.read_csv('Chess_ready.csv')
df.head()
df.drop('increment_code',axis=1,inplace = True)
for i in range(df.index.size):
    if df.at[i,'winner']==1:
        df.drop([i],axis=0,inplace=True)
        continue
    if df.at[i,'winner']==2:
        df.at[i,'winner'] = 1
#print(df)
#df.reindex()

Создаём выборки для обучения и тестирования. Первая - полная, вторая - ограничена 1000 строками, т.к. некоторые методы работают слишком долго.
Примечание. Из-за того, что пришлось удалять строки из датасета, возможно получить ошибку out_of_bounds при обучении дерева решения и случайного леса на малой выборке.

In [None]:
train_set, test_set = train_test_split(df, test_size=0.2)
X_train = train_set.drop('winner', axis=1).astype('float32')
X_test = test_set.drop('winner', axis=1).astype('float32')
Y_train = train_set.winner
Y_test = test_set.winner

In [8]:
train_set, test_set = train_test_split(df[df.index <= 1000], test_size=0.3)
X_train = train_set.drop('winner', axis=1)
X_test = test_set.drop('winner', axis=1)
Y_train = train_set.winner
Y_test = test_set.winner

In [9]:
%%time
CalcMetrics(LogReg(lr=0.2),LogisticRegression(),X_train,Y_train,X_test,Y_test)

LogReg.
	LearningRate = 0.2
Fit time =  0.4972533460000008
Prediction time =  0.0005038219999988769
precision: 0.5
recall: 0.2719298245614035
train_accuracy: 0.5045317220543807
test_accuracy: 0.543859649122807
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
Fit time =  0.001524572999997531
Prediction time =  0.0004163370000043187
precision: 0.5
recall: 0.2719298245614035
train_accuracy: 0.5045317220543807
test_accuracy: 0.543859649122807
Wall time: 512 ms


In [10]:
%%time
CalcMetrics(KNN(),KNeighborsClassifier(n_neighbors=1),X_train,Y_train,X_test,Y_test)

KNN
Fit time =  0.00030234999999834145
Prediction time =  9.019446428999998
precision: 0.5254342431761787
recall: 0.52525376958707
train_accuracy: 1.0
test_accuracy: 0.5263157894736842
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')
Fit time =  0.0016835850000020969
Prediction time =  0.04968597999999247
precision: 0.5254342431761787
recall: 0.52525376958707
train_accuracy: 1.0
test_accuracy: 0.5263157894736842
Wall time: 9.08 s


In [11]:
%%time
CalcMetrics(DecisionTree(maxDepth = 5),DecisionTreeClassifier(max_depth=5),X_train,Y_train,X_test,Y_test)

DecisionTree.
	MaxDepth = 5
Fit time =  0.6388217869999977
Prediction time =  0.003912596000006374
precision: 0.5764267990074441
recall: 0.5776053215077606
train_accuracy: 0.6918429003021148
test_accuracy: 0.5824561403508772
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
Fit time =  0.0058164599999912525
Prediction time =  0.0010774599999905377
precision: 0.5887096774193548
recall: 0.5914322250639386
train_accuracy: 0.6918429003021148
test_accuracy: 0.5964912280701754
Wall time: 660 ms


In [12]:
%%time
CalcMetrics(RandomForest(),RandomForestClassifier(n_estimators=100, max_depth=5),X_train,Y_train,X_test,Y_test)

RandomForest.
	MaxDepth = 5
	Estimators = 100
Fit time =  61.45304100999999
Prediction time =  0.3098465480000243
precision: 0.5764267990074441
recall: 0.5776053215077606
train_accuracy: 0.6918429003021148
test_accuracy: 0.5824561403508772
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Fit time =  0.20463504400001398
Prediction time =  0.027835570999997117
precision: 0.6208436724565757
recall: 0.6256709331131296
train_accuracy: 0.797583081570997
test_accuracy: 0.6105263157894737
Wall time: 1min 2s


#### Выводы по первому датасету

Можно видеть, что написанные вручную методы по точности предсказаний не уступают методам из sklearn. Но вот временная эффективность на самой затратной для данного метода итерации (обучения или предсказания) отличается примерно в 100 (!!!) раз. Общий же для обоих типов сетей результат точности не выше 60% можно объяснить независимостью данных, и искомого признака - победы белых или чёрных. Это было отлично видно на тепловой карте в 1ой лабораторной, поэтому тут удивляться нечему.

### LoL

In [14]:
df2 = pd.read_csv('LoL_ready.csv')

In [None]:
train_set, test_set = train_test_split(df2, test_size=0.3)
X_train = train_set.drop('blueWins', axis=1)
X_test = test_set.drop('blueWins', axis=1)
Y_train = train_set.blueWins
Y_test = test_set.blueWins

In [15]:
train_set, test_set = train_test_split(df2[df2.index <= 1000], test_size=0.3)
X_train = train_set.drop('blueWins', axis=1)
X_test = test_set.drop('blueWins', axis=1)
Y_train = train_set.blueWins
Y_test = test_set.blueWins

In [16]:
%%time
CalcMetrics(LogReg(lr=0.2),LogisticRegression(),X_train,Y_train,X_test,Y_test)

LogReg.
	LearningRate = 0.2
Fit time =  0.5738362170000073
Prediction time =  0.0006577040000195211
precision: 0.5
recall: 0.26245847176079734
train_accuracy: 0.4785714285714286
test_accuracy: 0.5249169435215947
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
Fit time =  0.0023509770000202934
Prediction time =  0.0005411519999825032
precision: 0.5
recall: 0.23754152823920266
train_accuracy: 0.5214285714285715
test_accuracy: 0.4750830564784053
Wall time: 590 ms


In [17]:
%%time
CalcMetrics(KNN(),KNeighborsClassifier(n_neighbors=1),X_train,Y_train,X_test,Y_test)

KNN
Fit time =  0.000292661000003136
Prediction time =  9.995464465999987
precision: 0.5267327609099761
recall: 0.5267730496453901
train_accuracy: 1.0
test_accuracy: 0.5282392026578073
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')
Fit time =  0.0019691210000019055
Prediction time =  0.05165168299998868
precision: 0.5267327609099761
recall: 0.5267730496453901
train_accuracy: 1.0
test_accuracy: 0.5282392026578073
Wall time: 10.1 s


In [18]:
%%time
CalcMetrics(DecisionTree(maxDepth = 5),DecisionTreeClassifier(max_depth=5),X_train,Y_train,X_test,Y_test)

DecisionTree.
	MaxDepth = 5
Fit time =  1.6168206300000065
Prediction time =  0.003459497999983796
precision: 0.6506152075772329
recall: 0.6516488413547237
train_accuracy: 0.83
test_accuracy: 0.6478405315614618
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
Fit time =  0.009790038999994977
Prediction time =  0.00099624399999243
precision: 0.6474506506152076
recall: 0.648661311914324
train_accuracy: 0.83
test_accuracy: 0.6445182724252492
Wall time: 1.64 s


In [19]:
%%time
CalcMetrics(RandomForest(),RandomForestClassifier(n_estimators=100, max_depth=5),X_train,Y_train,X_test,Y_test)

RandomForest.
	MaxDepth = 5
	Estimators = 100
Fit time =  163.89184136800003
Prediction time =  0.380558528999984
precision: 0.6506152075772329
recall: 0.6516488413547237
train_accuracy: 0.83
test_accuracy: 0.6478405315614618
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Fit time =  0.22610701799999333
Prediction time =  0.03353718900001468
precision: 0.6990794016110472
recall: 0.6988505747126437
train_accuracy: 0.8457142857142858
test_accuracy: 0.6976744186046512
Wall time: 2min 44s


Результаты работы алгоритмов для второго датасета подтверждают всё, что было написано в выводах выше. Точность хорошая, но временные затраты просто космические. На данном примере видно, что, хоть результат (победа одной из команд) и зависел от имеющихся в датасете данных куда сильнее, чем в первом случае, эти закономерности смогли использовать только два метода: дерево решений и случайный лес, в то время как метрики KNN и логистической регрессии оказались даже хуже, чем в случае с шахматами.

## Выводы по всей работе

Для меня удивительными в данной работе оказались две противоположные вещи: 1. Собственные модели практически не уступали в точности предсказаний моделям и ScikitLearn. 2. Хоть я и ожидал, что время работы будет хуже, но чтобы в 100, а в худшем случае и почти в 500 раз медленнее.
Очевидно, что в моделях из пакета применялись различные оптимизации, но чтобы настолько сокращать время работы, это действительно удивительно. 