In [1]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%config InlineBackend.figure_format = 'retina'
import math
import pickle

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

from collections import Counter
from scipy.stats import mode

In [2]:
class KNN():
    def __init__(self, nbrs=5):
        self.nbrs = nbrs

    def fit(self, X_t, y_t):
        self.X_t = X_t.to_numpy()
        self.y_t = y_t.to_numpy()

    def dists(self, X_p):
        t = np.dot(X_p, self.X_t.transpose())
        dists = np.sqrt(-2 * t + np.square(self.X_t).sum(1) + np.matrix(np.square(X_p).sum(1)).T)
        return dists

    def predict(self, X_p):
        dists = self.dists(X_p.to_numpy())
        preds = np.zeros(dists.shape[0])
        for i in range(dists.shape[0]):
            labels = self.y_t[np.argsort(dists[i,:])].flatten()
            top_nn_y = labels[:self.nbrs]
            preds[i] = Counter(top_nn_y).most_common(1)[0][0]
        return preds

In [3]:
class NBC():
    def __init__(self):
        pass

    def fit(self, X_t, y_t):
        X_t = X_t.to_numpy()
        y_t = y_t.to_numpy()
        self.num_of_classes = np.max(y_t) + 1
        self.priorities = np.bincount(y_t) / len(y_t)
        self.Ms = np.array([X_t[np.where(y_t == i)].mean(axis=0) for i in range(self.num_of_classes)])
        self.stds = np.array([X_t[np.where(y_t == i)].std(axis=0) for i in range(self.num_of_classes)])
        return self

    def predict(self, X_p):
        X_p = X_p.to_numpy()
        res = []
        for i in range(len(X_p)):
            Ps = []
            for j in range(self.num_of_classes):
                Ps.append((1 / np.sqrt(2 * np.pi * self.stds[j]**2) * np.exp(-0.5*((X_p[i] - self.Ms[j]) / self.stds[j])**2)).prod() * self.priorities[j])
            Ps = np.array(Ps)
            res.append(Ps / Ps.sum())
        return np.array(res).argmax(axis=1)

In [4]:
class LR():
    def __init__(self, lr=0.01, steps=5000):
        self.lr = lr
        self.steps = steps

    def s(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        X = np.hstack((np.ones((X.shape[0], 1)), X))
        self.coefs = np.zeros(X.shape[1])

        for _ in range(self.steps):
            h = self.s(np.dot(X, self.coefs))
            self.coefs -= self.lr * \
                          np.dot(X.T, (h - y)) / y.size

    def predict(self, X):
        X = np.hstack((np.ones((X.shape[0], 1)), X))
        return self.s(np.dot(X, self.coefs)).round()

In [5]:
def add_bias_feature(a):
    a_extended = np.zeros((a.shape[0],a.shape[1]+1))
    a_extended[:,:-1] = a
    a_extended[:,-1] = int(1)  
    return a_extended

class SVM(BaseEstimator, ClassifierMixin):
    def __init__(self, etha=0.01, alpha=0.1, epochs=200):
        self._epochs = epochs
        self._etha = etha
        self._alpha = alpha
        self._w = None
        self.history_w = []
        self.train_errors = None
        self.val_errors = None
        self.train_loss = None
        self.val_loss = None

    def fit(self, X_train, Y_train, X_val, Y_val, verbose=False):
        X_train, Y_train, X_val, Y_val = X_train.to_numpy(), Y_train.to_numpy(), X_val.to_numpy(), Y_val.to_numpy()


        if len(set(Y_train)) != 2 or len(set(Y_val)) != 2:
            raise ValueError("Number of classes in Y is not equal 2!")

        X_train = add_bias_feature(X_train)
        X_val = add_bias_feature(X_val)
        self._w = np.random.normal(loc=0, scale=0.05, size=X_train.shape[1])
        #self.history_w.append(self._w)
        np.append(self.history_w, self._w)
        train_errors = []
        val_errors = []
        train_loss_epoch = []
        val_loss_epoch = []

        for epoch in range(self._epochs): 
            tr_err = 0
            val_err = 0
            tr_loss = 0
            val_loss = 0
            for i,x in enumerate(X_train):
                margin = Y_train[i]*np.dot(self._w,X_train[i])
                if margin >= 1:
                    self._w = self._w - self._etha*self._alpha*self._w/self._epochs
                    tr_loss += self.soft_margin_loss(X_train[i],Y_train[i])
                else:
                    self._w = self._w +\
                    self._etha*(Y_train[i]*X_train[i] - self._alpha*self._w/self._epochs)
                    tr_err += 1
                    tr_loss += self.soft_margin_loss(X_train[i],Y_train[i])
                #self.history_w.append(self._w)
                np.append(self.history_w, self._w)
            for i,x in enumerate(X_val):
                val_loss += self.soft_margin_loss(X_val[i], Y_val[i])
                val_err += (Y_val[i]*np.dot(self._w,X_val[i])<1).astype(int)
            train_errors.append(tr_err)
            val_errors.append(val_err)
            train_loss_epoch.append(tr_loss)
            val_loss_epoch.append(val_loss)
        self.history_w = np.array(self.history_w)    
        self.train_errors = np.array(train_errors)
        self.val_errors = np.array(val_errors)
        self.train_loss = np.array(train_loss_epoch)
        self.val_loss = np.array(val_loss_epoch)                    

    def predict(self, X:np.array) -> np.array:
        y_pred = []
        X_extended = add_bias_feature(X)
        for i in range(len(X_extended)):
            y_pred.append(np.sign(np.dot(self._w,X_extended[i])))
        return np.array(y_pred)         

    def hinge_loss(self, x, y):
        return max(0,1 - y*np.dot(x, self._w))

    def soft_margin_loss(self, x, y):
        return self.hinge_loss(x,y)+self._alpha*np.dot(self._w, self._w)

In [6]:
df = pd.read_csv('dataset.csv')
labelencoder=LabelEncoder()
for column in df.columns:
    df[column] = labelencoder.fit_transform(df[column])
X = df.drop([' salary'], axis=1)
y = df[' salary']

In [7]:
def pipeline(model, X, y, k_folds=5):
    kf = KFold(n_splits=k_folds, random_state=16, shuffle=True)
    scores = np.zeros(k_folds)
    precisions = np.zeros(k_folds)    
    recalls = np.zeros(k_folds)    
    for i, (train_index, val_index) in enumerate(kf.split(X, y)):
        X_train, y_train = X.loc[train_index], y.loc[train_index]
        X_test, y_test = X.loc[val_index], y.loc[val_index]
        if isinstance(model, SVM):
           model.fit(X_train, y_train, X_test, y_test)
           y_pred = model.predict(X_test)
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
        #model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        confusions = confusion_matrix(y_test, y_pred)
        scores[i] = accuracy_score(y_test, y_pred)
        precisions[i] = precision_score(y_test, y_pred)
        recalls[i] = recall_score(y_test, y_pred)
        print("Confusion matrix:\n", confusions, "\n")
        print("Acuracy score: {}".format(scores[i]))
        print("Precision score: {}".format(precisions[i]))
        print("Recall score: {}\n".format(recalls[i]))
    return (scores, precisions, recalls)

Обучаем модели и получаем оценки метрик

In [18]:
%%time
lr = LR()
metrics = pipeline(lr, X, y)
print("Accuracy: ", metrics[0].mean())
print("Precision: ", metrics[1].mean())
print("Recall: ", metrics[2].mean())

Confusion matrix:
 [[1082   34]
 [1004   50]] 

Acuracy score: 0.5216589861751152
Precision score: 0.5952380952380952
Recall score: 0.04743833017077799

Confusion matrix:
 [[   0 1112]
 [   0 1058]] 

Acuracy score: 0.4875576036866359
Precision score: 0.4875576036866359
Recall score: 1.0

Confusion matrix:
 [[1112   17]
 [1007   34]] 

Acuracy score: 0.528110599078341
Precision score: 0.6666666666666666
Recall score: 0.03266090297790586

Confusion matrix:
 [[428 673]
 [142 926]] 

Acuracy score: 0.6242508068234209
Precision score: 0.5791119449656035
Recall score: 0.8670411985018727

Confusion matrix:
 [[1141   17]
 [ 975   36]] 

Acuracy score: 0.5426463808206546
Precision score: 0.6792452830188679
Recall score: 0.03560830860534125

Accuracy:  0.5408448753168335
Precision:  0.6015639187151738
Recall:  0.39654974805117954
CPU times: user 30.8 s, sys: 1min 11s, total: 1min 42s
Wall time: 12.9 s


In [19]:
%%time
knn = KNN()
metrics = pipeline(knn, X, y)
print("Accuracy: ", metrics[0].mean())
print("Precision: ", metrics[1].mean())
print("Recall: ", metrics[2].mean())

Confusion matrix:
 [[743 373]
 [358 696]] 

Acuracy score: 0.6631336405529954
Precision score: 0.6510757717492984
Recall score: 0.6603415559772297

Confusion matrix:
 [[743 369]
 [350 708]] 

Acuracy score: 0.6686635944700461
Precision score: 0.6573816155988857
Recall score: 0.6691871455576559

Confusion matrix:
 [[695 434]
 [333 708]] 

Acuracy score: 0.6465437788018433
Precision score: 0.6199649737302977
Recall score: 0.6801152737752162

Confusion matrix:
 [[727 374]
 [374 694]] 

Acuracy score: 0.6551406177962195
Precision score: 0.649812734082397
Recall score: 0.649812734082397

Confusion matrix:
 [[755 403]
 [308 703]] 

Acuracy score: 0.6721991701244814
Precision score: 0.635623869801085
Recall score: 0.695351137487636

Accuracy:  0.661136160349117
Precision:  0.6427717929923927
Recall:  0.6709615693760269
CPU times: user 13.9 s, sys: 1.13 s, total: 15 s
Wall time: 15 s


In [20]:
%%time
svm = SVM()
metrics = pipeline(svm, X, y)
print("Accuracy: ", metrics[0].mean())
print("Precision: ", metrics[1].mean())
print("Recall: ", metrics[2].mean())

Confusion matrix:
 [[   0 1116]
 [   0 1054]] 

Acuracy score: 0.4857142857142857
Precision score: 0.4857142857142857
Recall score: 1.0

Confusion matrix:
 [[   0 1112]
 [   0 1058]] 

Acuracy score: 0.4875576036866359
Precision score: 0.4875576036866359
Recall score: 1.0

Confusion matrix:
 [[   0 1129]
 [   0 1041]] 

Acuracy score: 0.47972350230414745
Precision score: 0.47972350230414745
Recall score: 1.0

Confusion matrix:
 [[   0 1101]
 [   0 1068]] 

Acuracy score: 0.49239280774550487
Precision score: 0.49239280774550487
Recall score: 1.0

Confusion matrix:
 [[   0 1158]
 [   0 1011]] 

Acuracy score: 0.4661134163208852
Precision score: 0.4661134163208852
Recall score: 1.0

Accuracy:  0.48230032315429183
Precision:  0.48230032315429183
Recall:  1.0
CPU times: user 2min 4s, sys: 0 ns, total: 2min 4s
Wall time: 2min 4s


In [17]:
%%time
nbc = NBC()
metrics = pipeline(nbc, X, y)
print("Accuracy: ", metrics[0].mean())
print("Precision: ", metrics[1].mean())
print("Recall: ", metrics[2].mean())

Confusion matrix:
 [[966 150]
 [376 678]] 

Acuracy score: 0.7576036866359447
Precision score: 0.8188405797101449
Recall score: 0.6432637571157496

Confusion matrix:
 [[965 147]
 [386 672]] 

Acuracy score: 0.7543778801843318
Precision score: 0.8205128205128205
Recall score: 0.6351606805293005

Confusion matrix:
 [[992 137]
 [373 668]] 

Acuracy score: 0.7649769585253456
Precision score: 0.8298136645962733
Recall score: 0.6416906820365034

Confusion matrix:
 [[948 153]
 [360 708]] 

Acuracy score: 0.7634854771784232
Precision score: 0.8222996515679443
Recall score: 0.6629213483146067

Confusion matrix:
 [[995 163]
 [331 680]] 

Acuracy score: 0.7722452743199631
Precision score: 0.8066429418742586
Recall score: 0.6726013847675568

Accuracy:  0.7625378553688017
Precision:  0.8196219316522881
Recall:  0.6511275705527434
CPU times: user 594 ms, sys: 24 ms, total: 618 ms
Wall time: 597 ms


Коробочные решения

In [12]:
def LoRtrainer(X,y,final = False):
    print('Logistic Regression')
    from sklearn.linear_model import LogisticRegression
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X,y)
    if final:
        return classifier
    else:
        accuracies = cross_val_score(estimator = classifier, X = X, y = y, cv = 10)
        print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
        print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
        print('')
        
def KNNtrainer(X,y,final = False):
    print('KNN Classifier')
    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    classifier.fit(X,y)
    if final:
        return classifier
    else:
        accuracies = cross_val_score(estimator = classifier, X = X, y = y, cv = 10)
        print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
        print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
        print('')

def NBCtrainer(X,y,final = False):
    print('Naive Bayes Classifier')
    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(X,y)
    if final:
        return classifier
    else:
        accuracies = cross_val_score(estimator = classifier, X = X, y = y, cv = 10)
        print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
        print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
        print('')

def SVCtrainer(X,y,final = False):
    print('SVM Classifier')
    from sklearn.svm import SVC
    classifier = SVC(kernel = 'linear')
    classifier.fit(X,y)
    if final:
         return classifier
    else:
        accuracies = cross_val_score(estimator = classifier, X = X, y = y, cv = 10)
        print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
        print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
        print('')
        
        
        
def TestAll(X,y):
    LoRtrainer(X,y)
    KNNtrainer(X[: 7000],y[: 7000])
    NBCtrainer(X,y)
    SVCtrainer(X[: 100],y[: 100])

TestAll(X, y)

Logistic Regression
Accuracy: 73.95 %
Standard Deviation: 1.23 %

KNN Classifier
Accuracy: 65.94 %
Standard Deviation: 1.64 %

Naive Bayes Classifier
Accuracy: 76.12 %
Standard Deviation: 0.76 %

SVM Classifier
Accuracy: 74.00 %
Standard Deviation: 10.20 %



In [21]:
import pickle
with open("my_models.pkl", "wb") as f:
    pickle.dump(lr, f)
    pickle.dump(knn, f)
    pickle.dump(svm, f)
    pickle.dump(nbc, f)

#### Вывод
 В результате проделанной лабораторной были реализованы методы LR, SVM, KNN, Naive Bayes. По результатам проделанной реботы можно сделать вывод, что для данного набора данных лучше всего подходит Naive Bayes, который дает точность около 82%. Следует заметить, что метод SVM обучается очень долго на данной выборке, поэтому было сильно уменьшено количество входных данных. Точность построенных моделей примерно совпадает с коробочными решениями.