In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

https://archive.ics.uci.edu/dataset/81/pen+based+recognition+of+handwritten+digits  
https://archive.ics.uci.edu/dataset/59/letter+recognition   
https://archive.ics.uci.edu/dataset/80/optical+recognition+of+handwritten+digits    

In [2]:
# Column names for the car data set; they are supplied explicitly via `names=`
# so the first line of the file is read as data.
header = [
    'buying', 'maint', 'doors', 'persons',
    'lug_boot', 'safety', 'class',
]
df = pd.read_csv('data/cars/car.data', names=header)

# Alternative data set (letter recognition), kept disabled:
#header = ['class', 'x-box', 'y-box', 'width', 'high', 'onpix', 'x-bar', 'y-bar', 'x2bar', 'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy', 'y-ege', 'yegvx']
#df = pd.read_csv('letter_recognition/letter-recognition.data', names=header)

df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [3]:
def calc_classes_exmpls(dataframe):
    """Print the number of samples of each class.

    One line of the form ``<label>: <count>`` is printed for every distinct
    value of the ``'class'`` column, in order of first appearance.

    Args:
        dataframe: pandas DataFrame with a ``'class'`` column.
    """
    labels = dataframe['class']
    for cls in labels.unique():
        n_rows = int((labels == cls).sum())
        print(f"{cls}: {n_rows}")

In [4]:
# List the distinct values found in every column, then the per-class sample counts.
for column, values in df.items():
    print(f"For columns '{column}' unique values are {values.unique()}")
print()
calc_classes_exmpls(df)

For columns 'buying' unique values are ['vhigh' 'high' 'med' 'low']
For columns 'maint' unique values are ['vhigh' 'high' 'med' 'low']
For columns 'doors' unique values are ['2' '3' '4' '5more']
For columns 'persons' unique values are ['2' '4' 'more']
For columns 'lug_boot' unique values are ['small' 'med' 'big']
For columns 'safety' unique values are ['low' 'med' 'high']
For columns 'class' unique values are ['unacc' 'acc' 'vgood' 'good']

unacc: 1210
acc: 384
vgood: 65
good: 69


In [None]:
# Undersample the majority class: randomly drop 'unacc' rows until that class
# is as large as 'acc'. On the full data set this is 1210 - 384 = 826 rows,
# the number that used to be hard-coded here. Deriving it from the counts also
# makes the cell safe to re-run (nothing is dropped once the classes are level).
class_counts = df['class'].value_counts()
n_drop = max(int(class_counts.get('unacc', 0) - class_counts.get('acc', 0)), 0)

# A fixed seed keeps the retained subset identical on every run.
tmp = df[df['class'] == 'unacc'].sample(n_drop, random_state=0)
new_df = df.drop(index=tmp.index)
calc_classes_exmpls(new_df)

In [None]:
# Continue with the undersampled frame. `df` and `new_df` are the same object
# from here on, so in-place changes made through one name show up in the other.
df=new_df

In [None]:
# Remember the distinct class labels (sorted by np.unique) before they are
# replaced by codes; they are used later as report target names.
original_classes = np.unique(df['class'].to_numpy())

# Replace every column, the target included, with integer codes.
# The single encoder is re-fitted per column, so after the loop `le` only
# knows the mapping of the last column processed.
# NOTE(review): LabelEncoder numbers the sorted distinct values, so feature
# levels are ordered alphabetically (e.g. 'high' < 'low' < 'med' < 'vhigh'),
# not by their meaning - confirm this is acceptable for the models below.
le = preprocessing.LabelEncoder()
for column_name in list(df.columns):
    df[column_name] = le.fit_transform(df[column_name])

# Split into target and feature frame.
y = df['class']
X = df.drop(columns='class')

In [None]:
# Show the current contents of `df`.
df

In [None]:
# Plain NumPy arrays for the models. np.asarray (instead of `.values`) keeps
# the cell re-runnable when X is already an ndarray from a previous run.
X = np.asarray(X)
Y = np.asarray(y).astype(int)
feature_num = X.shape[1]
classes_num = len(np.unique(Y))
print('Numb of features: ', feature_num)
print('Numb of classes: ', classes_num)

# random_state: the same split on every run.
# stratify: keeps the class proportions in both parts, so the small classes
# cannot end up missing from the test set.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0, stratify=Y
)
print(x_train.shape)
print(y_test.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: scikit-learn random forest on the label-encoded features.
clf = RandomForestClassifier(criterion='log_loss', random_state=0)
clf.fit(x_train, y_train)

# Evaluate here, in the same form as the LogisticRegression cell, because
# `clf` is rebound to another model in the next cell.
print(clf.score(x_test, y_test))
print(classification_report(y_test, clf.predict(x_test), target_names=list(map(str, original_classes.tolist())), digits=2, zero_division=True))

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit the scaler on the training split only and reuse it for the test split.
# Fitting a second scaler on the test data would scale it with different
# mean/variance than the data the model was trained on (and leaks test-set
# statistics into the evaluation).
lr_scaler = preprocessing.StandardScaler().fit(x_train)
x_train_lr = lr_scaler.transform(x_train)
x_test_lr = lr_scaler.transform(x_test)

# Reference model: unregularised logistic regression without intercept.
clf = LogisticRegression(random_state=1, max_iter=1000, penalty=None, fit_intercept=False)
clf.fit(x_train_lr, y_train)
y_preds = clf.predict(x_test_lr)
print(clf.score(x_test_lr, y_test))
print(classification_report(y_test, y_preds, target_names=list(map(str, original_classes.tolist())), digits=2, zero_division=True))

In [None]:
# Small constant added inside the logarithms of CrossEntropy.forward so that
# log(0) is never evaluated.
epsilon=1e-10
class Softmax:
    """Row-wise softmax activation for 2-D inputs of shape (samples, classes)."""

    @staticmethod
    def activation(z):
        """Return the softmax of every row of ``z``.

        Args:
            z: 2-D array-like of scores, one row per sample.

        Returns:
            Array of the same shape whose rows are non-negative and sum to 1.
        """
        z = np.asarray(z)
        # Subtracting the row maximum leaves the result unchanged
        # (the common factor exp(-max) cancels) but keeps exp() from
        # overflowing to inf, which would turn the row into NaN.
        shifted = z - np.max(z, axis=1, keepdims=True)
        z_exp = np.exp(shifted)
        return z_exp / np.sum(z_exp, axis=1, keepdims=True)

    @staticmethod
    def derivative(z):
        """Return ``s * (1 - s)`` element-wise, where ``s = activation(z)``.

        This is only the diagonal of the softmax Jacobian; the cross terms
        between classes are not included.
        """
        softm = Softmax.activation(z)
        return softm * (1 - softm)
    
class CrossEntropy:
    """Element-wise (binary) cross-entropy between targets and probabilities."""

    @staticmethod
    def forward(y_true, y_pred):
        """Return the mean cross-entropy over all elements.

        Computes ``-mean(y * log(p) + (1 - y) * log(1 - p))``. The module-level
        ``epsilon`` is added inside both logarithms to avoid ``log(0)``.

        Args:
            y_true: array of targets in [0, 1] (e.g. one-hot rows).
            y_pred: array of predicted probabilities, same shape as ``y_true``.

        Returns:
            Scalar loss; non-negative up to the epsilon correction.
        """
        # Both terms carry the minus sign; previously only the first one did,
        # so confident correct "negative" predictions were penalised and the
        # value could be zero or negative.
        log_likelihood = (
            y_true * np.log(y_pred + epsilon)
            + (1 - y_true) * np.log(1 - y_pred + epsilon)
        )
        return -np.mean(log_likelihood)

    @staticmethod
    def backward(y_true, y_pred):
        """Return the element-wise derivative of the loss w.r.t. ``y_pred``.

        ``-y / p + (1 - y) / (1 - p)``, with ``p`` clipped to
        ``[epsilon, 1 - epsilon]`` so that predictions of exactly 0 or 1 do not
        divide by zero. Values strictly inside that range are unaffected.
        """
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return -y_true / y_pred + (1 - y_true) / (1 - y_pred)


def data_gen(x, y, batch_size):
    """Yield consecutive mini-batches of ``x`` and ``y``.

    Args:
        x: 2-D array, one row per sample.
        y: 2-D array with the same number of rows as ``x``.
        batch_size: number of rows per batch; the last batch may be shorter.

    Yields:
        Tuples ``(x_batch, y_batch)`` covering the rows in order.
    """
    total_rows = x.shape[0]
    for start in range(0, total_rows, batch_size):
        rows = slice(start, start + batch_size)
        yield x[rows, :], y[rows, :]

def get_onehot(y, classes=None):
    """One-hot encode the labels in ``y``.

    Args:
        y: array-like of labels; it is flattened before encoding.
        classes: optional sequence of all possible labels. Column ``i`` of the
            result corresponds to ``classes[i]``. When omitted (or empty) the
            sorted distinct values of ``y`` are used, which means the width of
            the result depends on which labels happen to occur in ``y``.

    Returns:
        Float array of shape ``(len(y), number_of_classes)`` with exactly one
        1.0 per row.

    Raises:
        ValueError: if ``classes`` is given and ``y`` contains a label that is
            not in it.
    """
    flat = np.asarray(y).reshape(-1)

    if classes is None or len(classes) == 0:
        # np.unique returns the sorted labels together with each element's
        # position in that sorted array - exactly the column index needed.
        classes, codes = np.unique(flat, return_inverse=True)
    else:
        classes = np.asarray(classes)
        index_of = {label: pos for pos, label in enumerate(classes.tolist())}
        try:
            codes = np.array([index_of[label] for label in flat.tolist()], dtype=int)
        except KeyError as exc:
            raise ValueError(f"label {exc.args[0]!r} is not in `classes`") from None

    return np.eye(len(classes))[codes.reshape(-1)]

In [None]:
class LogicRegressor:
    """Multinomial (softmax) logistic regression trained by mini-batch gradient descent.

    The model computes ``softmax(x @ W.T + b)`` with ``W`` of shape
    ``(num_classes, num_features)`` and ``b`` of shape ``(1, num_classes)``.
    """

    def __init__(self, num_features, num_classes, orig_classes, random_state=None) -> None:
        """Create the model and initialise its parameters.

        Args:
            num_features: number of input features.
            num_classes: number of output classes.
            orig_classes: array of human-readable class names, used as
                ``target_names`` in the report printed by :meth:`score`.
            random_state: optional seed for the parameter initialisation.
                When ``None`` the global ``np.random`` state is used.
        """
        self.num_features = num_features
        self.num_classes = num_classes
        self.orig_classes = orig_classes
        # The np.random module and a Generator both offer uniform()/random(),
        # so either can serve as the source of randomness.
        self._rng = np.random if random_state is None else np.random.default_rng(random_state)

        self._init_weights()
        self._init_biases()

    def _init_weights(self):
        """Draw the weights from the Glorot/Xavier uniform distribution."""
        # Glorot limit is sqrt(6 / (fan_in + fan_out)).
        limit = np.sqrt(6 / (self.num_features + self.num_classes))
        self.weights = self._rng.uniform(-limit, limit, size=(self.num_classes, self.num_features))

    def _init_biases(self):
        """Draw the biases uniformly from [0, 1)."""
        self.biases = self._rng.random(size=(1, self.num_classes))

    def loss(self, y_pred, y_true):
        """Return the cross-entropy between predictions and one-hot targets."""
        return CrossEntropy.forward(y_true, y_pred)

    def loss_d(self, y_pred, y_true):
        """Return the gradient of the loss w.r.t. the logits.

        For softmax followed by cross-entropy this is ``y_pred - y_true``.
        """
        return y_pred - y_true

    def activation(self, x):
        """Apply the softmax activation row-wise."""
        return Softmax.activation(x)

    def activation_d(self, y):
        """Return the element-wise softmax derivative (see ``Softmax.derivative``)."""
        return Softmax.derivative(y)

    def predict(self, x):
        """Return class probabilities of shape ``(len(x), num_classes)``.

        Also stores ``x`` in ``self.inp`` for a following :meth:`_backward`.
        """
        return self.activation(self._forward(x).astype(float))

    def _forward(self, x):
        """Return the logits ``x @ W.T + b`` and remember ``x`` in ``self.inp``."""
        self.inp = x
        y_pred = np.matmul(x, self.weights.T) + self.biases
        return y_pred

    def _backward(self, loss_d, lr=0.01):
        """Take one gradient-descent step.

        Args:
            loss_d: gradient of the loss w.r.t. the logits, shape
                ``(batch, num_classes)``.
            lr: learning rate.
        """
        self.weights = self.weights - lr * np.matmul(loss_d.T, self.inp)
        # Per-class bias gradient, summed over the batch like the weight
        # gradient. A single mean over all elements is always ~0 here, because
        # each row of (softmax - one_hot) sums to zero, so the biases would
        # never be updated.
        self.biases = self.biases - lr * loss_d.sum(axis=0, keepdims=True)

    def train(self, x_train, y_train, x_test=None, y_test=None, epochs=10, lr=0.01, batch_size=351):
        """Fit the model.

        Args:
            x_train: training features, shape ``(samples, num_features)``.
            y_train: training labels (not one-hot).
            x_test, y_test: optional evaluation data. When both are given the
                accuracy on them is recorded after every epoch.
            epochs: number of passes over the training data.
            lr: learning rate.
            batch_size: mini-batch size.

        Returns:
            ``(losses, accuracy)``: arrays with one entry per epoch. ``losses``
            holds the mean batch loss of the epoch; ``accuracy`` is empty when
            no evaluation data was given.

        Raises:
            ValueError: if ``y_train`` does not contain exactly ``num_classes``
                distinct labels.
        """
        # Sorted distinct labels; column i of the one-hot targets (and of the
        # predicted probabilities) corresponds to classes_[i].
        self.classes_ = np.unique(y_train)
        if len(self.classes_) != self.num_classes:
            raise ValueError(
                f"y_train has {len(self.classes_)} distinct labels, "
                f"but the model was created with num_classes={self.num_classes}"
            )
        y_onehot = get_onehot(y_train)

        track_accuracy = x_test is not None and y_test is not None
        if track_accuracy:
            y_test_flat = np.asarray(y_test).reshape(-1)

        losses = []
        accuracy = []
        for epoch in range(epochs):
            batch_losses = []
            for x, y in data_gen(x_train, y_onehot, batch_size=batch_size):
                y_pred = self.predict(x)
                batch_losses.append(self.loss(y_pred, y))
                self._backward(self.loss_d(y_pred, y), lr=lr)

            # NaN marks an epoch without any batch (empty training data).
            losses.append(float(np.mean(batch_losses)) if batch_losses else float('nan'))

            if track_accuracy:
                predicted = self.classes_[np.argmax(self.predict(x_test), axis=1)]
                accuracy.append(float(np.mean(predicted == y_test_flat)))

            if not epoch % 1000:
                print(f'Loss on {epoch} epoch: ', losses[-1])
        return np.array(losses), np.array(accuracy)

    def score(self, x_test, y_test, threshold=0.5):
        """Print a classification report and return the accuracy.

        Probabilities are turned into a boolean indicator matrix by comparing
        them with ``threshold``, so a row can end up with no positive class.
        The returned accuracy counts a sample as correct only when its whole
        indicator row equals the one-hot row of the true label.

        Args:
            x_test: features, shape ``(samples, num_features)``.
            y_test: true labels (not one-hot).
            threshold: probability at or above which a class counts as predicted.

        Returns:
            The accuracy as computed by ``accuracy_score``.
        """
        # Convert the predictions to a boolean indicator matrix.
        y_preds = self.predict(x_test) >= threshold

        # Same for the ground-truth labels. The classes seen during training
        # are passed along so the matrix keeps its width even if a class is
        # missing from y_test.
        y_true = get_onehot(y_test, classes=getattr(self, 'classes_', None))
        y_true = y_true.astype(bool)

        # Compute and print the metrics.
        print(classification_report(y_true, y_preds, target_names=list(map(str, self.orig_classes.tolist())), digits=2, zero_division=True))
        acc = accuracy_score(y_true, y_preds)
        print('Accuracy: ', acc)
        return acc



In [None]:
import matplotlib.pyplot as plt

num_features = x_train.shape[1]
num_classes = len(np.unique(y_train))
print(num_classes)

# Standardise with statistics of the training split only and apply the same
# transformation to the test split.
lrc_scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = lrc_scaler.transform(x_train)
x_test_scaled = lrc_scaler.transform(x_test)

# The initial weights are random; seed NumPy's global RNG so runs are repeatable.
np.random.seed(0)
LRC = LogicRegressor(
    num_features=num_features,
    num_classes=num_classes,
    orig_classes=original_classes,
)
# Train and evaluate on the scaled arrays (they used to be computed above and
# then ignored in favour of the raw features).
losses, accuracy = LRC.train(
    x_train_scaled, y_train, x_test_scaled, y_test,
    epochs=2000, batch_size=len(x_train_scaled), lr=1e-5,
)
LRC.score(x_test_scaled, y_test)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(np.arange(len(losses)), losses)
ax.set_title("Loss during training")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
plt.show()

# `accuracy` is empty when the training routine recorded none.
if accuracy.shape[0] > 0:
    fig, ax = plt.subplots()
    ax.plot(np.arange(len(accuracy)), accuracy)
    ax.set_title("Accuracy during training")
    ax.set_xlabel("Epochs")
    ax.set_ylabel("Accuracy")
    plt.show()

# Random Forest

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from tree import CART
from forest import RandomForest

def get_accuracy(y_true, y_preds):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: NumPy array of true labels.
        y_preds: predictions, compared element-wise with ``y_true``.

    Returns:
        Number of matching positions divided by ``y_true.shape[0]``.
    """
    hits = y_preds == y_true
    n_correct = np.sum(hits)
    return n_correct / y_true.shape[0]

# Raw rows as a NumPy array; the last column is used as the class label below.
# NOTE(review): the first cell of the notebook reads 'data/cars/car.data' -
# confirm which relative path is the right one.
df = pd.read_csv("cars/car.data", header=None).values

# Fixed seed so the accuracies printed below refer to the same split on every run.
df_train, df_test = train_test_split(df, test_size=0.3, shuffle=True, random_state=0)

# Single decision tree from the local `tree` module.
tree_1 = CART(max_depth=10)
tree_1.fit(df_train)

y_preds_1 = tree_1.predict(df_test[:, :-1])

print(get_accuracy(df_test[:, -1], y_preds_1))

# Forest from the local `forest` module.
# NOTE(review): the meaning of the three positional arguments is defined in
# forest.py and not visible here - consider passing them by keyword.
forest = RandomForest(30, 30, 100)
forest.fit(df_train)
y_preds = forest.predict(df_test[:, :-1])

print(get_accuracy(df_test[:, -1], y_preds))

