In [None]:
%pip install sklearn

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import random
from collections import Counter

In [3]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
class MyLogReg():
    """Binary logistic regression trained by (mini-batch) gradient descent.

    Parameters
    ----------
    n_iter : int
        Number of gradient steps performed by ``fit``.
    learning_rate : float or callable
        Constant step size, or a callable that receives the 1-based iteration
        number and returns the step size for that iteration.
    weights : array-like, optional
        Initial value of the ``weights`` attribute; ``fit`` overwrites it.
    metric : {'accuracy', 'precision', 'recall', 'f1', 'roc_auc'} or None
        Metric computed by ``get_score``; any other value yields ``None``.
    reg : {'l1', 'l2', 'elasticnet'} or None
        Regularisation type. The penalty is applied to every weight,
        including the intercept.
    l1_coef, l2_coef : float
        Regularisation strengths.
    sgd_sample : int, float or None
        Mini-batch size: a row count (int) or a fraction of the rows (float).
        ``None`` uses the full data set on every step.
    random_state : int
        Seed passed to ``random.seed`` at the start of ``fit``.
    """

    def __init__(self, n_iter=10, learning_rate=0.1, weights=None, metric=None, reg=None, l1_coef=0, l2_coef=0, sgd_sample=None, random_state=42):
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        # Build a fresh list per instance; a literal [] default would be shared by all models.
        self.weights = [] if weights is None else weights
        self.metric = metric
        self.reg = reg
        self.l1_coef = l1_coef
        self.l2_coef = l2_coef
        self.sgd_sample = sgd_sample
        self.random_state = random_state
        self.metric_val = None

    def __str__(self):
        return f"{self.__class__.__name__} class: n_iter={self.n_iter}, learning_rate={self.learning_rate}"

    @staticmethod
    def _with_bias(x):
        """Return ``x`` with a leading all-ones ``'x0'`` column, without touching the caller's frame."""
        if len(x.columns) > 0 and x.columns[0] == 'x0':
            return x
        extended = x.copy()
        extended.insert(0, 'x0', 1)
        return extended

    @staticmethod
    def _sigmoid(z):
        """Logistic function applied element-wise."""
        return 1 / (1 + np.exp(-z))

    def _log_loss(self, x_values, y_values):
        """Mean log-loss of the current weights on the given arrays plus the regularisation penalty."""
        eps = 1e-15  # keeps log() finite when a probability saturates at 0 or 1
        proba = np.clip(self._sigmoid(x_values @ self.weights), eps, 1 - eps)
        data_term = -np.mean(y_values * np.log(proba) + (1 - y_values) * np.log(1 - proba))
        return data_term + self.calculate_reg()

    def fit(self, x, y, verbose=False):
        """Fit the model on DataFrame ``x`` and binary target ``y``.

        ``x`` is not modified. If ``verbose`` is truthy, progress is printed on
        every iteration divisible by ``verbose``: the full-data loss before the
        step and the metric after it. The score of the final weights is always
        stored for ``get_best_score``.
        """
        random.seed(self.random_state)

        features = self._with_bias(x)
        x_all = features.to_numpy(dtype=float)
        y_all = np.asarray(y, dtype=float)
        n_rows = x_all.shape[0]
        self.weights = np.ones(x_all.shape[1])

        for i in range(1, self.n_iter + 1):
            x_batch, y_batch = x_all, y_all
            if self.sgd_sample is not None:
                if isinstance(self.sgd_sample, float):
                    batch_size = max(int(round(self.sgd_sample * n_rows)), 1)
                else:
                    batch_size = int(self.sgd_sample)
                rows = random.sample(range(n_rows), batch_size)
                x_batch, y_batch = x_all[rows], y_all[rows]

            step = self.learning_rate(i) if callable(self.learning_rate) else self.learning_rate

            log_now = bool(verbose) and i % verbose == 0
            if log_now:
                # Always measured on the whole training set so the value is
                # comparable between iterations even when mini-batches are used.
                logloss = self._log_loss(x_all, y_all)

            residual = self._sigmoid(x_batch @ self.weights) - y_batch
            grad = residual @ x_batch / len(y_batch) + self.calculate_reg_grad()
            self.weights = self.weights - step * grad

            if log_now:
                self.metric_val = self.get_score(features, y)
                print(f'{i} | loss: {logloss} | {self.metric}: {self.metric_val}')

        # Score the final weights regardless of the logging schedule.
        self.metric_val = self.get_score(features, y)

    def get_coef(self):
        """Return the fitted feature weights as an ndarray, without the intercept."""
        return np.asarray(self.weights)[1:]

    def predict_proba(self, x):
        """Return the predicted probability of class 1 for every row of ``x`` (``x`` is not modified)."""
        features = self._with_bias(x)
        return self._sigmoid(features.dot(np.asarray(self.weights, dtype=float)))

    def predict(self, x):
        """Return a list of 0/1 labels using a 0.5 probability threshold."""
        proba = self.predict_proba(x)
        return [1 if p > 0.5 else 0 for p in proba]

    def get_best_score(self):
        """Return the metric value stored by the last ``fit`` call."""
        return self.metric_val

    def get_score(self, x, y):
        """Compute ``self.metric`` for the current weights on ``x`` / ``y``.

        Ratios whose denominator is zero (for example precision when nothing is
        predicted positive) are reported as 0.0. Returns ``None`` when
        ``self.metric`` is not one of the supported names.
        """
        if self.metric == 'roc_auc':
            return roc_auc_score(y_true=y, y_score=self.predict_proba(x))
        if self.metric not in ('accuracy', 'precision', 'recall', 'f1'):
            return None

        # Count the confusion-matrix cells directly so that data containing a
        # single class does not break the computation.
        actual = np.asarray(y)
        predicted = np.asarray(self.predict(x))
        tp = int(np.sum((actual == 1) & (predicted == 1)))
        tn = int(np.sum((actual == 0) & (predicted == 0)))
        fp = int(np.sum((actual == 0) & (predicted == 1)))
        fn = int(np.sum((actual == 1) & (predicted == 0)))

        def ratio(numerator, denominator):
            return numerator / denominator if denominator else 0.0

        if self.metric == 'accuracy':
            return ratio(tp + tn, tp + tn + fp + fn)
        precision = ratio(tp, tp + fp)
        recall = ratio(tp, tp + fn)
        if self.metric == 'precision':
            return precision
        if self.metric == 'recall':
            return recall
        return ratio(2 * precision * recall, precision + recall)

    def calculate_reg(self):
        """Return the regularisation penalty added to the loss (0 when ``reg`` is unset)."""
        weights = np.asarray(self.weights, dtype=float)
        l1_term = self.l1_coef * np.sum(np.abs(weights))
        l2_term = self.l2_coef * np.sum(np.square(weights))
        if self.reg == 'l1':
            return l1_term
        elif self.reg == 'l2':
            return l2_term
        elif self.reg == 'elasticnet':
            return l1_term + l2_term
        return 0

    def calculate_reg_grad(self):
        """Return the gradient of the regularisation penalty (0 when ``reg`` is unset)."""
        weights = np.asarray(self.weights, dtype=float)
        if self.reg == 'l1':
            return self.l1_coef * np.sign(weights)
        elif self.reg == 'l2':
            return self.l2_coef * 2 * weights
        elif self.reg == 'elasticnet':
            return self.l1_coef * np.sign(weights) + self.l2_coef * 2 * weights
        return 0


In [4]:
from sklearn.datasets import make_classification

# Synthetic binary-classification data wrapped in pandas containers.
X, y = make_classification(n_samples=1000, n_features=14, n_informative=10, random_state=42)
X = pd.DataFrame(X, columns=[f'x{col + 1}' for col in range(X.shape[1])])
y = pd.Series(y)

# Train with progress printed every 5th iteration, then show the stored ROC AUC.
model = MyLogReg(metric='roc_auc')
model.fit(X, y, verbose=5)
model.get_best_score()



5 | loss: 2.6812948097440965 | roc_auc: 0.5696102784411138
10 | loss: 1.8729518276291253 | roc_auc: 0.6219904879619518


np.float64(0.6219904879619518)

In [34]:
from collections import Counter



class MyKNNClf:
    """k-nearest-neighbours classifier for binary (0/1) targets.

    Parameters
    ----------
    k : int
        Number of neighbours consulted per query row.
    metric : str
        'euclidean', 'chebyshev' or 'manhattan'; any other value selects
        cosine distance.
    weight : str
        'uniform' (plain vote), 'rank' (1 / neighbour rank); any other value
        selects inverse-distance weighting.
    """

    def __init__(self, k=3, metric='euclidean', weight='uniform'):
        self.weight = weight
        self.metric = metric
        self.k = k
        self.train_size = None

    def fit(self, X, y):
        """Memorise copies of the training DataFrame ``X`` and target Series ``y``."""
        self.X = X.copy()
        self.y = y.copy()
        self.train_size = X.shape

    def __str__(self):
        return f"MyKNNClf class: k={self.k}"

    @staticmethod
    def _inverse_distance(neighbour_distances):
        """Turn neighbour distances into vote weights.

        Neighbours at distance zero would get infinite weight, so when any are
        present they share the whole vote and the remaining neighbours get 0.
        """
        exact = neighbour_distances == 0
        if exact.any():
            return exact.astype(float)
        return 1 / neighbour_distances

    def _class_weights(self, test_row):
        """Return ``(weight of class 0, weight of class 1)`` among the k nearest neighbours of ``test_row``."""
        distances = np.asarray(self.calculate_distance(test_row), dtype=float)
        # Work with positions throughout so the result does not depend on the
        # index labels of the training frame.
        nearest = np.argsort(distances)[:self.k]
        labels = np.asarray(self.y)[nearest]

        if self.weight == 'uniform':
            weights = np.ones(len(nearest))
        elif self.weight == 'rank':
            weights = 1 / np.arange(1, len(nearest) + 1)
        else:
            weights = self._inverse_distance(distances[nearest])

        return float(weights[labels == 0].sum()), float(weights[labels == 1].sum())

    def predict(self, X_test):
        """Return an array with the predicted class (0 or 1) for every row of ``X_test``.

        A tie goes to class 1 with uniform weights and to class 0 with the
        weighted schemes.
        """
        predictions = []
        for _, test_row in X_test.iterrows():
            class_0, class_1 = self._class_weights(test_row)
            if self.weight == 'uniform':
                label = 0 if class_1 < class_0 else 1
            else:
                label = 1 if class_1 > class_0 else 0
            predictions.append(label)
        return np.array(predictions)

    def predict_proba(self, X_test):
        """Return an array with the class-1 share of the neighbour weights for every row of ``X_test``."""
        proba = []
        for _, test_row in X_test.iterrows():
            class_0, class_1 = self._class_weights(test_row)
            proba.append(class_1 / (class_0 + class_1))
        return np.array(proba)

    def calculate_distance(self, test_row):
        """Return a Series with the distance from ``test_row`` to every training row."""
        if self.metric == 'euclidean':
            return np.sqrt(((self.X - test_row) ** 2).sum(axis=1))
        elif self.metric == 'chebyshev':
            return (abs(self.X - test_row)).max(axis=1)
        elif self.metric == 'manhattan':
            return (abs(self.X - test_row)).sum(axis=1)
        else:
            # Cosine distance: 1 - cos(angle between the two vectors).
            return 1 - (self.X * test_row).sum(axis=1) / (np.sqrt((self.X ** 2).sum(axis=1)) * np.sqrt((test_row ** 2).sum()))




In [5]:
class MyKNNReg():
    """k-nearest-neighbours regressor.

    Parameters
    ----------
    k : int
        Number of neighbours averaged per query row.
    metric : str
        'euclidean', 'chebyshev' or 'manhattan'; any other value selects
        cosine distance.
    weight : str
        'uniform' (plain mean), 'rank' (1 / neighbour rank); any other value
        selects inverse-distance weighting.
    """

    def __init__(self, k=3, metric='euclidean', weight='uniform'):
        self.k = k
        self.train_size = None
        self.metric = metric
        self.weight = weight

    def __str__(self):
        return f"MyKNNReg class: k={self.k}"

    def fit(self, x, y):
        """Memorise copies of the training DataFrame ``x`` and target Series ``y``."""
        self.train_size = np.shape(x)
        # Copies keep the model independent of later changes to the caller's data.
        self.X = x.copy()
        self.y = y.copy()

    def predict(self, x_train):
        """Return an array with one predicted value per row of the query DataFrame ``x_train``."""
        all_targets = np.asarray(self.y, dtype=float)
        predictions = []
        for _, test_row in x_train.iterrows():
            distances = np.asarray(self.calculate_metric(test_row), dtype=float)
            # Positional indices of the k closest rows; the stable sort keeps
            # the earliest row first when distances tie.
            nearest = np.argsort(distances, kind='stable')[:self.k]
            targets = all_targets[nearest]

            if self.weight == 'uniform':
                predict_value = np.average(targets)
            elif self.weight == 'rank':
                weights = 1 / np.arange(1, len(nearest) + 1)
                predict_value = np.average(targets, weights=weights)
            else:
                neighbour_distances = distances[nearest]
                exact = neighbour_distances == 0
                # An exact match would get infinite weight; let exact matches
                # alone decide the prediction instead of dividing by zero.
                weights = exact.astype(float) if exact.any() else 1 / neighbour_distances
                predict_value = np.average(targets, weights=weights)

            predictions.append(float(predict_value))
        return np.array(predictions)

    def calculate_metric(self, test_row):
        """Return a Series with the distance from ``test_row`` to every training row."""
        if self.metric == "euclidean":
            return np.sqrt(((self.X - test_row) ** 2).sum(axis=1))
        elif self.metric == "chebyshev":
            return (abs(self.X - test_row)).max(axis=1)
        elif self.metric == "manhattan":
            return abs(self.X - test_row).sum(axis=1)
        else:
            # Cosine distance: 1 - cos(angle between the two vectors).
            return 1 - (self.X * test_row).sum(axis=1) / (np.sqrt((self.X ** 2).sum(axis=1)) * np.sqrt((test_row ** 2).sum()))




In [None]:
from sklearn.datasets import make_classification

# Generate one synthetic dataset and hold part of it out for evaluation.
# Regenerating it with the same seed would make the "test" rows identical to
# the training rows, so every query would have a neighbour at distance zero.
# Separate names keep the X / y used by other cells untouched.
knn_features, knn_target = make_classification(n_samples=1000, n_features=14, n_informative=10, random_state=42)
knn_features = pd.DataFrame(knn_features, columns=[f'col_{col}' for col in range(knn_features.shape[1])])
knn_target = pd.Series(knn_target)

X_train, X_test, y_train, y_test = train_test_split(knn_features, knn_target, test_size=0.2, random_state=42)
# The split shuffles the row labels; restore a 0..n-1 index on every part.
X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)
y_train, y_test = y_train.reset_index(drop=True), y_test.reset_index(drop=True)

# Create and fit the model
knn = MyKNNClf(4, 'manhattan', 'distance')
knn.fit(X_train, y_train)

# Use the model to predict classes
predictions = knn.predict(X_test)

# Show a summary of the predictions
print("Predictions:", sum(predictions))

proba = knn.predict_proba(X_test)

print("Proba:", sum(proba))


In [49]:
from collections import Counter


class MyTreeClf:
    """Binary decision-tree classifier whose leaves store the share of class 1.

    Parameters
    ----------
    max_depth : int
        Maximum depth of the tree.
    min_samples_split : int
        A node with fewer samples than this becomes a leaf.
    max_leafs : int
        Leaf budget used by the stopping rule in ``build_tree``.
    bins : int or None
        When set, a feature with more than ``bins - 1`` unique values is split
        on histogram bin edges instead of midpoints between unique values.
    criterion : str
        'entropy', or any other value for the Gini impurity.

    The fitted tree is a nested dict. Internal nodes have the keys 'feature',
    'split_value', 'left' and 'right'; a leaf is a one-item dict keyed
    'leaf_left', 'leaf_right' or, for a root that is itself a leaf, 'leaf'.
    """

    def __init__(self, max_depth=5, min_samples_split=2, max_leafs=20, bins=None, criterion="entropy"):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.leafs_cnt = 1
        self.leafs_sum = 0
        self.bins = bins
        self.criterion = criterion

    def __str__(self):
        return f"MyTreeClf class: max_depth={self.max_depth}, min_samples_split={self.min_samples_split}, max_leafs={self.max_leafs}"

    def calculate_entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels in ``y``."""
        shares = y.value_counts(normalize=True)
        return -np.sum(shares * np.log2(shares))

    def calculate_gini(self, y):
        """Return the Gini impurity of the labels in ``y``."""
        shares = y.value_counts(normalize=True)
        return 1 - np.sum(shares ** 2)

    def calculate_information_gain(self, y, y_left, y_right):
        """Return the impurity decrease obtained by splitting ``y`` into ``y_left`` and ``y_right``."""
        impurity = self.calculate_entropy if self.criterion == "entropy" else self.calculate_gini
        total = len(y)
        return (
            impurity(y)
            - len(y_left) / total * impurity(y_left)
            - len(y_right) / total * impurity(y_right)
        )

    def _split_candidates(self, column):
        """Return the thresholds to try for one feature column."""
        unique_values = np.sort(column.unique())
        if self.bins is None or len(unique_values) <= self.bins - 1:
            # Midpoints between consecutive unique values.
            return (unique_values[:-1] + unique_values[1:]) / 2
        _, bin_edges = np.histogram(column, bins=self.bins)
        return bin_edges[1:]

    def get_best_split(self, X, y):
        """Return ``(column, threshold, gain)`` of the best split.

        Returns ``(None, None, -1)`` when no threshold leaves samples on both
        sides.
        """
        best_ig = -1
        best_col = None
        best_split_value = None

        for col in X.columns:
            values = X[col]
            for split_value in self._split_candidates(values):
                y_left, y_right = y[values <= split_value], y[values > split_value]
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                ig = self.calculate_information_gain(y, y_left, y_right)
                if ig > best_ig:
                    best_ig = ig
                    best_col = col
                    best_split_value = split_value

        return (best_col, best_split_value, best_ig)

    def fit(self, X, y):
        """Build the tree from DataFrame ``X`` and 0/1 Series ``y``."""
        # Start the counters afresh so that refitting does not accumulate
        # leaves from a previous fit.
        self.leafs_cnt = 1
        self.leafs_sum = 0
        self.tree = self.build_tree(X, y)

    def build_tree(self, X, y, depth=0, leaf=None):
        """Recursively build and return the subtree for the samples ``X`` / ``y``.

        ``leaf`` is the suffix ('_left' / '_right') used for the key of a leaf
        created at this position; ``None`` marks the root.
        """
        leaf_value = Counter(y)[1] / len(y)
        leaf_key = 'leaf' if leaf is None else 'leaf' + leaf

        must_stop = (
            (self.leafs_cnt > 1 and self.leafs_cnt >= self.max_leafs)
            or depth >= self.max_depth
            or leaf_value == 0
            or leaf_value == 1
            or len(y) < self.min_samples_split
        )
        if not must_stop:
            # Search for a split only when the cheap checks allow one.
            col, split_value, ig = self.get_best_split(X, y)
            must_stop = ig == 0 or split_value is None

        if must_stop:
            self.leafs_sum += leaf_value
            return {leaf_key: leaf_value}

        self.leafs_cnt += 1

        left_mask = X[col] <= split_value
        right_mask = X[col] > split_value

        left_subtree = self.build_tree(X[left_mask], y[left_mask], depth + 1, "_left")
        right_subtree = self.build_tree(X[right_mask], y[right_mask], depth + 1, "_right")

        return {
            "feature": col,
            "split_value": split_value,
            "left": left_subtree,
            "right": right_subtree
        }

    def print_tree(self, node, depth=0):
        """Print the subtree rooted at ``node``, indented four spaces per level."""
        indent = " " * 4 * depth
        if len(node) == 1:
            # A leaf holds exactly one item, whatever its key is.
            (leaf_key, leaf_value), = node.items()
            print(f"{indent}{leaf_key} = {leaf_value}")
            return
        print(f"{indent}{node['feature']} <= {node['split_value']}")
        self.print_tree(node["left"], depth + 1)
        self.print_tree(node["right"], depth + 1)

    def predict(self, X):
        """Return a list of 0/1 labels using a 0.5 probability threshold."""
        return [1 if c > 0.5 else 0 for c in self.predict_proba(X)]

    def predict_proba(self, X):
        """Return a list with the class-1 share of the leaf each row of ``X`` falls into."""
        predictions = []
        for _, row in X.iterrows():
            node = self.tree
            while len(node) != 1:
                branch = "left" if row[node["feature"]] <= node["split_value"] else "right"
                node = node[branch]
            predictions.append(next(iter(node.values())))
        return predictions
            

In [51]:
# Load the dataset in this cell so it also works on a fresh top-to-bottom run;
# otherwise X and y still hold whatever an earlier cell left in them.
df = pd.read_csv('test.txt', header=None)
df.columns = ['variance', 'skewness', 'curtosis', 'entropy', 'target']
X, y = df.iloc[:, :4], df['target']

obj_2 = MyTreeClf(max_depth=4, min_samples_split=100, max_leafs=17, bins=16, criterion="gini")
obj_2.fit(X, y)
obj_2.print_tree(obj_2.tree)
print(obj_2.leafs_cnt)
print(obj_2.leafs_sum)

variance <= 0.7580312500000002
    skewness <= 6.2704249999999995
        variance <= -0.3773262499999994
            curtosis <= 5.774475000000001
                leaf_left = 0.9965753424657534
                leaf_right = 0.8963414634146342
            curtosis <= -0.08158750000000037
                leaf_left = 1.0
                leaf_right = 0.26153846153846155
        variance <= -4.12065875
            leaf_left = 0.9736842105263158
            leaf_right = 0.0
    curtosis <= -4.4146937500000005
        leaf_left = 0.7142857142857143
        variance <= 1.896305
            curtosis <= -1.7564374999999997
                leaf_left = 0.6
                leaf_right = 0.009259259259259259
            variance <= 2.20470625
                leaf_left = 0.044444444444444446
                leaf_right = 0.0
11
5.496128895934583


In [50]:
# Read the headerless CSV, name its columns and separate features from target.
df = pd.read_csv('test.txt', header=None)
df.columns = ['variance', 'skewness', 'curtosis', 'entropy', 'target']
X = df.iloc[:, :4]
y = df['target']

# Depth-1 tree on 8 histogram bins with the Gini criterion.
obj = MyTreeClf(max_depth=1, min_samples_split=1, max_leafs=2, bins=8, criterion="gini")
obj.fit(X, y)
obj.print_tree(obj.tree)
print(obj.leafs_cnt, obj.leafs_sum, sep="\n")



variance <= -0.10864999999999991
    leaf_left = 0.8333333333333334
    leaf_right = 0.14781491002570693
2
0.9811482433590403


In [45]:
# print_tree writes the tree itself and returns None, so call it directly;
# wrapping it in print() added a stray "None" line to the output.
obj.print_tree(obj.tree)
print(Counter(obj.predict_proba(X)))

variance <= 0.320165
    leaf_left = 0.8112633181126332
    leaf_right = 0.1076923076923077
None
3.6216 0.320165
Counter({0.1076923076923077: 1372})


In [26]:
# Entropy tree with exact midpoint splits (no binning).
obj_1 = MyTreeClf(max_depth=3, min_samples_split=2, max_leafs=5)
obj_1.fit(X, y)
obj_1.print_tree(obj_1.tree)
print(obj_1.tree)
print(obj_1.leafs_cnt, obj_1.leafs_sum, sep="\n")


<bound method NDFrame.head of       variance  skewness  curtosis  entropy
0      3.62160   8.66610   -2.8073 -0.44699
1      4.54590   8.16740   -2.4586 -1.46210
2      3.86600  -2.63830    1.9242  0.10645
3      3.45660   9.52280   -4.0112 -3.59440
4      0.32924  -4.45520    4.5718 -0.98880
...        ...       ...       ...      ...
1367   0.40614   1.34920   -1.4501 -0.55949
1368  -1.38870  -4.87730    6.4774  0.34179
1369  -3.75030 -13.45860   17.5932 -2.77710
1370  -3.56370  -8.38270   12.3930 -1.28230
1371  -2.54190  -0.65804    2.6842  1.19520

[1372 rows x 4 columns]>
variance <= {'feature': 'variance', 'split_value': np.float64(0.320165), 'left': {'feature': 'skewness', 'split_value': np.float64(5.86535), 'left': {'feature': 'curtosis', 'split_value': np.float64(6.21865), 'left': {'leaf_left': 0.9945205479452055}, 'right': {'leaf_right': 0.8397435897435898}}, 'right': {'feature': 'variance', 'split_value': np.float64(-3.4448999999999996), 'left': {'leaf_left': 0.975}, 'right'

In [9]:
# Entropy tree restricted to 4 histogram bins per feature.
obj_3 = MyTreeClf(max_depth=5, min_samples_split=200, max_leafs=10, bins=4)
obj_3.fit(X, y)
obj_3.print_tree(obj_3.tree)
print(obj_3.leafs_cnt, obj_3.leafs_sum, sep="\n")

variance <= -0.10864999999999991
    skewness <= 6.2704249999999995
        curtosis <= 1.7235
            skewness <= 4.139049999999999
                leaf_left = 1.0
                leaf_right = 0.8461538461538461
            skewness <= -0.35642499999999977
                variance <= -1.4119474999999997
                    leaf_left = 1.0
                    leaf_right = 0.8541666666666666
                leaf_right = 0.4074074074074074
        leaf_right = 0.3333333333333333
    variance <= 1.6263400000000001
        curtosis <= -1.8004750000000005
            leaf_left = 0.797752808988764
            leaf_right = 0.1827956989247312
        curtosis <= -1.698975
            leaf_left = 0.05917159763313609
            leaf_right = 0.0
10
5.480781359107884


In [11]:
# Deeper entropy tree with 10 histogram bins per feature.
obj_4 = MyTreeClf(max_depth=10, min_samples_split=40, max_leafs=21, bins=10)
obj_4.fit(X, y)
obj_4.print_tree(obj_4.tree)
print(obj_4.leafs_cnt, obj_4.leafs_sum, sep="\n")

variance <= 1.2780400000000007
    skewness <= 4.934189999999997
        variance <= -0.049580000000000624
            curtosis <= 6.692949999999999
                curtosis <= 2.1901399999999995
                    leaf_left = 1.0
                    skewness <= -0.5094400000000006
                        leaf_left = 1.0
                        leaf_right = 0.7272727272727273
                skewness <= -4.2987400000000004
                    variance <= -0.6744020000000006
                        leaf_left = 1.0
                        leaf_right = 0.3333333333333333
                    leaf_right = 0.0
            curtosis <= 0.21858000000000022
                variance <= 1.1318273
                    leaf_left = 1.0
                    leaf_right = 0.875
                curtosis <= 2.3008480000000002
                    leaf_left = 0.43478260869565216
                    leaf_right = 0.0
        variance <= -3.71542
            curtosis <= 1.244285
                leaf_left = 1.0


In [12]:
# Entropy tree with 6 histogram bins per feature and a budget of 30 leaves.
obj_5 = MyTreeClf(max_depth=15, min_samples_split=20, max_leafs=30, bins=6)
obj_5.fit(X, y)
obj_5.print_tree(obj_5.tree)

print(obj_5.leafs_cnt, obj_5.leafs_sum, sep="\n")

variance <= -0.10864999999999903
    skewness <= 8.497483333333335
        variance <= -2.3243400000000003
            variance <= -3.0628666666666664
                leaf_left = 1.0
                skewness <= 1.5516000000000023
                    leaf_left = 1.0
                    leaf_right = 0.9473684210526315
            skewness <= 5.181099999999999
                curtosis <= 5.586333333333334
                    variance <= -0.4844833333333336
                        leaf_left = 1.0
                        curtosis <= 2.0558333333333327
                            leaf_left = 1.0
                            leaf_right = 0.6
                    skewness <= -4.4252
                        entropy <= -1.3625
                            leaf_left = 0.5
                            leaf_right = 1.0
                        leaf_right = 0.0
                leaf_right = 0.0
        variance <= -4.735386666666667
            leaf_left = 1.0
            leaf_right = 0.0
    variance <= 

In [None]:
# Sum of the leaf values that `obj` accumulated in `leafs_sum` while building its tree.
print(obj.leafs_sum)