In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
import random
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, RandomizedSearchCV


***Реализация линейной и логистической регрессии из второго домашнего задания***

In [2]:
def batch_generator(X, y, shuffle=True, batch_size=1):
    """
    Yield mini-batches that together make up one pass (epoch) over the data.

    X          - object-feature matrix (must support indexing by an integer
                 index array, e.g. a NumPy array)
    y          - target vector with the same number of rows as X
    shuffle    - whether to visit the samples in a random order
    batch_size - batch size (1 is SGD, > 1 is mini-batch GD)

    Yields (X_batch, y_batch) tuples. Every sample appears in exactly one
    batch; the last batch is shorter when len(y) is not a multiple of
    batch_size. With shuffle=False the batches follow the original row order.

    Raises ValueError (on first iteration) if batch_size is smaller than 1.
    """
    batch_size = int(batch_size)
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    n_samples = len(y)
    # A single permutation per epoch replaces per-batch random sampling, so
    # the shuffle flag is honoured and no sample is skipped or repeated.
    if shuffle:
        order = np.random.permutation(n_samples)
    else:
        order = np.arange(n_samples)

    for start in range(0, n_samples, batch_size):
        indx = order[start:start + batch_size]
        yield (X[indx], y[indx])

def sigmoid(x):
    """
    Compute the logistic sigmoid 1 / (1 + exp(-x)) element-wise.

    x - output of the linear model (scalar or array-like)

    Returns values in [0, 1]: a NumPy scalar for scalar input, otherwise an
    array with the shape of x.

    The exponential is only ever evaluated at -|x|, so inputs of large
    magnitude saturate to 0 or 1 without triggering overflow warnings.
    """
    x = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(x))  # always within [0, 1], never overflows
    # For x >= 0 use 1 / (1 + e^-x); for x < 0 use the algebraically equal
    # e^x / (1 + e^x). Both branches are expressed through `decay`.
    sigm_value_x = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Unwrap 0-d results so scalar input still gives a scalar back.
    return sigm_value_x[()] if sigm_value_x.ndim == 0 else sigm_value_x


class MySGDClassifier(BaseEstimator, ClassifierMixin):
    """
    Linear / logistic regression with an L2 penalty, trained by mini-batch
    gradient descent.

    After fit() the coefficients are stored in ``self.weights`` as a column
    vector whose first entry multiplies the constant 1 column prepended to X
    (the intercept; it is penalised together with the other weights).

    Note that predict() returns raw model outputs - linear scores for
    'lin_reg', sigmoid probabilities for 'log_reg' - rather than class
    labels; callers threshold them themselves.
    """

    def __init__(self, batch_generator, C=1, alpha=0.01, max_epoch=10,
                 model_type='lin_reg', batch_size=1):
        """
        batch_generator - generator function used to create batches; it is
                          called as batch_generator(X, y, True, batch_size)
        C - regularisation coefficient (the penalty is weights^T weights / C)
        alpha - gradient-descent step size
        max_epoch - maximum number of epochs
        model_type - model type, 'lin_reg' or 'log_reg'
        batch_size - batch size used by fit() when none is passed to it
        """

        self.C = C
        self.alpha = alpha
        self.max_epoch = max_epoch
        self.batch_generator = batch_generator
        self.errors_log = {'iter': [], 'loss': []}
        self.model_type = model_type
        # Every constructor argument has to be stored under its own name,
        # otherwise BaseEstimator.get_params() / clone() / repr() break.
        self.batch_size = batch_size

    def calc_loss(self, X_batch, y_batch):
        """
        Compute the regularised loss on one batch.

        X_batch - object-feature matrix of the batch (with the ones column)
        y_batch - column vector of batch targets

        'lin_reg' uses the mean squared error, any other model_type the mean
        log-loss; weights^T weights / C is added in both cases. The result
        has whatever shape the matrix products give (1x1 for column vectors).
        """

        reg = 1 / self.C * self.weights.T @ self.weights
        n_batch = len(y_batch)
        if self.model_type == 'lin_reg':
            prdct = X_batch @ self.weights
            delta = prdct - y_batch
            loss = 1 / n_batch * delta.T @ delta + reg
        else:
            sgm = sigmoid(X_batch @ self.weights)
            # Keep probabilities strictly inside (0, 1): a saturated sigmoid
            # would otherwise give log(0) = -inf and 0 * inf = nan.
            eps = np.finfo(float).eps
            sgm = np.clip(sgm, eps, 1 - eps)
            loss = -(y_batch.T @ np.log(sgm) +
                     (1 - y_batch).T @ np.log(1 - sgm)) / n_batch + reg
        return loss

    def calc_loss_grad(self, X_batch, y_batch):
        """
        Compute the gradient of the regularised loss on one batch.

        X_batch - object-feature matrix of the batch (with the ones column)
        y_batch - column vector of batch targets

        Returns an array shaped like self.weights. The 2 / C * weights term
        in both branches is the gradient of the weights^T weights / C penalty.
        """
        if self.model_type == 'lin_reg':
            prdct = X_batch @ self.weights
            loss_grad = 2 * (X_batch.T @ (prdct - y_batch) /
                             len(y_batch) + 1 / self.C * self.weights)
        else:
            prdct = sigmoid(X_batch @ self.weights)
            loss_grad = (X_batch.T @ (prdct - y_batch) / len(y_batch) +
                         2 / self.C * self.weights)
        return loss_grad

    def update_weights(self, new_grad):
        """Take one gradient-descent step of size alpha; returns self."""
        self.weights -= self.alpha * new_grad
        return self

    def fit(self, X, y, btch_size=None):
        '''
        Train the model.

        X - object-feature matrix (2-D, without the ones column)
        y - target vector
        btch_size - batch size for this call; defaults to the batch_size
                    given to the constructor

        Weights are re-initialised randomly and errors_log is reset on every
        call. Each epoch consumes at most ceil(len(y) / batch size) batches
        from batch_generator, so both finite and endless generators work.

        Raises ValueError for an empty training set, mismatched X / y
        lengths or a batch size smaller than 1. Returns self.
        '''
        X = np.asarray(X)
        y = np.asarray(y)
        if btch_size is None:
            btch_size = self.batch_size
        btch_size = int(btch_size)
        if btch_size < 1:
            raise ValueError("batch size must be a positive integer")
        n_samples = len(y)
        if n_samples == 0:
            raise ValueError("cannot fit on an empty training set")
        if len(X) != n_samples:
            raise ValueError("X and y must have the same number of rows")
        # A batch cannot be larger than the data set itself.
        btch_size = min(btch_size, n_samples)

        ones = np.ones((n_samples, 1))
        X = np.concatenate((ones, X), axis=1)
        # Random initial weights, one per column including the ones column.
        self.weights = np.random.rand(X.shape[1], 1)
        # Start a fresh log so a refit does not mix curves of several runs.
        self.errors_log = {'iter': [], 'loss': []}
        # Ceiling division: number of batches needed to cover the data once.
        batches_per_epoch = -(-n_samples // btch_size)
        for _ in range(self.max_epoch):
            new_epoch_generator = self.batch_generator(X, y, True, btch_size)
            for batch_num, new_batch in enumerate(new_epoch_generator):
                X_batch = new_batch[0]
                y_batch = new_batch[1].reshape(-1, 1)
                batch_grad = self.calc_loss_grad(X_batch, y_batch)
                self.update_weights(batch_grad)
                # Loss is logged after the step, on the batch just used.
                batch_loss = self.calc_loss(X_batch, y_batch)
                self.errors_log['iter'].append(batch_num)
                # Store plain floats so the log can be plotted directly.
                self.errors_log['loss'].append(float(np.squeeze(batch_loss)))
                if batch_num + 1 >= batches_per_epoch:
                    break
        return self

    def predict(self, X):
        '''
        Return raw model outputs for X.

        X - object-feature matrix (2-D, without the ones column)

        Returns a column vector: linear scores, passed through the sigmoid
        when model_type is 'log_reg'. No thresholding is applied here.
        '''
        ones = np.ones((len(X), 1))
        X = np.concatenate((ones, X), axis=1)
        y_hat = X @ self.weights
        if self.model_type == 'log_reg':
            y_hat = sigmoid(y_hat)
        return y_hat

***Загрузка данных и построение признаков (код скопирован у Севы)***

In [12]:
# Build the doc_id -> normalised title lookup from a tab-separated file
# whose first line is a header.
doc_to_title = {}
with open('titles_norm_ru.csv', encoding='utf-8', newline='') as titles_file:
    next(titles_file, None)  # skip the header row (no-op on an empty file)
    for raw_line in titles_file:
        # Everything after the first tab is the title; a line without a tab
        # (or with nothing after it) maps to an empty title.
        id_field, _, title_field = raw_line.strip().partition('\t')
        doc_to_title[int(id_field)] = title_field
print(len(doc_to_title))

28026


In [13]:
train_data = pd.read_csv('train_groups.csv')
# Collect the training documents per group, in file order:
# {group_id: [(doc_id, title, target), ...]}
traingroups_titledata = {}
# Walk the three columns in lockstep instead of building a Series for every
# row with .iloc[i], which is much slower and adds nothing here.
for doc_group, doc_id, target in zip(train_data['group_id'],
                                     train_data['doc_id'],
                                     train_data['target']):
    title = doc_to_title[doc_id]
    traingroups_titledata.setdefault(doc_group, []).append(
        (doc_id, title, target))

In [14]:
# Each document is described by its TOP_K_OVERLAPS largest word overlaps
# with the other titles of the same group, sorted in descending order.
TOP_K_OVERLAPS = 20

y_train = []
X_train = []
groups_train = []
for new_group, docs in traingroups_titledata.items():
    # Tokenise every title once per group rather than once per pair.
    word_sets = [set(title.strip().split()) for _, title, _ in docs]
    for k, (doc_id, title, target_id) in enumerate(docs):
        y_train.append(target_id)
        groups_train.append(new_group)
        words = word_sets[k]
        all_dist = [len(words & words_j)
                    for j, words_j in enumerate(word_sets) if j != k]
        top_overlaps = sorted(all_dist, reverse=True)[:TOP_K_OVERLAPS]
        # Groups with fewer than TOP_K_OVERLAPS + 1 documents would give
        # shorter rows and a ragged array; pad them with zero overlaps.
        top_overlaps += [0] * (TOP_K_OVERLAPS - len(top_overlaps))
        X_train.append(top_overlaps)
X_train = np.array(X_train)
y_train = np.array(y_train)
groups_train = np.array(groups_train)
print(X_train.shape, y_train.shape, groups_train.shape)

(11690, 20) (11690,) (11690,)


In [15]:
test_data = pd.read_csv('test_groups.csv')
# Collect the test documents per group, in file order:
# {group_id: [(doc_id, title), ...]}
testgroups_titledata = {}
# Walk the two columns in lockstep instead of building a Series for every
# row with .iloc[i], which is much slower and adds nothing here.
for doc_group, doc_id in zip(test_data['group_id'], test_data['doc_id']):
    title = doc_to_title[doc_id]
    testgroups_titledata.setdefault(doc_group, []).append((doc_id, title))

In [16]:
# Same features as for the training set: the TOP_K_OVERLAPS largest word
# overlaps with the other titles of the group, in descending order. The
# width must stay equal to the 20 columns used when building X_train.
TOP_K_OVERLAPS = 20

X_test = []
groups_test = []
for new_group, docs in testgroups_titledata.items():
    # Tokenise every title once per group rather than once per pair.
    word_sets = [set(title.strip().split()) for _, title in docs]
    for k, (doc_id, title) in enumerate(docs):
        groups_test.append(new_group)
        words = word_sets[k]
        all_dist = [len(words & words_j)
                    for j, words_j in enumerate(word_sets) if j != k]
        top_overlaps = sorted(all_dist, reverse=True)[:TOP_K_OVERLAPS]
        # Groups with fewer than TOP_K_OVERLAPS + 1 documents would give
        # shorter rows and a ragged array; pad them with zero overlaps.
        top_overlaps += [0] * (TOP_K_OVERLAPS - len(top_overlaps))
        X_test.append(top_overlaps)
X_test = np.array(X_test)
groups_test = np.array(groups_test)
print(X_test.shape, groups_test.shape)

(16627, 20) (16627,)


***Предсказываем***

In [17]:
# Learn the per-feature scaling on the training features, then replace
# X_train with its standardised version (the same scaler is reused later
# for the test features).
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)



In [18]:
# Fix the NumPy seed: the classifier draws its initial weights and its
# batches from np.random, so without it every run gives a different model
# and a different submission file.
np.random.seed(42)

clsfr_log = MySGDClassifier(batch_generator, C=1, alpha=0.1,
                            max_epoch=1000, model_type='log_reg')
clsfr_log.fit(X_train, y_train, 1000)

# NOTE: X_test is overwritten with its scaled version and later code reads
# it in that form. Re-running this cell without rebuilding X_test first
# would scale the features a second time.
X_test = scaler.transform(X_test)
prdct = clsfr_log.predict(X_test)

# predict() returns probabilities as a column vector; threshold at 0.5 and
# flatten to a 1-D vector of 0/1 labels.
prdct = (prdct > 0.5).astype(int).flatten()
# Submission ids are consecutive and start at 11691; derive the end of the
# range from the number of predictions instead of hard-coding it.
# NOTE(review): this assumes the rows of X_test are in id order - confirm,
# or take the ids from the test file if it provides them.
first_pair_id = 11691
nums = list(np.arange(first_pair_id, first_pair_id + len(prdct)))

df = pd.DataFrame({'pair_id': nums, 'target': prdct})
df.to_csv('output_log.csv', index=False)

# F1 on the training data itself (not a held-out estimate).
f1_score(y_train, (clsfr_log.predict(X_train) > 0.5).astype(int).ravel())



0.6760259179265659

In [20]:
# Random-forest baseline on the same (already standardised) features.
clf = RandomForestClassifier(n_estimators=300, max_depth=5, random_state=1)
clf.fit(X_train, y_train)
# F1 on the training data itself (not a held-out estimate).
print(f1_score(y_train, clf.predict(X_train)))

# NOTE: X_test is expected to have been scaled by an earlier cell already.
prdct = clf.predict(X_test)
# Submission ids are consecutive and start at 11691; derive the end of the
# range from the number of predictions instead of hard-coding it.
# NOTE(review): this assumes the rows of X_test are in id order - confirm,
# or take the ids from the test file if it provides them.
first_pair_id = 11691
nums = list(np.arange(first_pair_id, first_pair_id + len(prdct)))

df = pd.DataFrame({'pair_id': nums, 'target': prdct})
df.to_csv('output_random_forest.csv', index=False)


0.6687657430730479
