In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, IncrementalPCA, SparsePCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm
from scipy.spatial.distance import cosine
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy import sparse
import pickle
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
titles = pd.read_csv('data/clean_titles.csv')

In [3]:
df = pd.read_csv('data/train_groups.csv')

In [4]:
df = pd.merge(df, titles, how='left', on='doc_id')

In [5]:
df.head()

Unnamed: 0,pair_id,group_id,doc_id,target,clean_title
0,1,1,15731,0,ваз замена подшипник ступица нива
1,2,1,14829,0,ваз опт сочи сравнивать цена купить потребител...
2,3,1,15764,0,купить ступица лада калина трансмиссия переход...
3,4,1,17669,0,классика
4,5,1,14852,0,ступица нива замена подшипник свой рука


In [7]:
df['clean_title'].apply(lambda x: len(str(x).split())).max()

89

In [None]:
titles.join(pd.DataFrame(data_vect_count, columns=[f'{i}_f' for i in range(data_vect_count.shape[1])]))

In [18]:
dfs = []
for group_id in df['group_id'].unique():
    sample = df[df['group_id'] == group_id]
    vectorizer = CountVectorizer()
    vect = vectorizer.fit_transform(sample['clean_title'].fillna('')).todense()
    vect = pd.DataFrame(vect, columns=[f'{i}_f' for i in range(vect.shape[1])])
    sample = sample.join(vect)
    dfs.append(sample)

In [20]:
dfs[0].head()

Unnamed: 0,pair_id,group_id,doc_id,target,clean_title,0_f,1_f,2_f,3_f,4_f,...,268_f,269_f,270_f,271_f,272_f,273_f,274_f,275_f,276_f,277_f
0,1,1,15731,0,ваз замена подшипник ступица нива,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,14829,0,ваз опт сочи сравнивать цена купить потребител...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,15764,0,купить ступица лада калина трансмиссия переход...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,17669,0,классика,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1,14852,0,ступица нива замена подшипник свой рука,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
def get_devs(df, n_vecs):
    df_list = []
    for i in tqdm(df['group_id'].unique(), total=129):
        df_i = df[df['group_id'] == i]
        vect_i = np.array(df_i.loc[:, [f'{j}_f' for j in range(n_vecs)]])
        mean_i = np.mean(vect_i, axis=0)
        df_i['dev'] = np.apply_along_axis(lambda x: cosine(x, mean_i), axis=1, arr=vect_i)
        df_list.append(df_i)
    df_with_dev = pd.concat(df_list)
    df = df_with_dev.drop([f'{i}_f' for i in range(n_vecs)], 1)
    return df

In [30]:
new_dfs = []
for df in dfs:
    n_vecs = int(df.columns[-1].split('_')[0]) + 1
    df = get_devs(df, n_vecs=n_vecs)
    new_dfs.append(df)

HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




In [32]:
df = pd.concat(new_dfs)

In [43]:
class ThresholdModel(BaseEstimator, ClassifierMixin):
    
    def fit(self, X, y):
        scores = []
        ths = np.arange(0, 1, 0.01)
        for th in ths:
            scores.append(f1_score(y, (X < th)))
        self.train_score = max(scores)
        ind = scores.index(self.train_score)
        self.th_ = ths[ind]
        return self
    
    def predict(self, X):
        return X < self.th_

In [45]:
def cross_val_score(model, df, train_subset=['dev'], cv=5):
    result = []
    indices = df['group_id'].unique()
    for i in range(cv):
        train_ids = np.random.choice(indices, indices.shape[0] // 2, replace=False)
        test_ids = indices[~np.isin(indices, train_ids)]
        train_data = df[df['group_id'].isin(train_ids)]
        test_data = df[df['group_id'].isin(test_ids)]
        model.fit(np.array(train_data.loc[:, train_subset].fillna(0)).reshape(-1, 1), train_data['target'])
        score = f1_score(model.predict(np.array(test_data.loc[:, train_subset].fillna(0)).reshape(-1, 1)), test_data['target'])
        result.append(score)
    return sum(result) / len(result), model.th_

def linear_model_cross_val_score(model, df, train_subset=['dev'], cv=5):
    indices = df['group_id'].unique()
    result = []
    ths = 0
    for i in range(cv):
        train_ids = np.random.choice(indices, indices.shape[0] // 2, replace=False)
        test_ids = indices[~np.isin(indices, train_ids)]
        train_data = df[df['group_id'].isin(train_ids)]
        test_data = df[df['group_id'].isin(test_ids)]
        model.fit(train_data.loc[:, train_subset].fillna(0), train_data['target'])
        proba = model.predict_proba(test_data.loc[:, train_subset].fillna(0))
        score, th = get_th(proba, test_data['target'])
        ths += th
        result.append(score)
    return sum(result) / len(result), ths / len(result)

def get_th(proba, y_test):
    scores = []
    ths = np.arange(0.1, 0.9, 0.1)
    for i in ths:
        scores.append(f1_score(y_test, (proba > i)[:, 1]))
    ind = scores.index(max(scores))
    th = np.arange(0.1, 0.9, 0.1)[ind]
    score = f1_score((proba > th)[:, 1], y_test)
    return score, th

In [46]:
model = ThresholdModel()
print('My simple model:', cross_val_score(model, df))

model = LogisticRegression()
print('Logistic regression:', linear_model_cross_val_score(model, df))

My simple model: (0.41583094040822727, 0.54)
Logistic regression: (0.455324816544569, 0.1)
