In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, IncrementalPCA, SparsePCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm
from scipy.spatial.distance import cosine
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy import sparse
import pickle
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
def get_vectors(data, vect_type='count', **voc_params):
    if vect_type == 'count':
        vectorizer = CountVectorizer(**voc_params)
    elif vect_type == 'tfidf':
            vectorizer = TfidfVectorizer(min_df=2)
    data_vect = vectorizer.fit_transform(data.fillna('')).todense()
    scaler = StandardScaler()
    data_vect = scaler.fit_transform(data_vect)
    return data_vect

def make_pca(data_vect):
    pca = PCA()
    pca.fit(data_vect)
    n_c = (pca.explained_variance_ratio_.cumsum() < 0.95).sum()
    pca = PCA(n_components=n_c)
    data_vect = pca.fit_transform(data_vect)
    return data_vect

def get_devs(df, n_vecs):
    df_list = []
    for i in tqdm(df['group_id'].unique(), total=129):
        df_i = df[df['group_id'] == i]
        vect_i = np.array(df_i.loc[:, [f'{j}_f' for j in range(n_vecs)]])
        mean_i = np.mean(vect_i, axis=0)
        df_i['dev'] = np.apply_along_axis(lambda x: cosine(x, mean_i), axis=1, arr=vect_i)
        df_list.append(df_i)
    df_with_dev = pd.concat(df_list)
    df = df_with_dev.drop([f'{i}_f' for i in range(n_vecs)], 1)
    return df

def show_hist(df, column='dev'):
    plt.hist(df[df['target'] == 0][column], bins=100, normed=True, label='Out of group')
    plt.hist(df[df['target'] == 1][column], bins=100, normed=True, alpha=0.7, label='In group')
    plt.title('dev distribution')
    plt.legend()

def get_th(proba, y_test):
    scores = []
    ths = np.arange(0.1, 0.9, 0.1)
    for i in ths:
        scores.append(f1_score(y_test, (proba > i)[:, 1]))
    ind = scores.index(max(scores))
    th = np.arange(0.1, 0.9, 0.1)[ind]
    score = f1_score((proba > th)[:, 1], y_test)
    return score, th

In [3]:
df = pd.read_csv('data/train_groups.csv')

In [4]:
titles = pd.read_csv('data/clean_titles.csv')
vect = get_vectors(titles['clean_title'])
titles = titles.join(pd.DataFrame(vect, columns=[f'{i}_f' for i in range(vect.shape[1])]))
df_count = pd.merge(df, titles, how='left', on='doc_id')
df_count = get_devs(df_count, titles.shape[1] - 2)

HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




In [5]:
voc_params = {'min_df': 5, 'max_df': 0.8}
headers = pd.read_csv('data/all_headers.tsv', sep='\t')
vect = get_vectors(headers['headers'], **voc_params)
vect = pd.DataFrame(vect, columns=[f'{i}_f' for i in range(vect.shape[1])])
vect['doc_id'] = headers['doc_id']
df_headers = pd.merge(df, vect, how='left', on='doc_id')
df_headers = get_devs(df_headers, vect.shape[1] - 1)

HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




In [6]:
df_count.head()

Unnamed: 0,pair_id,group_id,doc_id,target,clean_title,dev
0,1,1,15731,0,ваз замена подшипник ступица нива,0.336816
1,2,1,14829,0,ваз опт сочи сравнивать цена купить потребител...,0.789518
2,3,1,15764,0,купить ступица лада калина трансмиссия переход...,0.782316
3,4,1,17669,0,классика,0.975004
4,5,1,14852,0,ступица нива замена подшипник свой рука,0.417249


In [7]:
df_headers.head()

Unnamed: 0,pair_id,group_id,doc_id,target,dev
0,1,1,15731,0,0.53927
1,2,1,14829,0,0.901806
2,3,1,15764,0,0.693755
3,4,1,17669,0,0.965767
4,5,1,14852,0,0.567833


In [8]:
df_count = df_count.drop(['clean_title'], 1)

In [9]:
df = pd.merge(df_count, df_headers, how='left', on='pair_id')

In [11]:
df = df.drop(['doc_id_x', 'group_id_x', 'target_x'], 1)
df.head()

Unnamed: 0,pair_id,dev_x,group_id_y,doc_id_y,target_y,dev_y
0,1,0.336816,1,15731,0,0.53927
1,2,0.789518,1,14829,0,0.901806
2,3,0.782316,1,15764,0,0.693755
3,4,0.975004,1,17669,0,0.965767
4,5,0.417249,1,14852,0,0.567833


In [14]:
columns = list(df.columns)
columns[2] = 'group_id'
columns[3] = 'doc_id'
columns[4] = 'target'
df.columns = columns

In [16]:
df_for = pd.read_csv('/Users/michelle/Downloads/fich_9_coins_1_avg.csv')

In [17]:
df_for.head()

Unnamed: 0.1,Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,target
0,0,-0.566789,-0.58968,-0.610203,-0.606029,-0.604782,-0.602268,-0.599013,-0.595933,-0.596133,-0.622124,-0.497732,0
1,1,-0.510514,-0.520343,-0.54297,-0.553366,-0.563432,-0.561703,-0.557273,-0.553168,-0.547611,-0.54344,-0.572021,0
2,2,-0.629817,-0.659016,-0.663028,-0.664768,-0.667337,-0.663663,-0.661059,-0.658347,-0.658856,-0.660861,-0.46335,0
3,3,-0.533774,-0.550567,-0.54297,-0.539188,-0.535866,-0.529909,-0.525686,-0.539298,-0.539327,-0.536177,-0.616226,0
4,4,-0.536776,-0.550567,-0.550654,-0.543239,-0.547528,-0.546354,-0.617063,-0.61327,-0.611518,-0.610019,-0.559742,0


In [23]:
df_for = df_for.drop(['target', 'Unnamed: 0'], 1)

In [24]:
df = df.join(df_for)

In [25]:
df.head()

Unnamed: 0,pair_id,dev_x,group_id,doc_id,target,dev_y,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10
0,1,0.336816,1,15731,0,0.53927,-0.566789,-0.58968,-0.610203,-0.606029,-0.604782,-0.602268,-0.599013,-0.595933,-0.596133,-0.622124,-0.497732
1,2,0.789518,1,14829,0,0.901806,-0.510514,-0.520343,-0.54297,-0.553366,-0.563432,-0.561703,-0.557273,-0.553168,-0.547611,-0.54344,-0.572021
2,3,0.782316,1,15764,0,0.693755,-0.629817,-0.659016,-0.663028,-0.664768,-0.667337,-0.663663,-0.661059,-0.658347,-0.658856,-0.660861,-0.46335
3,4,0.975004,1,17669,0,0.965767,-0.533774,-0.550567,-0.54297,-0.539188,-0.535866,-0.529909,-0.525686,-0.539298,-0.539327,-0.536177,-0.616226
4,5,0.417249,1,14852,0,0.567833,-0.536776,-0.550567,-0.550654,-0.543239,-0.547528,-0.546354,-0.617063,-0.61327,-0.611518,-0.610019,-0.559742


In [26]:
def linear_model_cross_val_score(model, df, train_subset=['dev'], cv=5):
    indices = df['group_id'].unique()
    result = []
    ths = 0
    for i in range(cv):
        train_ids = np.random.choice(indices, indices.shape[0] // 2, replace=False)
        test_ids = indices[~np.isin(indices, train_ids)]
        train_data = df[df['group_id'].isin(train_ids)]
        test_data = df[df['group_id'].isin(test_ids)]
        model.fit(train_data.loc[:, train_subset].fillna(0), train_data['target'])
        proba = model.predict_proba(test_data.loc[:, train_subset].fillna(0))
        score, th = get_th(proba, test_data['target'])
        ths += th
        result.append(score)
    return sum(result) / len(result), ths / len(result)

In [30]:
train_subset = list(df.columns)
train_subset.remove('pair_id')
train_subset.remove('group_id')
train_subset.remove('doc_id')
train_subset.remove('target')

In [42]:
model = LogisticRegression()
print(linear_model_cross_val_score(model, df, train_subset=train_subset))

(0.7221851288412935, 0.30000000000000004)


In [53]:
model = LogisticRegression(class_weight='balanced')
print(linear_model_cross_val_score(model, df, train_subset=train_subset, cv=10))

(0.7317063182798285, 0.55)


In [109]:
model = LGBMClassifier(class_weight='balanced', n_estimators=10000, max_depth=1, learning_rate=0.001)
print(linear_model_cross_val_score(model, df, train_subset=train_subset, cv=5))

(0.7445082277441145, 0.54)


In [129]:
df_test_for = pd.read_csv('/Users/michelle/Downloads/fich_9_coins_1_avg_test.csv')

In [130]:
df_test_for.head()

Unnamed: 0.1,Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9
0,0,-0.659096,-0.670561,-0.671301,-0.677275,-0.693355,-0.690898,-0.689171,-0.697756,-0.695778,-0.510075
1,1,-0.626125,-0.630312,-0.627589,-0.623952,-0.625164,-0.62562,-0.621028,-0.617285,-0.616641,-0.593345
2,2,-0.568243,-0.564069,-0.563359,-0.565016,-0.557947,-0.555322,-0.552884,-0.54952,-0.547261,-0.673663
3,3,-0.691334,-0.707455,-0.710553,-0.713759,-0.715761,-0.713996,-0.712918,-0.711521,-0.70987,-0.495901
4,4,-0.409252,-0.407267,-0.417058,-0.441532,-0.437151,-0.44184,-0.453766,-0.447873,-0.438855,-0.403387


In [131]:
df_for.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10
0,-0.566789,-0.58968,-0.610203,-0.606029,-0.604782,-0.602268,-0.599013,-0.595933,-0.596133,-0.622124,-0.497732
1,-0.510514,-0.520343,-0.54297,-0.553366,-0.563432,-0.561703,-0.557273,-0.553168,-0.547611,-0.54344,-0.572021
2,-0.629817,-0.659016,-0.663028,-0.664768,-0.667337,-0.663663,-0.661059,-0.658347,-0.658856,-0.660861,-0.46335
3,-0.533774,-0.550567,-0.54297,-0.539188,-0.535866,-0.529909,-0.525686,-0.539298,-0.539327,-0.536177,-0.616226
4,-0.536776,-0.550567,-0.550654,-0.543239,-0.547528,-0.546354,-0.617063,-0.61327,-0.611518,-0.610019,-0.559742


In [113]:
df_test = pd.read_csv('data/test_groups.csv')

In [115]:
df_test_count = pd.merge(df_test, titles, how='left', on='doc_id')
df_test_count = get_devs(df_test_count, titles.shape[1] - 2)

HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




In [119]:
df_test_headers = pd.merge(df_test, vect, how='left', on='doc_id')
df_test_headers = get_devs(df_test_headers, vect.shape[1] - 1)

HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




In [120]:
df_test_count = df_test_count.drop(['clean_title'], 1)

In [121]:
df_test = pd.merge(df_test_count, df_test_headers, how='left', on='pair_id')

In [123]:
df_test = df_test.drop(['doc_id_x', 'group_id_x'], 1)

In [124]:
columns = list(df_test.columns)
columns[2] = 'group_id'
columns[3] = 'doc_id'
df_test.columns = columns

In [125]:
df_test_for = df_test_for.drop(['Unnamed: 0'], 1)

In [126]:
df_test = df_test.join(df_test_for)

In [127]:
df_test.head()

Unnamed: 0,pair_id,dev_x,group_id,doc_id,dev_y,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9
0,11691,0.778879,130,6710,0.548846,-0.659096,-0.670561,-0.671301,-0.677275,-0.693355,-0.690898,-0.689171,-0.697756,-0.695778,-0.510075
1,11692,0.93098,130,4030,0.94902,-0.626125,-0.630312,-0.627589,-0.623952,-0.625164,-0.62562,-0.621028,-0.617285,-0.616641,-0.593345
2,11693,0.588307,130,5561,0.720632,-0.568243,-0.564069,-0.563359,-0.565016,-0.557947,-0.555322,-0.552884,-0.54952,-0.547261,-0.673663
3,11694,0.511715,130,4055,0.436483,-0.691334,-0.707455,-0.710553,-0.713759,-0.715761,-0.713996,-0.712918,-0.711521,-0.70987,-0.495901
4,11695,0.885726,130,4247,0.937866,-0.409252,-0.407267,-0.417058,-0.441532,-0.437151,-0.44184,-0.453766,-0.447873,-0.438855,-0.403387


In [135]:
df1 = df.drop(['f9'], 1)

In [137]:
train_subset.remove('f9')

In [158]:
model = LGBMClassifier(class_weight='balanced', n_estimators=10000, max_depth=1, learning_rate=0.001)
print(linear_model_cross_val_score(model, df1, train_subset=train_subset, cv=5))

(0.7526514843692647, 0.56)


In [159]:
proba = model.predict_proba(df_test.drop(['pair_id', 'group_id', 'doc_id'], 1))[:, 1]

In [160]:
predict = (proba > 0.6)

In [164]:
df_test['target'] = predict
df_test['target'] = df_test['target'].apply(lambda x: int(x))

In [166]:
df_test.loc[:, ['pair_id', 'target']].to_csv('predictions/all_30_05.csv', index=False)

In [171]:
model_test = LGBMClassifier(class_weight='balanced', n_estimators=10000, max_depth=1, learning_rate=0.001)
print(linear_model_cross_val_score(model_test, df1, train_subset=train_subset, cv=5))

(0.7432408467413085, 0.6)


In [174]:
model_test = LGBMClassifier(class_weight='balanced', n_estimators=10000, max_depth=1, learning_rate=0.001)
model_test.fit(df1.drop(['pair_id', 'group_id', 'doc_id', 'target'], 1), df1['target'])

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=1.0, importance_type='split',
               learning_rate=0.001, max_depth=1, min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=10000,
               n_jobs=-1, num_leaves=31, objective=None, random_state=None,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [179]:
df1.head()

Unnamed: 0,pair_id,dev_x,group_id,doc_id,target,dev_y,f0,f1,f2,f3,f4,f5,f6,f7,f8,f10
0,1,0.336816,1,15731,0,0.53927,-0.566789,-0.58968,-0.610203,-0.606029,-0.604782,-0.602268,-0.599013,-0.595933,-0.596133,-0.497732
1,2,0.789518,1,14829,0,0.901806,-0.510514,-0.520343,-0.54297,-0.553366,-0.563432,-0.561703,-0.557273,-0.553168,-0.547611,-0.572021
2,3,0.782316,1,15764,0,0.693755,-0.629817,-0.659016,-0.663028,-0.664768,-0.667337,-0.663663,-0.661059,-0.658347,-0.658856,-0.46335
3,4,0.975004,1,17669,0,0.965767,-0.533774,-0.550567,-0.54297,-0.539188,-0.535866,-0.529909,-0.525686,-0.539298,-0.539327,-0.616226
4,5,0.417249,1,14852,0,0.567833,-0.536776,-0.550567,-0.550654,-0.543239,-0.547528,-0.546354,-0.617063,-0.61327,-0.611518,-0.559742


In [180]:
df_test.head()

Unnamed: 0,pair_id,dev_x,group_id,doc_id,dev_y,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,target
0,11691,0.778879,130,6710,0.548846,-0.659096,-0.670561,-0.671301,-0.677275,-0.693355,-0.690898,-0.689171,-0.697756,-0.695778,-0.510075,1
1,11692,0.93098,130,4030,0.94902,-0.626125,-0.630312,-0.627589,-0.623952,-0.625164,-0.62562,-0.621028,-0.617285,-0.616641,-0.593345,0
2,11693,0.588307,130,5561,0.720632,-0.568243,-0.564069,-0.563359,-0.565016,-0.557947,-0.555322,-0.552884,-0.54952,-0.547261,-0.673663,1
3,11694,0.511715,130,4055,0.436483,-0.691334,-0.707455,-0.710553,-0.713759,-0.715761,-0.713996,-0.712918,-0.711521,-0.70987,-0.495901,1
4,11695,0.885726,130,4247,0.937866,-0.409252,-0.407267,-0.417058,-0.441532,-0.437151,-0.44184,-0.453766,-0.447873,-0.438855,-0.403387,0


In [181]:
proba = model_test.predict_proba(df_test.drop(['pair_id', 'group_id', 'doc_id', 'target'], 1))[:, 1]
predict = (proba > 0.6)
df_test['target'] = predict
df_test['target'] = df_test['target'].apply(lambda x: int(x))
df_test.loc[:, ['pair_id', 'target']].to_csv('predictions/all_30_05_full_learn.csv', index=False)

# New features

In [185]:
test = test.drop(['Unnamed: 0'], 1)

In [186]:
train = train.drop(['Unnamed: 0'], 1)

In [195]:
train = train.drop(['target'], 1)

In [253]:
df_train = pd.merge(df_count, df_headers, how='left', on='pair_id')
df_train = df_train.drop(['doc_id_x', 'group_id_x', 'target_x'], 1)
columns = list(df_train.columns)
columns[2] = 'group_id'
columns[3] = 'doc_id'
columns[4] = 'target'
df_train.columns = columns
df_train.head()
df_train = df_train.join(train)
df_train.head()

Unnamed: 0,pair_id,dev_x,group_id,doc_id,target,dev_y,f0,f1,f2,f3,...,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20
0,1,0.336816,1,15731,0,0.53927,-0.566789,-0.58968,-0.610203,-0.606029,...,-0.627503,-0.628299,-0.635232,-0.634167,-0.63751,-0.635761,-0.634948,-0.636714,-0.634179,-0.497732
1,2,0.789518,1,14829,0,0.901806,-0.510514,-0.520343,-0.54297,-0.553366,...,-0.545351,-0.543262,-0.552359,-0.555025,-0.552531,-0.550301,-0.549199,-0.551895,-0.571576,-0.572021
2,3,0.782316,1,15764,0,0.693755,-0.629817,-0.659016,-0.663028,-0.664768,...,-0.662891,-0.661799,-0.660225,-0.666361,-0.666293,-0.666583,-0.664961,-0.6645,-0.66399,-0.46335
3,4,0.975004,1,17669,0,0.965767,-0.533774,-0.550567,-0.54297,-0.539188,...,-0.533976,-0.556146,-0.557621,-0.572463,-0.570349,-0.568514,-0.569207,-0.567981,-0.564123,-0.616226
4,5,0.417249,1,14852,0,0.567833,-0.536776,-0.550567,-0.550654,-0.543239,...,-0.612336,-0.608972,-0.607607,-0.607339,-0.607356,-0.606341,-0.606365,-0.606004,-0.605859,-0.559742


In [201]:
df_test = pd.read_csv('data/test_groups.csv')
df_test_count = pd.merge(df_test, titles, how='left', on='doc_id')
df_test_count = get_devs(df_test_count, titles.shape[1] - 2)
df_test_headers = pd.merge(df_test, vect, how='left', on='doc_id')
df_test_headers = get_devs(df_test_headers, vect.shape[1] - 1)
df_test_count = df_test_count.drop(['clean_title'], 1)
df_test = pd.merge(df_test_count, df_test_headers, how='left', on='pair_id')
df_test = df_test.drop(['doc_id_x', 'group_id_x'], 1)
columns = list(df_test.columns)
columns[2] = 'group_id'
columns[3] = 'doc_id'
df_test.columns = columns
df_test = df_test.join(test)
df_test.head()

HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




Unnamed: 0,pair_id,dev_x,group_id,doc_id,dev_y,f0,f1,f2,f3,f4,...,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20
0,11691,0.778879,130,6710,0.548846,-0.659096,-0.670561,-0.671301,-0.677275,-0.693355,...,-0.693079,-0.694578,-0.694276,-0.69319,-0.695121,-0.697131,-0.696816,-0.69521,-0.696421,-0.510075
1,11692,0.93098,130,4030,0.94902,-0.626125,-0.630312,-0.627589,-0.623952,-0.625164,...,-0.618814,-0.616187,-0.6202,-0.622339,-0.620243,-0.622012,-0.618846,-0.615725,-0.612686,-0.593345
2,11693,0.588307,130,5561,0.720632,-0.568243,-0.564069,-0.563359,-0.565016,-0.557947,...,-0.543389,-0.538983,-0.552195,-0.550245,-0.561863,-0.559844,-0.555412,-0.552406,-0.549541,-0.673663
3,11694,0.511715,130,4055,0.436483,-0.691334,-0.707455,-0.710553,-0.713759,-0.715761,...,-0.708164,-0.708831,-0.707635,-0.706863,-0.706543,-0.707493,-0.706067,-0.707335,-0.707403,-0.495901
4,11695,0.885726,130,4247,0.937866,-0.409252,-0.407267,-0.417058,-0.441532,-0.437151,...,-0.426189,-0.41902,-0.413757,-0.412272,-0.437489,-0.431623,-0.453654,-0.447324,-0.456197,-0.403387


In [202]:
train_subset = list(df_train.columns)

In [205]:
train_subset.remove('pair_id')
train_subset.remove('group_id')
train_subset.remove('doc_id')
train_subset.remove('target')

In [230]:
def linear_model_cross_val_score(model, df, train_subset=['dev'], cv=5):
    indices = df['group_id'].unique()
    result = []
    ths = 0
    kf = KFold(n_splits=cv, shuffle=True)
    
    for train_ids, test_ids in kf.split(indices):
#         train_ids = np.random.choice(indices, indices.shape[0] // 2, replace=False)
#         test_ids = indices[~np.isin(indices, train_ids)]
        train_data = df[df['group_id'].isin(train_ids)]
        test_data = df[df['group_id'].isin(test_ids)]
        model.fit(train_data.loc[:, train_subset].fillna(0), train_data['target'])
        proba = model.predict_proba(test_data.loc[:, train_subset].fillna(0))
        score, th = get_th(proba, test_data['target'])
        ths += th
        result.append(score)
    return sum(result) / len(result), ths / len(result)

In [237]:
model = LGBMClassifier(class_weight='balanced', n_estimators=50000, max_depth=1, learning_rate=0.001)
score, th = linear_model_cross_val_score(model, df_train, train_subset=train_subset, cv=5)
print(score, th)

0.7370199823028958 0.56


In [223]:
proba = model.predict_proba(df_test.drop(['pair_id', 'group_id', 'doc_id'], 1))[:, 1]
predict = (proba > 0.56)
df_test['target'] = predict
df_test['target'] = df_test['target'].apply(lambda x: int(x))
df_test.loc[:, ['pair_id', 'target']].to_csv('predictions/20f_30_05.csv', index=False)

In [229]:
model = LGBMClassifier(class_weight='balanced', n_estimators=10000, max_depth=1, learning_rate=0.001)
model.fit(df_train.drop(['target', 'doc_id', 'pair_id', 'group_id'], 1), df_train['target'])
proba = model.predict_proba(df_test.drop(['pair_id', 'group_id', 'doc_id', 'target'], 1))[:, 1]
predict = (proba > 0.56)
df_test['target'] = predict
df_test['target'] = df_test['target'].apply(lambda x: int(x))
df_test.loc[:, ['pair_id', 'target']].to_csv('predictions/20f_30_05.csv', index=False)

In [238]:
# сделать общий scaler

In [248]:
df_dist = pd.read_csv('data/dist_titles.csv')

In [254]:
df_dist = df_dist.drop(['group_id', 'doc_id', 'target', 'count'], 1)

In [255]:
df_dist.head()

Unnamed: 0,pair_id,mean,std,min,25%,50%,75%,max
0,1,0.761555,0.250863,0.062504,0.626651,0.834145,1.000496,1.003004
1,2,0.925562,0.128857,0.452345,0.88077,1.000874,1.001798,1.016542
2,3,0.911651,0.151603,0.439112,0.860085,1.000562,1.001146,1.010765
3,4,0.999378,0.004346,0.970347,0.999705,1.000198,1.000562,1.001958
4,5,0.79238,0.264225,0.025196,0.620202,0.900059,1.001198,1.004416


In [256]:
df_train = pd.merge(df_train, df_dist, how='left', on='pair_id')

In [257]:
df_train.head()

Unnamed: 0,pair_id,dev_x,group_id,doc_id,target,dev_y,f0,f1,f2,f3,...,f18,f19,f20,mean,std,min,25%,50%,75%,max
0,1,0.336816,1,15731,0,0.53927,-0.566789,-0.58968,-0.610203,-0.606029,...,-0.636714,-0.634179,-0.497732,0.761555,0.250863,0.062504,0.626651,0.834145,1.000496,1.003004
1,2,0.789518,1,14829,0,0.901806,-0.510514,-0.520343,-0.54297,-0.553366,...,-0.551895,-0.571576,-0.572021,0.925562,0.128857,0.452345,0.88077,1.000874,1.001798,1.016542
2,3,0.782316,1,15764,0,0.693755,-0.629817,-0.659016,-0.663028,-0.664768,...,-0.6645,-0.66399,-0.46335,0.911651,0.151603,0.439112,0.860085,1.000562,1.001146,1.010765
3,4,0.975004,1,17669,0,0.965767,-0.533774,-0.550567,-0.54297,-0.539188,...,-0.567981,-0.564123,-0.616226,0.999378,0.004346,0.970347,0.999705,1.000198,1.000562,1.001958
4,5,0.417249,1,14852,0,0.567833,-0.536776,-0.550567,-0.550654,-0.543239,...,-0.606004,-0.605859,-0.559742,0.79238,0.264225,0.025196,0.620202,0.900059,1.001198,1.004416


In [266]:
df_train.shape

(11690, 34)

In [258]:
scaler = StandardScaler()
data_train = df_train.drop(['pair_id', 'group_id', 'target'], 1)
meta_train = df_train.loc[:, ['pair_id', 'group_id', 'target']]

In [260]:
data_train = scaler.fit_transform(data_train)

In [268]:
df_train_norm = meta_train.join(pd.DataFrame(data_train))

In [269]:
df_train_norm.head()

Unnamed: 0,pair_id,group_id,target,0,1,2,3,4,5,6,...,21,22,23,24,25,26,27,28,29,30
0,1,1,0,-2.427159,0.225391,-1.476334,-0.566789,-0.58968,-0.610203,-0.606029,...,-0.636714,-0.634179,-0.497732,-1.186846,1.662441,-1.294528,-1.315542,-0.865767,0.410772,-0.283876
1,2,1,0,-0.000276,0.11817,0.506058,-0.510514,-0.520343,-0.54297,-0.553366,...,-0.551895,-0.571576,-0.572021,0.127278,0.261169,-0.130815,0.002057,0.522894,0.429853,0.45993
2,3,1,0,-0.038885,0.229314,-0.631595,-0.629817,-0.659016,-0.663028,-0.664768,...,-0.6645,-0.66399,-0.46335,0.015812,0.522412,-0.170316,-0.105194,0.520295,0.420293,0.142497
3,4,1,0,0.99409,0.455762,0.855804,-0.533774,-0.550567,-0.54297,-0.539188,...,-0.567981,-0.564123,-0.616226,0.718734,-1.168873,1.415468,0.618732,0.517261,0.411741,-0.34134
4,5,1,0,-1.995969,0.120904,-1.32015,-0.536776,-0.550567,-0.550654,-0.543239,...,-0.606004,-0.605859,-0.559742,-0.939854,1.815907,-1.405894,-1.348977,-0.316777,0.421055,-0.206302


In [270]:
train_subset = list(df_train_norm.drop(['pair_id', 'group_id', 'target'], 1).columns)
print(train_subset)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]


In [280]:
model = LogisticRegression(class_weight='balanced')
score, th = linear_model_cross_val_score(model, df_train_norm, train_subset=train_subset, cv=10)
print(score, th)

0.7334039094556503 0.49000000000000005


In [284]:
model = LGBMClassifier(class_weight='balanced', n_estimators=10000, max_depth=1, learning_rate=0.001)
score, th = linear_model_cross_val_score(model, df_train_norm, train_subset=train_subset, cv=5)
print(score, th)

0.74454099797529 0.5800000000000001


In [291]:
model = LGBMClassifier(class_weight='balanced', n_estimators=100000, max_depth=3, learning_rate=0.0001)
score, th = linear_model_cross_val_score(model, df_train_norm, train_subset=train_subset, cv=5)
print(score, th)

0.7448123485118308 0.5800000000000001


In [290]:
model = LGBMClassifier(class_weight='balanced', n_estimators=100000, max_depth=4, learning_rate=0.0001)
score, th = linear_model_cross_val_score(model, df_train_norm, train_subset=train_subset, cv=5)
print(score, th)

0.741443320713749 0.56


In [292]:
df_test.head()

Unnamed: 0,pair_id,dev_x,group_id,doc_id,dev_y,f0,f1,f2,f3,f4,...,f12,f13,f14,f15,f16,f17,f18,f19,f20,target
0,11691,0.778879,130,6710,0.548846,-0.659096,-0.670561,-0.671301,-0.677275,-0.693355,...,-0.694578,-0.694276,-0.69319,-0.695121,-0.697131,-0.696816,-0.69521,-0.696421,-0.510075,1
1,11692,0.93098,130,4030,0.94902,-0.626125,-0.630312,-0.627589,-0.623952,-0.625164,...,-0.616187,-0.6202,-0.622339,-0.620243,-0.622012,-0.618846,-0.615725,-0.612686,-0.593345,0
2,11693,0.588307,130,5561,0.720632,-0.568243,-0.564069,-0.563359,-0.565016,-0.557947,...,-0.538983,-0.552195,-0.550245,-0.561863,-0.559844,-0.555412,-0.552406,-0.549541,-0.673663,1
3,11694,0.511715,130,4055,0.436483,-0.691334,-0.707455,-0.710553,-0.713759,-0.715761,...,-0.708831,-0.707635,-0.706863,-0.706543,-0.707493,-0.706067,-0.707335,-0.707403,-0.495901,1
4,11695,0.885726,130,4247,0.937866,-0.409252,-0.407267,-0.417058,-0.441532,-0.437151,...,-0.41902,-0.413757,-0.412272,-0.437489,-0.431623,-0.453654,-0.447324,-0.456197,-0.403387,0


In [293]:
df_test_dist = pd.read_csv('data/test_dist_titles.csv')
df_test_dist.head()

Unnamed: 0,pair_id,group_id,doc_id,count,mean,std,min,25%,50%,75%,max
0,11691,130,6710,97.0,0.924379,0.170746,0.407721,0.997128,1.000808,1.001945,1.007153
1,11692,130,4030,97.0,0.990702,0.041365,0.665101,0.994925,1.001045,1.001948,1.007048
2,11693,130,5561,97.0,0.894727,0.206106,0.27746,0.854345,1.00147,1.002502,1.019193
3,11694,130,4055,97.0,0.872997,0.247342,0.105951,0.819164,1.000056,1.000967,1.00547
4,11695,130,4247,97.0,0.977477,0.084223,0.510155,0.995918,1.000524,1.001139,1.004164


In [296]:
df_test_dist = df_test_dist.drop(['group_id', 'doc_id', 'count'], 1)

In [297]:
df_test_all = pd.merge(df_test, df_test_dist, how='left', on='pair_id')

In [298]:
df_test_all.head()

Unnamed: 0,pair_id,dev_x,group_id,doc_id,dev_y,f0,f1,f2,f3,f4,...,f19,f20,target,mean,std,min,25%,50%,75%,max
0,11691,0.778879,130,6710,0.548846,-0.659096,-0.670561,-0.671301,-0.677275,-0.693355,...,-0.696421,-0.510075,1,0.924379,0.170746,0.407721,0.997128,1.000808,1.001945,1.007153
1,11692,0.93098,130,4030,0.94902,-0.626125,-0.630312,-0.627589,-0.623952,-0.625164,...,-0.612686,-0.593345,0,0.990702,0.041365,0.665101,0.994925,1.001045,1.001948,1.007048
2,11693,0.588307,130,5561,0.720632,-0.568243,-0.564069,-0.563359,-0.565016,-0.557947,...,-0.549541,-0.673663,1,0.894727,0.206106,0.27746,0.854345,1.00147,1.002502,1.019193
3,11694,0.511715,130,4055,0.436483,-0.691334,-0.707455,-0.710553,-0.713759,-0.715761,...,-0.707403,-0.495901,1,0.872997,0.247342,0.105951,0.819164,1.000056,1.000967,1.00547
4,11695,0.885726,130,4247,0.937866,-0.409252,-0.407267,-0.417058,-0.441532,-0.437151,...,-0.456197,-0.403387,0,0.977477,0.084223,0.510155,0.995918,1.000524,1.001139,1.004164


In [302]:
meta_test = df_test_all.loc[:, ['pair_id', 'group_id', 'doc_id']]
data_test = df_test_all.drop(['pair_id', 'group_id', 'doc_id'], 1)

In [303]:
data_test_norm = scaler.transform(data_test)
data_test_norm = meta_test.join(pd.DataFrame(data_test_norm))

In [304]:
data_test_norm

Unnamed: 0,pair_id,group_id,doc_id,0,1,2,3,4,5,6,...,21,22,23,24,25,26,27,28,29,30
0,11691,130,6710,-0.057315,-1.644493,-8.029148,-0.670561,-0.671301,-0.677275,-0.693355,...,-0.696421,-0.510075,1.000000e+00,0.117801,0.742282,-0.264023,0.605369,0.522338,0.432018,-0.055930
1,11692,130,4030,0.758085,-1.644445,-7.848860,-0.630312,-0.627589,-0.623952,-0.625164,...,-0.612686,-0.593345,-9.287505e-15,0.649215,-0.743700,0.504280,0.593949,0.524312,0.432064,-0.061705
2,11693,130,5561,-1.078947,-1.644473,-7.532356,-0.564069,-0.563359,-0.565016,-0.557947,...,-0.549541,-0.673663,1.000000e+00,-0.119790,1.148397,-0.652864,-0.134956,0.527852,0.440178,0.605545
3,11694,130,4055,-1.489548,-1.644506,-8.205429,-0.707455,-0.710553,-0.713759,-0.715761,...,-0.707403,-0.495901,1.000000e+00,-0.293906,1.622003,-1.164832,-0.317368,0.516080,0.417669,-0.148411
4,11695,130,4247,0.515480,-1.644447,-6.662970,-0.407267,-0.417058,-0.441532,-0.437151,...,-0.456197,-0.403387,-9.287505e-15,0.543251,-0.251459,0.041752,0.599097,0.519972,0.420197,-0.220125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16622,28313,309,16637,0.169037,-1.644451,-7.868892,-0.639536,-0.644539,-0.652017,-0.650492,...,-0.653867,-0.546724,-9.287505e-15,0.193968,-0.714590,0.737720,0.109600,0.004227,-0.100021,-0.333367
16623,28314,309,16759,-2.430150,-1.644489,-7.460241,-0.559877,-0.560683,-0.554726,-0.576456,...,-0.572878,-0.614049,1.000000e+00,-2.454890,1.612971,-1.171533,-2.678672,-2.537749,-2.789744,-0.302380
16624,28315,309,15358,-1.743852,-1.644476,-7.199826,-0.522144,-0.545517,-0.539758,-0.531644,...,-0.522087,-0.627179,1.000000e+00,-1.806522,1.059256,-0.615674,-1.886199,-1.906577,-2.132365,-0.352432
16625,28316,309,17287,-2.552707,-1.644508,-7.063609,-0.503696,-0.553546,-0.553791,-0.555998,...,-0.537187,-0.684327,1.000000e+00,-2.592465,1.972965,-1.372656,-2.425885,-2.399547,-2.558576,-0.250099


In [305]:
df_train_norm

Unnamed: 0,pair_id,group_id,target,0,1,2,3,4,5,6,...,21,22,23,24,25,26,27,28,29,30
0,1,1,0,-2.427159,0.225391,-1.476334,-0.566789,-0.589680,-0.610203,-0.606029,...,-0.636714,-0.634179,-0.497732,-1.186846,1.662441,-1.294528,-1.315542,-0.865767,0.410772,-0.283876
1,2,1,0,-0.000276,0.118170,0.506058,-0.510514,-0.520343,-0.542970,-0.553366,...,-0.551895,-0.571576,-0.572021,0.127278,0.261169,-0.130815,0.002057,0.522894,0.429853,0.459930
2,3,1,0,-0.038885,0.229314,-0.631595,-0.629817,-0.659016,-0.663028,-0.664768,...,-0.664500,-0.663990,-0.463350,0.015812,0.522412,-0.170316,-0.105194,0.520295,0.420293,0.142497
3,4,1,0,0.994090,0.455762,0.855804,-0.533774,-0.550567,-0.542970,-0.539188,...,-0.567981,-0.564123,-0.616226,0.718734,-1.168873,1.415468,0.618732,0.517261,0.411741,-0.341340
4,5,1,0,-1.995969,0.120904,-1.320150,-0.536776,-0.550567,-0.550654,-0.543239,...,-0.606004,-0.605859,-0.559742,-0.939854,1.815907,-1.405894,-1.348977,-0.316777,0.421055,-0.206302
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11685,11686,129,0,0.499871,1.525951,1.160268,0.453658,0.610369,0.645117,0.705477,...,0.881261,0.911518,-0.157429,,,,,,,
11686,11687,129,0,0.970103,1.426814,1.160268,0.281833,0.408583,0.444381,0.465456,...,0.599017,0.625333,-0.576151,,,,,,,
11687,11688,129,0,0.970103,1.410766,1.160268,1.127453,1.333954,1.376026,1.393132,...,1.605152,1.637414,1.137417,,,,,,,
11688,11689,129,0,0.038682,1.670141,-0.880121,0.035725,0.100126,0.112062,0.146441,...,0.341634,0.361506,-0.182756,,,,,,,


In [299]:
model = LGBMClassifier(class_weight='balanced', n_estimators=100000, max_depth=3, learning_rate=0.0001)
model.fit(df_train_norm.drop(['target', 'pair_id', 'group_id'], 1), df_train['target'])

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=1.0, importance_type='split',
               learning_rate=0.0001, max_depth=3, min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=100000,
               n_jobs=-1, num_leaves=31, objective=None, random_state=None,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [306]:
proba = model.predict_proba(df_test_all.drop(['pair_id', 'group_id', 'doc_id'], 1))[:, 1]
predict = (proba > th)
df_test['target'] = predict
df_test['target'] = df_test['target'].apply(lambda x: int(x))
df_test.loc[:, ['pair_id', 'target']].to_csv('predictions/20f_dist_30_05.csv', index=False)