In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, IncrementalPCA, SparsePCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm
from scipy.spatial.distance import cosine
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy import sparse
import pickle
from lightgbm import LGBMClassifier
from itertools import combinations
from scipy.spatial.distance import squareform
from scipy.spatial.distance import pdist
import scipy

import warnings
warnings.filterwarnings("ignore")

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
def get_vectors(data, vect_type='count', **voc_params):
    """Vectorize a text column and standardize the resulting term features.

    Parameters
    ----------
    data : pandas.Series
        Text documents; missing values are replaced with empty strings
        before vectorizing.
    vect_type : {'count', 'tfidf'}, default 'count'
        ``'count'`` uses ``CountVectorizer``, ``'tfidf'`` uses ``TfidfVectorizer``.
    **voc_params
        Keyword arguments forwarded to the chosen vectorizer. For ``'tfidf'``
        ``min_df=2`` is used unless the caller overrides it.

    Returns
    -------
    Dense document-term matrix after ``StandardScaler`` has been fitted on
    it and applied to it.

    Raises
    ------
    ValueError
        If ``vect_type`` is neither ``'count'`` nor ``'tfidf'``.
    """
    if vect_type == 'count':
        vectorizer = CountVectorizer(**voc_params)
    elif vect_type == 'tfidf':
        # Keep the former hard-coded min_df=2 as a default, but honour voc_params
        # (they used to be silently ignored for tf-idf).
        vectorizer = TfidfVectorizer(**{'min_df': 2, **voc_params})
    else:
        raise ValueError(
            f"Unsupported vect_type {vect_type!r}; expected 'count' or 'tfidf'"
        )
    # toarray() gives a plain ndarray; todense() gives np.matrix, which recent
    # scikit-learn estimators refuse.
    data_vect = vectorizer.fit_transform(data.fillna('')).toarray()
    return StandardScaler().fit_transform(data_vect)

def make_pca(data_vect):
    """Project ``data_vect`` onto its leading principal components.

    A full PCA is fitted first only to read the explained-variance ratios.
    The number of components kept is the count of leading components whose
    cumulative explained-variance ratio is still strictly below 0.95; a second
    PCA limited to that many components is then fitted and applied.

    Returns the transformed data from the second PCA.
    """
    variance_ratios = PCA().fit(data_vect).explained_variance_ratio_
    n_keep = np.sum(np.cumsum(variance_ratios) < 0.95)
    reducer = PCA(n_components=n_keep)
    return reducer.fit_transform(data_vect)

def get_devs(df, n_vecs):
    """Compute each row's cosine distance to the mean vector of its group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``group_id`` column and the feature columns
        ``'0_f'`` ... ``'{n_vecs - 1}_f'``.
    n_vecs : int
        Number of ``'<i>_f'`` feature columns to use.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` concatenated group by group (groups in order of
        first appearance), with the feature columns removed and a ``dev``
        column holding the cosine distance to the group mean.
    """
    feature_cols = [f'{j}_f' for j in range(n_vecs)]
    group_ids = df['group_id'].unique()
    df_list = []
    # The progress total follows the data instead of a hard-coded group count.
    for i in tqdm(group_ids, total=len(group_ids)):
        # Copy the slice so adding 'dev' never writes into a view of ``df``.
        df_i = df[df['group_id'] == i].copy()
        vect_i = np.array(df_i.loc[:, feature_cols])
        mean_i = np.mean(vect_i, axis=0)
        df_i['dev'] = np.apply_along_axis(lambda x: cosine(x, mean_i), axis=1, arr=vect_i)
        df_list.append(df_i)
    if not df_list:
        # pd.concat raises on an empty list; return an empty frame with the same layout.
        return df.drop(columns=feature_cols).assign(dev=pd.Series(dtype=float))
    # ``columns=`` instead of a positional axis, which pandas 2.0 no longer accepts.
    return pd.concat(df_list).drop(columns=feature_cols)

def show_hist(df, column='dev'):
    """Overlay density-normalized histograms of ``column`` for both target classes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``target`` column (0 / 1) and ``column``.
    column : str, default 'dev'
        Column to plot; missing values are plotted as 0.

    Draws on the current matplotlib figure; returns nothing.
    """
    classes = ((0, 'Out of group', None), (1, 'In group', 0.7))
    for target_value, label, alpha in classes:
        values = df.loc[df['target'] == target_value, column].fillna(0)
        # ``normed`` was removed from matplotlib; ``density`` is its replacement.
        plt.hist(values, bins=100, density=True, alpha=alpha, label=label)
    # Title follows the plotted column (still 'dev distribution' by default).
    plt.title(f'{column} distribution')
    plt.legend()

def get_th(proba, y_test):
    """Pick the decision threshold that maximizes F1 on ``y_test``.

    Parameters
    ----------
    proba : 2-D array
        Column 1 is thresholded (presumably the positive-class column of a
        ``predict_proba`` output — confirm against the caller).
    y_test : array-like
        True labels.

    Returns
    -------
    (score, th)
        Best F1 score and the threshold that produced it. Thresholds come
        from ``np.arange(0.1, 0.9, 0.1)``; on ties the lowest threshold wins.
    """
    ths = np.arange(0.1, 0.9, 0.1)
    positive_proba = np.asarray(proba)[:, 1]
    # Always pass (y_true, y_pred) in that order, and score each threshold once.
    scores = [f1_score(y_test, positive_proba > th) for th in ths]
    best = int(np.argmax(scores))
    return scores[best], ths[best]

# Train headers: cosine deviation from the group mean

In [3]:
# Load the training table; later previews show it carries pair_id, group_id, doc_id and target.
df_train = pd.read_csv('data/train_groups.csv')

In [13]:
# Build the header feature table that the rest of the notebook reads as `headers`.
# These steps used to be commented out, so on a fresh kernel `headers` was
# undefined and this cell (and every later one using `headers`) raised NameError.
headers_raw = pd.read_csv('data/all_headers.tsv', sep='\t')
voc_params = {'min_df': 5, 'max_df': 0.8}
vect_headers = get_vectors(headers_raw['headers'], **voc_params)
# One '<i>_f' column per vectorizer feature, aligned with headers_raw by row position.
headers = headers_raw.join(
    pd.DataFrame(vect_headers, columns=[f'{i}_f' for i in range(vect_headers.shape[1])])
)
df_train_headers = pd.merge(df_train, headers, how='left', on='doc_id')

In [5]:
# Cosine deviation of every train document from its group's mean header vector.
# `headers.shape[1] - 2` is the feature count only if `headers` has exactly two
# non-feature columns — TODO confirm.
# NOTE(review): the KeyError saved below this cell mentions 'clean_title', which the
# get_devs defined above never drops; the output looks stale — re-run to confirm.
df_train_headers_devs = get_devs(df_train_headers, headers.shape[1] - 2)

HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




KeyError: "['clean_title'] not found in axis"

In [6]:
# Drop the identifier / text columns so only pair_id and dev remain for the later merge.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
df_train_headers_devs = df_train_headers_devs.drop(columns=['group_id', 'doc_id', 'target', 'headers'])

# Test headers: cosine deviation from the group mean

In [12]:
# Test-side counterpart of the train steps: load the groups, attach the header
# features by doc_id, then compute each document's deviation from its group mean.
df_test = pd.read_csv('data/test_groups.csv')
df_test_headers = df_test.merge(headers, how='left', on='doc_id')
df_test_headers_devs = get_devs(df_test_headers, headers.shape[1] - 2)

In [11]:
# Drop the identifier / text columns (the test table has no target column).
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
df_test_headers_devs = df_test_headers_devs.drop(columns=['group_id', 'doc_id', 'headers'])

# Train headers: pairwise cosine distance statistics

In [14]:
# For every train group: pairwise cosine distances between its documents' header
# vectors, summarized per document with DataFrame.describe().
# Use every '<i>_f' column that is present instead of a hard-coded count of 4222.
train_feature_cols = [
    c for c in df_train_headers.columns
    if str(c).endswith('_f') and str(c)[:-2].isdigit()
]
# Iterate over the group ids actually present; the former range(1, 129) stopped at id 128.
groups = df_train_headers['group_id'].unique()
data = []
for group_id in tqdm(groups, total=groups.shape[0]):
    sample = df_train_headers[df_train_headers['group_id'] == group_id]
    summary = np.array(sample.loc[:, train_feature_cols])
    pairwise = pd.DataFrame(
        squareform(pdist(summary, metric='cosine')),
        columns=sample['pair_id'],
        index=sample['pair_id'],
    )
    # Exact zeros (the diagonal and any identical pairs) are left out of the statistics.
    pairwise = pairwise.replace(0, np.nan)
    # One row of summary statistics per pair_id.
    pairwise = pairwise.describe().T.reset_index()
    data.append(pairwise)
df_train_headers_data = pd.concat(data)

HBox(children=(IntProgress(value=0, max=128), HTML(value='')))




In [15]:
# Attach the per-document distance statistics to the train table by pair_id;
# documents without statistics keep NaN (left join).
df_train_headers_dist = df_train.merge(df_train_headers_data, how='left', on='pair_id')

# Test headers: pairwise cosine distance statistics

In [20]:
# For every test group: pairwise cosine distances between its documents' header
# vectors, summarized per document with DataFrame.describe().
# Use every '<i>_f' column that is present instead of a hard-coded count of 4222.
test_feature_cols = [
    c for c in df_test_headers.columns
    if str(c).endswith('_f') and str(c)[:-2].isdigit()
]
data = []
groups = df_test_headers['group_id'].unique()
for group_id in tqdm(groups, total=groups.shape[0]):
    sample = df_test_headers[df_test_headers['group_id'] == group_id]
    summary = np.array(sample.loc[:, test_feature_cols])
    pairwise = pd.DataFrame(
        squareform(pdist(summary, metric='cosine')),
        columns=sample['pair_id'],
        index=sample['pair_id'],
    )
    # Exact zeros (the diagonal and any identical pairs) are left out of the statistics.
    pairwise = pairwise.replace(0, np.nan)
    # One row of summary statistics per pair_id.
    pairwise = pairwise.describe().T.reset_index()
    data.append(pairwise)
df_test_headers_data = pd.concat(data)

HBox(children=(IntProgress(value=0, max=180), HTML(value='')))




In [21]:
# Attach the per-document distance statistics to the test table by pair_id;
# documents without statistics keep NaN (left join).
df_test_headers_dist = df_test.merge(df_test_headers_data, how='left', on='pair_id')

# Collect all features

In [22]:
# Preview the train deviation features (saved output shows the columns pair_id and dev).
df_train_headers_devs.head()

Unnamed: 0,pair_id,dev
0,1,0.53927
1,2,0.901806
2,3,0.693755
3,4,0.965767
4,5,0.567833


In [23]:
# Preview the train table with the describe() statistics of pairwise distances attached.
df_train_headers_dist.head()

Unnamed: 0,pair_id,group_id,doc_id,target,count,mean,std,min,25%,50%,75%,max
0,1,1,15731,0,95.0,0.71496,0.363963,0.018718,0.21976,0.93926,0.98512,1.019248
1,2,1,14829,0,95.0,0.71496,0.363963,0.018718,0.21976,0.93926,0.98512,1.019248
2,3,1,15764,0,82.0,0.81418,0.312631,0.01784,0.804409,0.967244,0.997562,1.041416
3,4,1,17669,0,82.0,0.81418,0.312631,0.01784,0.804409,0.967244,0.997562,1.041416
4,5,1,14852,0,82.0,0.81418,0.312631,0.01784,0.804409,0.967244,0.997562,1.041416


In [24]:
# Combine the deviation feature with the pairwise-distance statistics, keeping
# only pair_ids present in both tables (inner join).
df_train_headers_all = df_train_headers_devs.merge(df_train_headers_dist, how='inner', on='pair_id')
df_test_headers_all = df_test_headers_devs.merge(df_test_headers_dist, how='inner', on='pair_id')

In [25]:
# Load the previously saved feature tables (presumably title-based features, going
# by the variable names — confirm how these CSVs were produced).
df_test_titles_all = pd.read_csv('data/all_test_30_05.csv')
df_train_titles_all = pd.read_csv('data/all_train_30_05.csv')

In [30]:
# Preview the combined header features for train.
# NOTE(review): the saved output has no group_id / target / doc_id columns, so it was
# produced after the drop cell that follows (In [28]) — execution order is not linear.
df_train_headers_all.head()

Unnamed: 0,pair_id,dev,count,mean,std,min,25%,50%,75%,max
0,1,0.53927,95.0,0.71496,0.363963,0.018718,0.21976,0.93926,0.98512,1.019248
1,2,0.901806,95.0,0.71496,0.363963,0.018718,0.21976,0.93926,0.98512,1.019248
2,3,0.693755,82.0,0.81418,0.312631,0.01784,0.804409,0.967244,0.997562,1.041416
3,4,0.965767,82.0,0.81418,0.312631,0.01784,0.804409,0.967244,0.997562,1.041416
4,5,0.567833,82.0,0.81418,0.312631,0.01784,0.804409,0.967244,0.997562,1.041416


In [28]:
# Remove identifier / label columns from the header features before they are merged
# into another table by pair_id.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
df_train_headers_all = df_train_headers_all.drop(columns=['group_id', 'target', 'doc_id'])
df_test_headers_all = df_test_headers_all.drop(columns=['group_id', 'doc_id'])

In [31]:
# Add the header features to the loaded feature tables by pair_id (left join, so
# every row of the loaded tables is kept).
df_train_all = df_train_titles_all.merge(df_train_headers_all, how='left', on='pair_id')
df_test_all = df_test_titles_all.merge(df_test_headers_all, how='left', on='pair_id')

In [33]:
# Both merged tables carried a `count` column, which the merge suffixed to
# count_x / count_y; drop both.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
df_train_all = df_train_all.drop(columns=['count_x', 'count_y'])
df_test_all = df_test_all.drop(columns=['count_x', 'count_y'])

In [34]:
# Model inputs: every column except the identifiers and the label.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
train_subset = list(df_train_all.drop(columns=['pair_id', 'group_id', 'doc_id', 'target']).columns)

In [None]:
# Cross-validate a class-balanced LightGBM classifier on the selected feature columns
# and print the returned score and threshold.
# NOTE(review): `linear_model_cross_val_score` is neither defined nor imported in the
# visible part of this notebook — confirm it exists before running this cell.
# NOTE(review): this cell was never executed (`In [None]`); 100000 estimators at
# learning_rate=0.0001 may take a long time — confirm the settings are intended.
model = LGBMClassifier(class_weight='balanced', n_estimators=100000, max_depth=3, learning_rate=0.0001)
score, th = linear_model_cross_val_score(model, df_train_all, train_subset=train_subset, cv=5)
print(score, th)