In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, IncrementalPCA, SparsePCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm
from scipy.spatial.distance import cosine
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy import sparse
import pickle
from lightgbm import LGBMClassifier
from itertools import combinations
from scipy.spatial.distance import squareform
from scipy.spatial.distance import pdist
import scipy

import warnings
warnings.filterwarnings("ignore")

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
def get_vectors(data, vect_type='count', **voc_params):
    """Vectorize a text column and standardize the resulting term features.

    Parameters
    ----------
    data : pandas.Series
        Raw texts; missing values are replaced with empty strings.
    vect_type : {'count', 'tfidf'}, default 'count'
        Which scikit-learn vectorizer to build.
    **voc_params
        Keyword arguments forwarded to the vectorizer (e.g. ``min_df``,
        ``max_df``). For ``'tfidf'`` the previous default ``min_df=2`` is
        kept unless ``min_df`` is passed explicitly.

    Returns
    -------
    numpy.ndarray
        Dense (n_documents, n_terms) array, each column scaled by
        ``StandardScaler``.

    Raises
    ------
    ValueError
        If ``vect_type`` is not a supported name.
    """
    # Validate up front so a typo fails clearly instead of with an
    # UnboundLocalError on `vectorizer`.
    if vect_type not in ('count', 'tfidf'):
        raise ValueError(
            f"vect_type must be 'count' or 'tfidf', got {vect_type!r}"
        )
    if vect_type == 'count':
        vectorizer = CountVectorizer(**voc_params)
    else:
        # Caller-supplied parameters win over the historical min_df=2 default.
        vectorizer = TfidfVectorizer(**{'min_df': 2, **voc_params})
    # toarray() yields a plain ndarray; todense() returns np.matrix, which
    # scikit-learn deprecated and newer releases refuse as estimator input.
    data_vect = vectorizer.fit_transform(data.fillna('')).toarray()
    return StandardScaler().fit_transform(data_vect)

def make_pca(data_vect, var_threshold=0.95):
    """Project the data onto its leading principal components.

    A full PCA is fitted first to read the explained-variance profile; the
    number of kept components is the count of leading components whose
    cumulative explained-variance ratio is still below ``var_threshold``.

    Parameters
    ----------
    data_vect : array-like of shape (n_samples, n_features)
        Dense feature matrix.
    var_threshold : float, default 0.95
        Cumulative explained-variance cut-off used to pick the component
        count.

    Returns
    -------
    numpy.ndarray
        The data transformed by a PCA refitted with the selected number of
        components (always at least one).
    """
    full_pca = PCA()
    full_pca.fit(data_vect)
    cumulative_ratio = full_pca.explained_variance_ratio_.cumsum()
    # If the first component alone reaches the threshold the count is 0,
    # which would produce an empty projection; keep one component instead.
    n_c = max(int((cumulative_ratio < var_threshold).sum()), 1)
    reduced_pca = PCA(n_components=n_c)
    return reduced_pca.fit_transform(data_vect)

def get_devs(df, n_vecs):
    """Add each row's cosine distance to its group's mean feature vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'group_id' column and the feature columns
        '0_f' ... f'{n_vecs - 1}_f'.
    n_vecs : int
        Number of feature columns to read.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` concatenated group by group (groups in order of
        first appearance, original index labels kept), with a new 'dev'
        column and with the feature columns dropped.
    """
    feature_cols = [f'{j}_f' for j in range(n_vecs)]
    group_ids = df['group_id'].unique()
    df_list = []
    # Progress total follows the data instead of a fixed group count.
    for i in tqdm(group_ids, total=len(group_ids)):
        # Copy so the new column is not written into a slice of `df`.
        df_i = df[df['group_id'] == i].copy()
        vect_i = np.array(df_i.loc[:, feature_cols])
        mean_i = np.mean(vect_i, axis=0)
        df_i['dev'] = np.apply_along_axis(lambda x: cosine(x, mean_i), axis=1, arr=vect_i)
        df_list.append(df_i)
    if not df_list:
        # pd.concat raises on an empty list; return an empty frame with the
        # same column layout instead.
        empty = df.drop(columns=feature_cols)
        empty['dev'] = pd.Series(dtype=float)
        return empty
    df_with_dev = pd.concat(df_list)
    # Keyword form: the positional axis argument is not accepted by pandas 2.
    return df_with_dev.drop(columns=feature_cols)

def show_hist(df, column='dev'):
    """Overlay density-normalized histograms of ``column`` split by 'target'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'target' column (values 0 and 1 are plotted) and
        ``column``.
    column : str, default 'dev'
        Column whose distribution is drawn; missing values are plotted as 0.
    """
    out_of_group = df.loc[df['target'] == 0, column].fillna(0)
    in_group = df.loc[df['target'] == 1, column].fillna(0)
    # `density=True` replaces the `normed=True` keyword that matplotlib removed.
    plt.hist(out_of_group, bins=100, density=True, label='Out of group')
    plt.hist(in_group, bins=100, density=True, alpha=0.7, label='In group')
    # Title follows the plotted column; for the default it is 'dev distribution'.
    plt.title(f'{column} distribution')
    plt.legend()

def get_th(proba, y_test):
    """Choose the probability threshold that maximizes F1 on the given labels.

    Parameters
    ----------
    proba : numpy.ndarray
        2-D array of class probabilities; column 1 is compared against each
        threshold.
    y_test : array-like
        True binary labels.

    Returns
    -------
    tuple
        ``(score, th)``: the best F1 score and the threshold from
        ``np.arange(0.1, 0.9, 0.1)`` that achieved it (first one on ties).
    """
    ths = np.arange(0.1, 0.9, 0.1)
    positive_proba = proba[:, 1]
    # f1_score expects (y_true, y_pred); keep that order everywhere.
    scores = [f1_score(y_test, positive_proba > th) for th in ths]
    # argmax returns the first maximum, matching list.index(max(...)).
    best = int(np.argmax(scores))
    return scores[best], ths[best]

# Preparing train distance features

In [7]:
# Training table; later cells merge it on 'doc_id' and on 'pair_id'.
df_train = pd.read_csv('data/train_groups.csv')

In [5]:
# Title table; its 'clean_title' column is vectorized below and 'doc_id' is the merge key.
titles = pd.read_csv('data/clean_titles.csv')

In [6]:
# Vocabulary limits passed to the count vectorizer, then the standardized
# title term matrix.
voc_params = dict(min_df=5, max_df=0.8)
vect_titles = get_vectors(
    titles['clean_title'],
    **voc_params,
)

In [9]:
# Attach the term features as columns '0_f', '1_f', ... next to each title row.
# The cell rebinds `titles` to its own widened version, so it is meant to run once per load.
titles = titles.join(pd.DataFrame(vect_titles, columns=[f'{i}_f' for i in range(vect_titles.shape[1])]))

In [10]:
# Left-join the title features onto every training row by document id.
df_train_titles = df_train.merge(titles, how='left', on='doc_id')

In [14]:
# Per-document summary statistics of the cosine distances between a title
# and the other titles of the same group.
# Feature columns are discovered from the frame (the '<n>_f' columns) and
# the groups are taken from the data, mirroring the test-set cell, instead
# of relying on a fixed feature count and a fixed id range.
feature_cols = [col for col in df_train_titles.columns if str(col).endswith('_f')]
train_groups = df_train_titles['group_id'].unique()
data = []
for group_id in tqdm(train_groups, total=train_groups.shape[0]):
    sample = df_train_titles[df_train_titles['group_id'] == group_id]
    summary = np.array(sample.loc[:, feature_cols])
    pairwise = pd.DataFrame(
        squareform(pdist(summary, metric='cosine')),
        columns = sample['pair_id'],
        index = sample['pair_id']
    )
    # Zeros (the diagonal and any exactly identical pairs) become NaN so
    # that describe() leaves them out of the statistics.
    pairwise = pairwise.replace(0, np.nan)
    # One row of describe() statistics per 'pair_id'.
    pairwise = pairwise.describe().T
    pairwise = pairwise.reset_index()
    data.append(pairwise)
df_train_data = pd.concat(data)

HBox(children=(IntProgress(value=0, max=128), HTML(value='')))




In [17]:
# Attach the per-pair distance statistics to the training rows.
df_train_dist = df_train.merge(df_train_data, how='left', on='pair_id')

In [19]:
# Persist the train distance features (index not written).
df_train_dist.to_csv('data/dist_train.csv', index=False)

# Preparing test distance features

In [20]:
# Test table; later cells merge it on 'doc_id' and on 'pair_id'.
df_test = pd.read_csv('data/test_groups.csv')

In [21]:
# Left-join the title features onto every test row by document id.
df_test_titles = df_test.merge(titles, how='left', on='doc_id')

In [22]:
# Per-document summary statistics of the cosine distances between a title
# and the other titles of the same group (test set).
# Feature columns are discovered from the frame (the '<n>_f' columns)
# instead of relying on a fixed feature count, and are computed once
# outside the loop.
feature_cols = [col for col in df_test_titles.columns if str(col).endswith('_f')]
data = []
groups = df_test_titles['group_id'].unique()
for group_id in tqdm(groups, total=groups.shape[0]):
    sample = df_test_titles[df_test_titles['group_id'] == group_id]
    summary = np.array(sample.loc[:, feature_cols])
    pairwise = pd.DataFrame(
        squareform(pdist(summary, metric='cosine')),
        columns = sample['pair_id'],
        index = sample['pair_id']
    )
    # Zeros (the diagonal and any exactly identical pairs) become NaN so
    # that describe() leaves them out of the statistics.
    pairwise = pairwise.replace(0, np.nan)
    # One row of describe() statistics per 'pair_id'.
    pairwise = pairwise.describe().T
    pairwise = pairwise.reset_index()
    data.append(pairwise)
df_test_data = pd.concat(data)

HBox(children=(IntProgress(value=0, max=180), HTML(value='')))




In [24]:
# Attach the per-pair distance statistics to the test rows.
df_test_dist = df_test.merge(df_test_data, how='left', on='pair_id')

# Preparing train deviation features

In [28]:
# Cosine distance of every train title vector from its group's mean vector.
# NOTE(review): `titles.shape[1] - 2` presumes `titles` holds exactly two non-feature columns — confirm.
df_train_devs = get_devs(df_train_titles, titles.shape[1] - 2)

HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




In [31]:
# Drop the columns that the later merge on 'pair_id' would otherwise duplicate.
# `columns=` is used because pandas 2 no longer accepts a positional axis.
df_train_devs = df_train_devs.drop(columns=['group_id', 'doc_id', 'target', 'clean_title'])

In [40]:
# Combine deviation and distance features; only pair ids present in both are kept.
df_train_dist_devs = df_train_devs.merge(df_train_dist, how='inner', on='pair_id')

In [42]:
# Persist the combined train features (index not written).
df_train_dist_devs.to_csv('data/dist_devs_train.csv', index=False)

# Preparing test deviation features

In [46]:
# Cosine distance of every test title vector from its group's mean vector.
# NOTE(review): `titles.shape[1] - 2` presumes `titles` holds exactly two non-feature columns — confirm.
df_test_devs = get_devs(df_test_titles, titles.shape[1] - 2)

HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




In [48]:
# Drop the columns that the later merge on 'pair_id' would otherwise duplicate.
# `columns=` is used because pandas 2 no longer accepts a positional axis.
df_test_devs = df_test_devs.drop(columns=['group_id', 'doc_id', 'clean_title'])

In [49]:
# Combine deviation and distance features; only pair ids present in both are kept.
df_test_dist_devs = df_test_devs.merge(df_test_dist, how='inner', on='pair_id')

In [50]:
# Persist the combined test features (index not written).
df_test_dist_devs.to_csv('data/dist_devs_test.csv', index=False)

# Adding train word intersection

In [55]:
# Externally prepared feature file for the train set (word-intersection features, per the section heading).
# NOTE(review): absolute, user-specific path — the notebook cannot run elsewhere until this file
# lives under a project-relative directory.
df_train_int = pd.read_csv('/Users/michelle/Downloads/fich_20_coins_1_avg.csv')

In [56]:
# Remove the saved-index column and the duplicate label before joining.
# `columns=` is used because pandas 2 no longer accepts a positional axis.
df_train_int = df_train_int.drop(columns=['Unnamed: 0', 'target'])

In [57]:
# No `on=` key is given, so DataFrame.join pairs rows by index label.
# NOTE(review): assumes both frames have the same row order and index — confirm.
df_train_all = df_train_dist_devs.join(df_train_int)

# Adding test word intersection

In [60]:
# Externally prepared feature file for the test set (word-intersection features, per the section heading).
# NOTE(review): absolute, user-specific path — the notebook cannot run elsewhere until this file
# lives under a project-relative directory.
df_test_int = pd.read_csv('/Users/michelle/Downloads/fich_20_coins_1_avg_test.csv')

In [61]:
# Remove the saved-index column; `columns=` is used because pandas 2 no
# longer accepts a positional axis.
df_test_int = df_test_int.drop(columns=['Unnamed: 0'])
# No `on=` key is given, so the join pairs rows by index label.
# NOTE(review): assumes both frames have the same row order and index — confirm.
df_test_all = df_test_dist_devs.join(df_test_int)

# Model work

In [67]:
# Split each table into model features and identifying metadata.
# 'count' (one of the describe() statistics) is excluded from the features.
# `columns=` is used because pandas 2 no longer accepts a positional axis.
df_train_data = df_train_all.drop(columns=['pair_id', 'group_id', 'doc_id', 'target', 'count'])
df_train_metadata = df_train_all.loc[:, ['pair_id', 'group_id', 'doc_id', 'target']]
df_test_data = df_test_all.drop(columns=['pair_id', 'group_id', 'doc_id', 'count'])
df_test_metadata = df_test_all.loc[:, ['pair_id', 'group_id', 'doc_id']]

In [73]:
def linear_model_cross_val_score(model, df, train_subset=['dev'], cv=5, random_state=None):
    """Group-wise K-fold cross-validation with per-fold F1 threshold tuning.

    Whole groups (by 'group_id') are assigned to folds, so rows of one group
    never appear on both the train and the test side of a fold.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``.
    df : pandas.DataFrame
        Must contain 'group_id', 'target' and the columns in ``train_subset``.
    train_subset : list of str, default ['dev']
        Feature columns; missing values are filled with 0 before fitting
        and predicting.
    cv : int, default 5
        Number of folds.
    random_state : int or None, default None
        Seed forwarded to ``KFold`` to make the shuffled split repeatable.

    Returns
    -------
    tuple
        ``(mean_score, mean_threshold)`` averaged over the folds, where each
        fold's score and threshold come from ``get_th``.
    """
    feature_cols = list(train_subset)
    group_ids = df['group_id'].unique()
    fold_scores = []
    fold_ths = []
    kf = KFold(n_splits=cv, shuffle=True, random_state=random_state)

    for train_pos, test_pos in kf.split(group_ids):
        # KFold yields positions into `group_ids`, not the ids themselves;
        # map them back before filtering rows.
        train_groups = group_ids[train_pos]
        test_groups = group_ids[test_pos]
        train_data = df[df['group_id'].isin(train_groups)]
        test_data = df[df['group_id'].isin(test_groups)]
        model.fit(train_data.loc[:, feature_cols].fillna(0), train_data['target'])
        proba = model.predict_proba(test_data.loc[:, feature_cols].fillna(0))
        score, th = get_th(proba, test_data['target'])
        fold_scores.append(score)
        fold_ths.append(th)
    return sum(fold_scores) / len(fold_scores), sum(fold_ths) / len(fold_ths)

In [74]:
# Feature list = every column except identifiers, the label and 'count'.
# list.remove raises if one of these columns is unexpectedly absent.
train_subset = list(df_train_all.columns)
for excluded in ('pair_id', 'group_id', 'doc_id', 'target', 'count'):
    train_subset.remove(excluded)

In [75]:
# Cross-validated F1 and averaged decision threshold on all features.
model = LGBMClassifier(
    class_weight='balanced',
    n_estimators=100000,
    max_depth=3,
    learning_rate=0.0001,
)
score, th = linear_model_cross_val_score(
    model,
    df_train_all,
    train_subset=train_subset,
    cv=5,
)
print(score, th)

0.7503680460283206 0.56


In [76]:
# Refit on the full training table for the final predictions.
# NOTE(review): linear_model_cross_val_score fits on `.fillna(0)` features,
# while this fit passes the columns unfilled — confirm the difference is intended.
model = LGBMClassifier(
    class_weight='balanced',
    n_estimators=100000,
    max_depth=3,
    learning_rate=0.0001,
)
model.fit(df_train_all[train_subset], df_train_all['target'])

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=1.0, importance_type='split',
               learning_rate=0.0001, max_depth=3, min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=100000,
               n_jobs=-1, num_leaves=31, objective=None, random_state=None,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [77]:
# Positive-class probability on the test features, cut at the threshold `th`
# from cross-validation, stored as 0/1 and written as the submission file.
proba = model.predict_proba(df_test_all[train_subset])[:, 1]
predict = proba > th
df_test_all['target'] = predict.astype('int64')
df_test_all[['pair_id', 'target']].to_csv('predictions/all_30_05.csv', index=False)

# Scaler

In [81]:
# Standardize the features: statistics are fitted on train only and reused
# for test.
scaler = StandardScaler()
df_train_data_norm = scaler.fit_transform(df_train_data)
df_test_data_norm = scaler.transform(df_test_data)
# The scaler returns bare arrays. Re-wrap them with the index of the frame
# they came from so the index-aligned join pairs each scaled row with its
# own metadata even when that index is not 0..n-1. Columns stay 0, 1, 2, ...
df_train_all_norm = df_train_metadata.join(pd.DataFrame(df_train_data_norm, index=df_train_data.index))
df_test_all_norm = df_test_metadata.join(pd.DataFrame(df_test_data_norm, index=df_test_data.index))

In [85]:
# Scaled feature columns = everything except identifiers and the label.
# `columns=` is used because pandas 2 no longer accepts a positional axis.
train_subset_norm = list(df_train_all_norm.drop(columns=['pair_id', 'group_id', 'doc_id', 'target']).columns)

In [91]:
# Cross-validated F1 and averaged decision threshold on the scaled features.
model = LGBMClassifier(
    class_weight='balanced',
    n_estimators=100000,
    max_depth=3,
    learning_rate=0.0001,
)
score, th = linear_model_cross_val_score(
    model,
    df_train_all_norm,
    train_subset=train_subset_norm,
    cv=5,
)
print(score, th)

0.7474013767351569 0.56


In [94]:
# Refit on all scaled training rows, predict the test set, cut the
# positive-class probability at `th`, and write the 0/1 submission file.
model = LGBMClassifier(
    class_weight='balanced',
    n_estimators=100000,
    max_depth=3,
    learning_rate=0.0001,
)
model.fit(df_train_all_norm[train_subset_norm], df_train_all_norm['target'])
proba = model.predict_proba(df_test_all_norm[train_subset_norm])[:, 1]
predict = proba > th
df_test_all_norm['target'] = predict.astype('int64')
df_test_all_norm[['pair_id', 'target']].to_csv('predictions/all_norm_30_05.csv', index=False)

# Header distance features (train)

In [98]:
# Same pipeline as for the titles, applied to the 'headers' text column:
# load, vectorize and standardize, attach '<n>_f' columns, join onto train rows.
headers = pd.read_csv('data/all_headers.tsv', sep='\t')
voc_params = dict(min_df=5, max_df=0.8)
vect_headers = get_vectors(headers['headers'], **voc_params)
headers = headers.join(
    pd.DataFrame(
        vect_headers,
        columns=[f'{i}_f' for i in range(vect_headers.shape[1])],
    )
)
df_train_headers = df_train.merge(headers, how='left', on='doc_id')

In [99]:
# Release the dense title matrix to free kernel memory.
del vect_titles

In [102]:
# Release the wide title-feature frames to free kernel memory.
del df_train_titles
del df_test_titles

In [104]:
# Persist the assembled feature tables (index not written).
# Note: if the prediction cell has run, df_test_all also carries the predicted 'target' column it assigned.
df_test_all.to_csv('data/all_test_30_05.csv', index=False)
df_train_all.to_csv('data/all_train_30_05.csv', index=False)