In [None]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import re
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.metrics import pairwise_distances
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb
from copy import deepcopy
from bs4 import BeautifulSoup

In [None]:
# Build the doc_id -> title lookup from the tab-separated titles file.
# Each data row is "<doc_id>\t<title>"; the title part may be missing.
count = 0  # number of documents that have no title at all
doc_to_title = {}
# The encoding is spelled out so the result does not depend on the platform's
# default locale.  NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open('docs_titles.tsv', encoding='utf-8') as f:
    next(f, None)  # skip the header row (tolerates a completely empty file)
    for line in f:
        line = line.strip()
        if not line:
            # A blank line carries no doc_id; int('') would raise ValueError.
            continue
        data = line.split('\t', 1)
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
            count += 1
        else:
            title = data[1]
        doc_to_title[doc_id] = title

print(len(doc_to_title))
print(count)

In [None]:
# One Snowball stemmer per language; both are applied to every token later on.
stemmer_rus, stemmer_eng = (
    SnowballStemmer(language) for language in ("russian", "english")
)
# Russian and English stop words in a single set for O(1) membership tests.
stop_words = set(stopwords.words(["russian", "english"]))

In [None]:
def split_title(title):
    """Tokenise a piece of text into stemmed, stop-word-free tokens.

    The text is lower-cased, every character other than a Latin letter, a
    Cyrillic letter or a digit is treated as a separator, stop words are
    dropped, and each remaining token is passed through the English and then
    the Russian Snowball stemmer.  Stems that are themselves stop words are
    dropped as well.

    Parameters
    ----------
    title : str
        Raw text (a title or a whole document body).

    Returns
    -------
    list of str
        The stemmed tokens in their original order.
    """
    # 'ё' lies outside the а-я range, so the character class below would turn
    # it into a separator and cut words such as "ещё" in two.  Folding it to
    # 'е' first keeps those words intact.
    text = title.lower().replace('ё', 'е')
    tokens = re.sub('[^a-zа-я0-9]', ' ', text).split()
    stems = (
        stemmer_rus.stem(stemmer_eng.stem(token))
        for token in tokens
        if token not in stop_words
    )
    # Second filter: stemming can map a word onto a stop word.
    return [stem for stem in stems if stem not in stop_words]

In [None]:
def titles_to_words(titles_dict):
    """Map every document id to the token list produced by ``split_title``.

    Parameters
    ----------
    titles_dict : dict
        ``{doc_id: title}``.

    Returns
    -------
    dict
        ``{doc_id: list of tokens}`` with the same keys, in the same order.
    """
    return {doc_id: split_title(text) for doc_id, text in titles_dict.items()}

In [None]:
def get_top_n_words(corpus, n=10):
    """Return the ``n`` most frequent tokens of ``corpus``.

    Tokens are produced by a ``CountVectorizer`` with default settings and
    counted over all documents together.

    Parameters
    ----------
    corpus : iterable of str
        The documents.  It is materialised once, so a one-shot iterator is
        handled correctly.
    n : int, default 10
        Maximum number of (token, count) pairs to return.

    Returns
    -------
    list of (str, int)
        Pairs sorted by descending count; an empty list for an empty corpus.

    Raises
    ------
    ValueError
        Propagated from ``CountVectorizer`` when the documents yield no tokens.
    """
    documents = list(corpus)
    if not documents:
        # Nothing to count; the vectorizer would raise on an empty corpus.
        return []
    vectorizer = CountVectorizer()
    # fit_transform tokenises the corpus once instead of twice (fit + transform).
    bag_of_words = vectorizer.fit_transform(documents)
    totals = bag_of_words.sum(axis=0)  # 1 x vocabulary matrix of token counts
    words_freq = [(word, totals[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [None]:
def get_content(ID):
    """Return the ten most frequent stemmed words of a stored HTML page.

    Reads ``content/<ID>.dat``, strips ``<script>`` and ``<style>`` elements,
    extracts the visible text, tokenises it with ``split_title`` and counts the
    tokens with ``get_top_n_words``.

    Parameters
    ----------
    ID : int or str
        Document id used to build the file name.

    Returns
    -------
    list of (str, int)
        Up to ten (token, count) pairs, most frequent first.
    """
    # Explicit encoding keeps the result independent of the platform locale.
    # NOTE(review): assumes the stored pages are UTF-8 — confirm.
    with open("content/{}.dat".format(ID), encoding="utf-8") as f:
        html = f.read()  # one read instead of quadratic line-by-line concatenation
    # Naming the parser avoids bs4's "guessed parser" warning and makes the
    # output the same on every machine, whichever optional parsers are installed.
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style"]):
        tag.extract()
    # Fragments without a <body> element have soup.body == None; fall back to
    # the whole document instead of raising AttributeError.
    root = soup.body if soup.body is not None else soup
    text = root.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    trytext = split_title(text)
    return get_top_n_words(trytext, 10)

In [None]:
class Extractor:
    """Builds per-document cosine-distance features inside document groups.

    On construction every document (title tokens plus the top words of its
    page content) is embedded with TF-IDF.  ``extract`` then describes each
    document of a group by its distances to the closest other documents of
    the same group plus group-level statistics of those distances.
    """

    def __init__(self, doc_to_title):
        """Fit the TF-IDF matrix over all documents.

        Parameters
        ----------
        doc_to_title : dict
            ``{doc_id: title}`` where a title is either a raw string or an
            iterable of tokens.  The ids are expected to be exactly 1..N
            (a gap raises ``KeyError``), because row ``i`` of the matrix must
            correspond to document ``i``.  The dict is not modified.
        """
        self.n_features = 80
        # Private token lists: the caller's dict and its values stay untouched.
        tokens_by_doc = {
            doc_id: self._as_tokens(title) for doc_id, title in doc_to_title.items()
        }
        # Empty placeholder document so that matrix row index == doc_id.
        tokens_by_doc[0] = []
        n_docs = len(tokens_by_doc)
        for doc_id in tqdm(range(1, n_docs)):
            # Append whole words.  (`list += str` would extend the list one
            # character at a time and lose the words.)
            tokens_by_doc[doc_id].extend(item[0] for item in get_content(doc_id))
        self.doc_to_matrix = TfidfVectorizer().fit_transform(
            [' '.join(tokens_by_doc[doc_id]) for doc_id in range(n_docs)])

    @staticmethod
    def _as_tokens(title):
        """Return a fresh token list for a raw string or an iterable of tokens."""
        if isinstance(title, str):
            return title.split()
        return list(title)

    def cosine(self, group):
        """Compute the feature rows for the documents of one group.

        With ``n = n_features // 4`` the columns of every row are:

        * ``[0, n)``    ascending cosine distances to the ``n`` closest other
                        documents of the group (the smallest distance, i.e. the
                        document itself, is skipped);
        * ``[n, 2n)``   mean of the first block over the whole group;
        * ``[2n, 3n)``  standard deviation of the first block over the group;
        * ``[3n, 4n)``  median of the first block over the group.

        Groups with fewer than ``n`` other documents are padded with ``1.0``
        (the cosine distance of two documents sharing no terms) instead of
        failing on a shape mismatch.

        Parameters
        ----------
        group : array-like of int
            Document ids of one group.

        Returns
        -------
        numpy.ndarray of shape (len(group), n_features)
        """
        n = self.n_features // 4
        doc_ids = np.asarray(group)
        X = np.zeros((doc_ids.size, self.n_features), dtype=np.float64)
        distances = pairwise_distances(self.doc_to_matrix[doc_ids], metric='cosine')
        for row, all_dist in enumerate(distances):
            nearest = np.sort(all_dist)[1:n + 1]
            X[row, :nearest.size] = nearest
            X[row, nearest.size:n] = 1.0
        X[:, n:2 * n] = np.mean(X[:, :n], axis=0)
        X[:, 2 * n:3 * n] = np.std(X[:, :n], axis=0)
        X[:, 3 * n:4 * n] = np.median(X[:, :n], axis=0)
        return X

    def extract(self, file):
        """Build the feature matrix for a groups CSV file.

        Parameters
        ----------
        file : str or file-like
            CSV with at least the columns ``group_id`` and ``doc_id`` and
            optionally ``target``.

        Returns
        -------
        X : numpy.ndarray of shape (n_rows, n_features)
            Row ``k`` describes row ``k`` of the file, so features stay
            aligned with the file's own row order whatever the group order.
        y : numpy.ndarray of bool, only when the file has a ``target`` column
            Returned together with ``X`` as ``(X, y)``.
        """
        df = pd.read_csv(file)
        X = np.zeros((df.shape[0], self.n_features), dtype=np.float64)
        doc_ids = df['doc_id'].to_numpy()
        # `.indices` maps each group to the *positions* of its rows, which is
        # what is needed to write the features back to the right rows.
        for positions in df.groupby('group_id').indices.values():
            X[positions] = self.cosine(doc_ids[positions])
        if 'target' in df.columns:
            return X, df['target'].astype(bool).to_numpy()
        return X

In [None]:
# Tokenise the titles, fit the TF-IDF extractor and build scaled feature
# matrices for the train and test splits.
vec = titles_to_words(doc_to_title)
extractor = Extractor(vec)
X_train, y_train = extractor.extract('train_groups.csv')
# The scaler is fitted on the training features only and reused for the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# extract() returns a bare feature matrix when the file has no `target`
# column, so the result must not be unpacked into two names.
X_test = extractor.extract('test_groups.csv')
X_test = scaler.transform(X_test)

In [None]:
# LightGBM training settings used for both validation and the final model.
params = dict(
    learning_rate=0.1,
    objective='binary',
    num_iterations=500,
    max_bin=30,
    num_leaves=7,
    max_depth=20,
    boosting='dart',
)
# Candidate decision thresholds: 30 evenly spaced values from 0.2 to 0.5.
thres = np.linspace(start=0.2, stop=0.5, num=30)

In [None]:
# Leave-one-group-out validation: for every training group, fit the four
# models on all other groups and record the F1 score on the held-out group
# for each candidate threshold in `thres`.
#
# NOTE(review): `groups_train` is not defined in any visible cell.  It has to
# hold the group id of every row of X_train / y_train, in the same row order.
n_groups = 129  # presumably the training group ids are 1..129 — verify against train_groups.csv
tres_scores = np.empty((n_groups, len(thres)))  # XGBoost
tres_lgb = np.empty_like(tres_scores)           # LightGBM
tres_log = np.empty_like(tres_scores)           # logistic regression
tres_trees = np.empty_like(tres_scores)         # random forest
for i in range(1, n_groups + 1):
    ind_test = np.where(groups_train == i)
    ind_train = np.where(groups_train != i)
    X = X_train[ind_train]
    X_tt = X_train[ind_test]
    y = y_train[ind_train]
    y_tt = y_train[ind_test]
    clf = xgb.XGBClassifier(objective='binary:logistic').fit(X, y)
    clf_lgb = lgb.train(params, lgb.Dataset(X, y))
    clf_log = LogisticRegression(solver='lbfgs', max_iter=2000, C=10.0).fit(X, y)
    clf_trees = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=0, n_jobs=-1).fit(X, y)
    # Every model must contribute positive-class *probabilities*: thresholding
    # the hard labels returned by predict() gives the same result for every
    # threshold and makes the sweep meaningless.
    y_pred = clf.predict_proba(X_tt)[:, 1]
    my_pred_lgb = clf_lgb.predict(X_tt)
    my_pred_log = clf_log.predict_proba(X_tt)[:, 1]
    my_pred_trees = clf_trees.predict_proba(X_tt)[:, 1]
    for t, threshold in enumerate(thres):
        tres_scores[i - 1, t] = f1_score(y_tt, y_pred > threshold)
        tres_lgb[i - 1, t] = f1_score(y_tt, my_pred_lgb > threshold)
        tres_log[i - 1, t] = f1_score(y_tt, my_pred_log > threshold)
        tres_trees[i - 1, t] = f1_score(y_tt, my_pred_trees > threshold)
# Mean F1 per threshold over all held-out groups.
m = tres_scores.mean(axis=0)
m_lgb = tres_lgb.mean(axis=0)
m_log = tres_log.mean(axis=0)
m_trees = tres_trees.mean(axis=0)

In [None]:
# Fit each model on the full training set and turn its output on the test set
# into a boolean decision.
proba_cutoff = 0.4
pred1 = lgb.train(params, lgb.Dataset(X_train, y_train)).predict(X_test) > proba_cutoff
# predict() returns hard labels, for which "> 0.4" is a no-op; the cutoff has
# to be applied to the positive-class probability instead.
pred2 = (LogisticRegression(solver='lbfgs', max_iter=2000, C=20.0)
         .fit(X_train, y_train).predict_proba(X_test)[:, 1] > proba_cutoff)
# NOTE(review): base_score is the model's initial prediction, not a decision
# threshold; predict() still uses the library's default cutoff.  Left as is —
# confirm whether a 0.4 cutoff on predict_proba was intended here as well.
pred3 = xgb.XGBClassifier(objective='binary:logistic', base_score=0.4).fit(X_train, y_train).predict(X_test)
# random_state makes the forest, and therefore the submission, reproducible.
pred4 = (RandomForestClassifier(random_state=0)
         .fit(X_train, y_train).predict_proba(X_test)[:, 1] > proba_cutoff)

In [None]:
# Number of positive decisions made by each of the four models.
tuple(model_pred.sum() for model_pred in (pred1, pred2, pred3, pred4))

In [None]:
# Cast every model's decisions to 0/1 integers so they can be added up as votes.
pred1, pred2, pred3, pred4 = (
    np.array(model_pred, dtype=int) for model_pred in (pred1, pred2, pred3, pred4)
)

In [None]:
# Share of the four models that voted "positive" for each test row.
vote = sum([pred1, pred2, pred3, pred4]) / 4

In [None]:
# Turn the vote share into a boolean decision (share strictly above 0.4).
vote = np.greater(vote, 0.4)

In [None]:
# The thresholded ensemble vote becomes the final prediction vector.
y_pred = vote

In [None]:
# Number of test rows predicted as positive.
np.sum(y_pred)

In [None]:
# Write the final decisions as a two-column CSV (pair_id, target).
# The builtin `int` replaces the `np.int` alias, which newer NumPy releases no longer provide.
y_pred = np.array(y_pred, dtype=int)
first_pair_id = 11691  # presumably the first pair_id of the test split — verify against test_groups.csv
# newline='' is what the csv module expects; without it Windows output gets
# an extra blank line after every row.
with open('sample_submission.csv', 'w', newline='') as f:
    fieldnames = ['pair_id', 'target']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    # Pair ids are consecutive, in the same order as the rows of y_pred.
    for pair_id, elem in enumerate(y_pred, start=first_pair_id):
        writer.writerow({'pair_id': str(pair_id), 'target': str(elem)})
# The `with` block already closed the file; no explicit close() is needed.