# Анализ веб-документов

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from copy import copy
from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from collections import defaultdict
from scipy.spatial import distance
from sklearn.metrics import pairwise_distances
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import DBSCAN
from bs4 import BeautifulSoup as bs
import nltk
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
from nltk.stem.snowball import SnowballStemmer
import pandas as pd
import numpy as np
import re
import multiprocessing
import codecs

### Обработка текста
Выполняется удаление пунктуации и неподходящих символов, лемматизация, приведение к нижнему регистру и удаление стоп-слов.
Код использовался для составления таблиц.

In [2]:
# Shared Mystem instance; preprocess_text calls its lemmatize() method.
mystem = Mystem() 
# NLTK stop-word list for Russian; preprocess_text drops tokens found in it.
russian_stopwords = stopwords.words("russian")
# Every token rejected by is_word is collected here.
stop_tokens = []

def is_word(token):
    """Tell whether ``token`` holds at least one digit or lowercase Latin/Cyrillic letter.

    Returns True as soon as one allowed character is found in ``token``.
    Otherwise the token is appended to the module-level ``stop_tokens`` list
    and False is returned.
    """
    allowed = '0123456789abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя'
    if any(symbol in token for symbol in allowed):
        return True
    stop_tokens.append(token)
    return False

def preprocess_text(text):
    """Lemmatize ``text`` and return the surviving lemmas joined by single spaces.

    The input is converted to ``str`` and lowercased before lemmatization.
    A lemma is dropped when it is a Russian stop word, is a single space,
    is (after stripping whitespace) found in ``string.punctuation``, or is
    rejected by ``is_word``. The checks run in that order, so ``is_word``
    (which records rejected tokens) only sees lemmas that passed the others.
    """
    lemmas = mystem.lemmatize(str(text).lower())

    kept = []
    for lemma in lemmas:
        if lemma in russian_stopwords:
            continue
        if lemma == " ":
            continue
        if lemma.strip() in punctuation:
            continue
        if not is_word(lemma):
            continue
        kept.append(lemma)

    return " ".join(kept)

# Map doc_id (int) -> raw title text, read from the tab-separated titles file.
doc_to_title = {}
with codecs.open('docs_titles.tsv', 'r', 'utf-8') as titles_file:
    next(titles_file, None)  # the first line is skipped (header row)
    for raw_line in titles_file:
        # Everything after the first tab is the title; a line without a tab
        # (after stripping) yields an empty title.
        id_field, tab, title = raw_line.strip().partition('\t')
        doc_to_title[int(id_field)] = title

def add_title_to_df_and_save_as_csv(file_name):
    """Attach a preprocessed ``title`` column to a CSV table and save the result.

    Parameters
    ----------
    file_name : str
        Path of a CSV file that contains a ``doc_id`` column.

    Returns
    -------
    pandas.DataFrame
        The loaded table with a ``title`` column holding
        ``preprocess_text(doc_to_title[doc_id])`` for every row.

    Raises
    ------
    KeyError
        If a ``doc_id`` is missing from ``doc_to_title``.

    The table is also written to ``f'{file_name}_train_preprocessed.csv'``
    (the suffix is appended to the full input name, extension included).
    """
    df = pd.read_csv(file_name)
    # Build the whole column at once. The previous per-row chained assignment
    # (df['title'][i] = ...) writes through a temporary object, which pandas
    # warns about and which does nothing under copy-on-write.
    df['title'] = [preprocess_text(doc_to_title[doc_id]) for doc_id in df['doc_id']]
    df.to_csv(f'{file_name}_train_preprocessed.csv', index=False)
    return df

In [3]:
# Load the document titles (tab-separated) and the train/test pair tables.
# NOTE(review): despite its name, lem_titles is read straight from docs_titles.tsv;
# the preview below shows the original, non-lemmatized titles.
lem_titles = pd.read_csv('docs_titles.tsv', sep='\t', encoding='utf-8', lineterminator='\n')
train_groups = pd.read_csv("train_groups.csv")
test_groups = pd.read_csv("test_groups.csv")

In [4]:
# Display the titles table (columns doc_id and title).
lem_titles

Unnamed: 0,doc_id,title
0,15731,ВАЗ 21213 | Замена подшипников ступицы | Нива
1,14829,"Ваз 2107 оптом в Сочи. Сравнить цены, купить п..."
2,15764,Купить ступица Лада калина2. Трансмиссия - пер...
3,17669,Классика 21010 - 21074
4,14852,Ступица Нива — замена подшипника своими руками
...,...,...
27945,16637,Ответы@Mail.Ru: полезно ли кушать творог по ут...
27946,16759,Творог. Полезные свойства и лечение творогом. ...
27947,15358,Творог - Полезные и опасные свойства творога
27948,17287,Ответы@Mail.Ru: Чем полезен творог?


In [5]:
# Display the training pairs table (pair_id, group_id, doc_id, target).
train_groups

Unnamed: 0,pair_id,group_id,doc_id,target
0,1,1,15731,0
1,2,1,14829,0
2,3,1,15764,0
3,4,1,17669,0
4,5,1,14852,0
...,...,...,...,...
11685,11686,129,26672,0
11686,11687,129,25838,0
11687,11688,129,25703,0
11688,11689,129,27885,0


In [6]:
# Load the previously saved train/test tables; per the preview they carry
# title, url, img_count and link_count columns next to the pair ids.
pre_train_groups = pd.read_csv("new_new_pre_train_groups.tsv", sep='\t', encoding = 'utf-8')
pre_test_groups = pd.read_csv("new_new_pre_test_groups.tsv", sep='\t', encoding = 'utf-8')
pre_train_groups.head()

Unnamed: 0,pair_id,group_id,doc_id,target,title,url,img_count,link_count
0,1,1,15731,0,ваз 21213 замена подшипник ступица нива,automn vaz 21213 vaz 34656 10 4460,31.0,30.0
1,2,1,14829,0,ваз 2107 опт сочи сравнивать цена купить потре...,sochi vaz 2107 wholesale,55.0,261.0
2,3,1,15764,0,купить ступица лад калина2 трансмиссия переход...,baza sell spare parts,43.0,136.0
3,4,1,17669,0,классика 21010 21074,carobka 21010,4.0,15.0
4,5,1,14852,0,ступица нива замена подшипник свой рука,cartore 2730 stupica niva,10.0,65.0


### Обработка пустых полей
Как было замечено, пустые поля появляются только в заголовках документов, поэтому только там и будем их исправлять.

In [7]:
# Number of missing titles in the train and test tables, in that order.
print(pre_train_groups["title"].isna().sum(),
    pre_test_groups["title"].isna().sum())

18 23


In [8]:
# Replace missing values with empty strings in the text columns only.
# A frame-wide fillna("") would also turn any missing img_count/link_count
# into a string and break the arithmetic done on those columns later.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run.
pre_train_groups = pre_train_groups.fillna({"title": "", "url": ""})
pre_test_groups = pre_test_groups.fillna({"title": "", "url": ""})

# Sanity check: no missing titles should remain in either table.
print(pre_train_groups["title"].isna().sum(), 
      pre_test_groups["title"].isna().sum())

0 0


In [9]:
# Preview the training table after the missing-value fill.
pre_train_groups.head()

Unnamed: 0,pair_id,group_id,doc_id,target,title,url,img_count,link_count
0,1,1,15731,0,ваз 21213 замена подшипник ступица нива,automn vaz 21213 vaz 34656 10 4460,31.0,30.0
1,2,1,14829,0,ваз 2107 опт сочи сравнивать цена купить потре...,sochi vaz 2107 wholesale,55.0,261.0
2,3,1,15764,0,купить ступица лад калина2 трансмиссия переход...,baza sell spare parts,43.0,136.0
3,4,1,17669,0,классика 21010 21074,carobka 21010,4.0,15.0
4,5,1,14852,0,ступица нива замена подшипник свой рука,cartore 2730 stupica niva,10.0,65.0


In [10]:
# Preview the test table after the missing-value fill (it has no target column).
pre_test_groups.head()

Unnamed: 0,pair_id,group_id,doc_id,title,url,img_count,link_count
0,11691,130,6710,прописывать админка кс 1.6 друг youtube,youtube watch,23.0,11.0
1,11692,130,4030,скачать sgl rp доработка слива мода mysql rp r...,v sampe load 1 role play sgl rp dorabotka mys...,150.0,42.0
2,11693,130,5561,прописывать админка кс 1.6 counter strike ката...,dream x publ kak propisat adminku v ks 1 6 1 ...,82.0,79.0
3,11694,130,4055,прописывать простой админка кс 1 6,kak propisat prostuyu adminku v ks 1 6,14.0,39.0
4,11695,130,4247,подбор админ сервер код 4 архив форум ozone,o3one forum archive index t 47527,0.0,6.0


### TFIDF 
Для URL и для заголовков обучим по отдельному векторайзеру.

In [11]:
def _fit_unigram_tfidf(train_texts, test_texts):
    """Fit a unigram TfidfVectorizer on train followed by test texts.

    Returns the fitted vectorizer and the TF-IDF matrix whose rows follow
    the order ``train_texts + test_texts``.
    """
    vectorizer = TfidfVectorizer(ngram_range = (1, 1))
    matrix = vectorizer.fit_transform(train_texts + test_texts)
    return vectorizer, matrix


# Titles: train rows first, then test rows.
title_data_train = pre_train_groups["title"].tolist()
title_data_test = pre_test_groups["title"].tolist()
tfidf_title, tf_title_data = _fit_unigram_tfidf(title_data_train, title_data_test)

# URLs: same row layout, separate vocabulary.
url_data_train = pre_train_groups["url"].tolist()
url_data_test = pre_test_groups["url"].tolist()
tfidf_url, tf_url_data = _fit_unigram_tfidf(url_data_train, url_data_test)

tf_title_data.shape, tf_url_data.shape

((28317, 27248), (28317, 45507))

In [12]:
# Shapes of the test and train tables, in that order.
pre_test_groups.shape, pre_train_groups.shape

((16627, 7), (11690, 8))

In [13]:
# Group ids and binary targets of the training pairs as NumPy arrays.
group_id_train = train_groups["group_id"].to_numpy()
y_train = train_groups["target"].to_numpy()

# Group ids of the test pairs (the test table has no target column).
group_id_test = test_groups["group_id"].to_numpy()

In [14]:
# group_id -> list of pair_id values belonging to that group (filled in the next cell).
groups_pair_ids = defaultdict(list)

In [15]:
# Collect the pair ids of every group, train groups first and then test groups.
# The mapping is rebuilt from scratch so that re-running this cell does not
# append every pair_id a second time (which would duplicate feature rows later).
groups_pair_ids = defaultdict(list)
for groups_table in (train_groups, test_groups):
    for pair_id, group_id in zip(groups_table["pair_id"], groups_table["group_id"]):
        groups_pair_ids[group_id].append(pair_id)

In [16]:
# Neighbourhood size. Each cosine block keeps ``amount - 1`` distances (the first
# sorted entry, taken to be the document's distance to itself, is skipped);
# each count block keeps ``amount`` values.
amount = 25


def _relative_diffs(values, position):
    """Return |v_j - v_i| / (v_j + v_i) for every j != ``position``.

    ``v_i`` is ``values[position]``. Where the sum is zero the result is 0.
    """
    others = np.delete(values, position)
    totals = others + values[position]
    gaps = np.abs(others - values[position])
    ratios = np.zeros(len(others), dtype=float)
    np.divide(gaps, totals, out=ratios, where=totals != 0)
    return ratios


# pair_id - 1 is used as the row position in the stacked (train, then test)
# TF-IDF matrices, so the count columns are stacked the same way.
img_counts = np.concatenate([pre_train_groups.img_count.to_numpy(),
                             pre_test_groups.img_count.to_numpy()])
link_counts = np.concatenate([pre_train_groups.link_count.to_numpy(),
                              pre_test_groups.link_count.to_numpy()])

# Layout of one feature row:
#   title distances (amount - 1), group mean of those, url distances (amount - 1),
#   group mean of those, image-count differences (amount), link-count differences (amount).
n_features = 2 * (amount - 1) + 2 + 2 * amount

# Rows are written at their own position (pair_id - 1) instead of being appended
# in group-iteration order, so X_new stays aligned with the pair tables even if
# the rows of a group are not contiguous. Rows never written stay NaN.
X_new = np.full((tf_title_data.shape[0], n_features), np.nan)

for group_id, values in groups_pair_ids.items():
    indexes = np.array(values) - 1
    if len(indexes) < amount + 1:
        # Smaller groups cannot fill a fixed-width row (the original code
        # produced ragged rows in this case).
        raise ValueError(
            "group {} has {} documents, at least {} are required".format(
                group_id, len(indexes), amount + 1))

    d = pairwise_distances(tf_title_data[indexes], metric="cosine")
    d_url = pairwise_distances(tf_url_data[indexes], metric="cosine")

    # Per document: sorted distances to the other documents of the group,
    # skipping the first (smallest) entry.
    dn = np.sort(d, axis=1)[:, 1:amount]
    dn_url = np.sort(d_url, axis=1)[:, 1:amount]

    # Group-level statistics shared by every document of the group.
    mean = np.mean(dn)
    mean_url = np.mean(dn_url)

    doc_img_part = img_counts[indexes]
    doc_link_part = link_counts[indexes]

    for v_id in range(len(indexes)):
        # The largest ``amount`` relative differences, in descending order.
        sorted_img_count = np.sort(_relative_diffs(doc_img_part, v_id))[::-1][:amount]
        sorted_link_count = np.sort(_relative_diffs(doc_link_part, v_id))[::-1][:amount]

        X_new[indexes[v_id]] = np.concatenate([
            dn[v_id], [mean],
            dn_url[v_id], [mean_url],
            sorted_img_count, sorted_link_count,
        ])

In [17]:
# Shape of the combined (train + test) feature matrix.
X_new.shape

(28317, 100)

In [18]:
# The first len(title_data_train) rows are the training rows, the rest are test rows.
X_train, X_test = X_new[:len(title_data_train)], X_new[len(title_data_train):]

In [19]:
# Shapes of the test and train feature matrices, in that order.
X_test.shape, X_train.shape

((16627, 100), (11690, 100))

### Подберем параметры GB

In [20]:
# Grid-search the learning rate alone; every other GradientBoostingClassifier
# setting stays at its default. Unshuffled 5-fold split.
parameters = dict(learning_rate=[0.01, 0.05, 0.1, 0.5, 1])
cv_test = KFold(n_splits = 5)
clf = GridSearchCV(estimator=GradientBoostingClassifier(), param_grid=parameters, cv=cv_test)
clf.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.01, 0.05, 0.1, 0.5, 1]})

In [21]:
# Learning rate selected by the grid search above.
best_rate = clf.best_params_['learning_rate']
best_rate

0.1

In [22]:
# Grid-search the tree depth alone; every other GradientBoostingClassifier
# setting stays at its default. Unshuffled 5-fold split.
parameters = dict(max_depth=[2, 3, 5, 10])
cv_test = KFold(n_splits = 5)
clf = GridSearchCV(estimator=GradientBoostingClassifier(), param_grid=parameters, cv=cv_test)
clf.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=GradientBoostingClassifier(),
             param_grid={'max_depth': [2, 3, 5, 10]})

In [23]:
# Tree depth selected by the grid search above.
best_max_depth = clf.best_params_['max_depth']
best_max_depth

3

In [24]:
# Grid-search min_samples_split alone; every other GradientBoostingClassifier
# setting stays at its default. Unshuffled 5-fold split.
parameters = dict(min_samples_split=[2, 5, 10, 20])
cv_test = KFold(n_splits = 5)
clf = GridSearchCV(estimator=GradientBoostingClassifier(), param_grid=parameters, cv=cv_test)
clf.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=GradientBoostingClassifier(),
             param_grid={'min_samples_split': [2, 5, 10, 20]})

In [25]:
# min_samples_split value selected by the grid search above.
best_min_samples = clf.best_params_['min_samples_split']
best_min_samples

2

In [26]:
# The three independently selected hyper-parameters side by side.
best_min_samples,best_max_depth, best_rate

(2, 3, 0.1)

### Обучаем GB

In [27]:
# Classifier built from the three selected hyper-parameters.
# An alternative setting noted earlier by the author:
# (min_samples_split = 10, max_depth = 3, learning_rate = 0.05)
clf = GradientBoostingClassifier(learning_rate = best_rate,
                                 max_depth = best_max_depth,
                                 min_samples_split = best_min_samples)

# F1 score on each held-out fold of an unshuffled 5-fold split; the same
# classifier object is refitted on every fold.
tmp = [
    f1_score(y_train[test_index],
             clf.fit(X_train[train_index], y_train[train_index]).predict(X_train[test_index]))
    for train_index, test_index in KFold(5).split(X_train, y_train)
]
tmp

[0.7653429602888087,
 0.7658703071672356,
 0.7512388503468781,
 0.6683333333333333,
 0.7884362680683312]

In [28]:
# Mean of the per-fold F1 scores.
np.mean(tmp)

0.7478443438409175

In [29]:
# Shape of the training feature matrix.
X_train.shape

(11690, 100)

In [30]:
# Refit on the full training set, then label the test rows.
result = clf.fit(X_train, y_train).predict(X_test)

In [31]:
# Pair ids of the test table next to the predicted labels.
df = pd.DataFrame({"pair_id": list(test_groups["pair_id"]), "target": result})

In [32]:
# Write the gradient-boosting predictions without the index column.
df.to_csv("with_img_link_answer.csv", index = False)

### XGBoost


In [28]:
from xgboost import XGBClassifier

In [30]:
# Base XGBoost model; only the three settings in the grid below are searched.
estimator = XGBClassifier(objective='binary:logistic', nthread=4, seed=42)

# 8 depths x 4 tree counts x 3 learning rates = 96 candidates.
parameters = dict(
    max_depth=range(2, 10),
    n_estimators=range(60, 220, 40),
    learning_rate=[0.1, 0.01, 0.05],
)

# Candidates are ranked by ROC AUC on an unshuffled 5-fold split.
cv_test = KFold(n_splits = 5)
grid_search = GridSearchCV(estimator=estimator, param_grid=parameters,
                           scoring='roc_auc', n_jobs=4, cv=cv_test, verbose=True)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed: 10.5min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed: 25.7min
[Parallel(n_jobs=4)]: Done 480 out of 480 | elapsed: 29.4min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, m...
                                     n_estimators=100, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, seed=42,
                      

In [31]:
# The XGBoost model refitted with the best parameter combination.
grid_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=42, subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [32]:
# F1 score of the best XGBoost model on each held-out fold of an unshuffled
# 5-fold split; the estimator is refitted on every fold.
tmp = [
    f1_score(y_train[test_index],
             grid_search.best_estimator_.fit(X_train[train_index], y_train[train_index])
             .predict(X_train[test_index]))
    for train_index, test_index in KFold(5).split(X_train, y_train)
]
tmp

[0.7509349289454003,
 0.759753593429158,
 0.7138755980861244,
 0.6599496221662469,
 0.79155672823219]

In [33]:
# Mean of the per-fold F1 scores of the XGBoost model.
np.mean(tmp)

0.7352140941718239

In [34]:
# Refit the best XGBoost model on the full training set, then label the test rows.
result = grid_search.best_estimator_.fit(X_train, y_train).predict(X_test)

In [35]:
# Pair ids of the test table next to the XGBoost predictions.
df = pd.DataFrame({"pair_id": list(test_groups["pair_id"]), "target": result})

In [36]:
# Write the XGBoost predictions without the index column.
df.to_csv("xgboost_answer.csv", index = False)







# Парсинг сайта, создание файлов с url, image_count, link_count
Запускать не советуется :) Каждая ячейка выполняется около часа.

In [None]:
%%time

# Custom parsing of the raw documents: one row per document with its id,
# cleaned <title> text and a simplified URL.

path = 'content/'
count = 28026


def _simplify_url(url):
    """Keep the URL part before the first '?' or '%' and blank out every '.suffix'."""
    head = re.split(r'\?|%.*', url)[0]
    return re.sub(r"\.\w*", " ", head)


def _clean_title(soup):
    """Return the document title without newlines, tabs or carriage returns.

    Documents that have no <title> element yield an empty string.
    """
    if soup.title is None:
        return ''
    # A character class is required here: in the alternation "\n*|\t*|\r*" the
    # first branch always matches (possibly empty), so tabs and carriage
    # returns were never removed.
    return re.sub(r"[\n\t\r]+", "", soup.title.text)


counters = list(range(1, count + 1))
file_name = [path + '{}.dat'.format(i) for i in counters]

# Rows are collected in a list and turned into a frame once, instead of
# writing the frame row by row.
parsed_rows = []
for i in counters:
    with codecs.open(file_name[i - 1], 'r', 'utf-8') as f:
        # The first line of every file is the document URL; the rest is HTML.
        url = f.readline().strip()
        # bs4's BeautifulSoup is imported under the alias ``bs`` in this notebook.
        soup = bs(f, 'lxml')
    parsed_rows.append((i, _clean_title(soup), _simplify_url(url)))

title_url_data = pd.DataFrame(parsed_rows, columns = ['doc_id', 'title', 'url'])
title_url_data

In [None]:
%%time

# Custom parsing of the raw documents: count <img> tags and absolute links.
path = 'content/'
count = 28026

counters = list(range(1, count + 1))
file_name = [path + '{}.dat'.format(i) for i in counters]

# Rows are collected in a list and turned into a frame once, instead of
# writing the frame row by row.
count_rows = []
for i in counters:
    with codecs.open(file_name[i - 1], 'r', 'utf-8') as f:
        # bs4's BeautifulSoup is imported under the alias ``bs`` in this notebook.
        soup = bs(f, 'lxml')

    image_count = len(soup.find_all("img"))
    # Only <a> tags whose href starts with 'http' are counted as links.
    link_count = sum(
        1 for anchor in soup.find_all('a')
        if (anchor.get('href') or '').startswith('http')
    )
    count_rows.append((i, image_count, link_count))

# Row label k holds the document with doc_id k + 1.
image_link_data = pd.DataFrame(count_rows, columns = ['doc_id', 'image_count', 'link_count'])
image_link_data

In [None]:
# Attach the img_count / link_count values to the tables that hold the urls.

def _with_counts(groups):
    """Return a copy of ``groups`` with float ``img_count`` and ``link_count`` columns.

    The counts are looked up in ``image_link_data`` at row label ``doc_id - 1``;
    a doc_id without a matching row raises ``KeyError``. Existing columns of
    the same name are overwritten, so the cell can be re-run safely.
    """
    row_labels = groups['doc_id'] - 1
    return groups.assign(
        img_count=image_link_data.loc[row_labels, 'image_count'].to_numpy(dtype=float),
        link_count=image_link_data.loc[row_labels, 'link_count'].to_numpy(dtype=float),
    )


pre_train_groups = _with_counts(pre_train_groups)
pre_train_groups.to_csv("new_new_pre_train_groups.tsv", sep="\t", index = False)

pre_test_groups = _with_counts(pre_test_groups)
pre_test_groups.to_csv("new_new_pre_test_groups.tsv", sep="\t", index = False)

pre_train_groups.head()