In [1]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from bs4 import BeautifulSoup as bs
from bs4 import NavigableString
import numpy as np
from collections import defaultdict

# Lower-cased tag names that leavs_handler removes from the tree instead of
# descending into them.
ignore = {
    'head', 'iframe', 'script', 'meta', 'link',
    'style', 'input', 'checkbox', 'button', 'noscript',
}

In [3]:
# Base directory under which the raw HTML pages are read.
# The relative form presumably relies on the working directory being /content
# (where Drive is mounted) — TODO confirm.
# NOTE(review): this name shadows the built-in dir() for the rest of the notebook.
dir = "./drive/MyDrive/boilerplate_rm"

def get_tag_repr(tags, tags_dict):
    """Turn a tag -> count mapping into a fixed-length count vector.

    Args:
        tags: Mapping from tag name to the number of times it occurs.
        tags_dict: Ordered sequence of tag names (the vocabulary). Position
            ``k`` of the result holds the count for ``tags_dict[k]``.

    Returns:
        A float numpy array of length ``len(tags_dict)``. Tags that do not
        appear in the vocabulary are ignored.
    """
    # Build the name -> position lookup once instead of scanning the
    # vocabulary (and allocating a temporary array) for every tag.
    # setdefault keeps the first position if a name is listed twice, which is
    # what a list.index() lookup would return.
    position = {}
    for idx, name in enumerate(tags_dict):
        position.setdefault(name, idx)

    result = np.zeros(len(tags_dict))
    for name, count in tags.items():
        idx = position.get(name)
        if idx is not None:
            result[idx] += count

    return result

def leavs_handler(node, tags=None, is_content=False):
    """Collect the non-blank text leaves below ``node``.

    Walks the subtree depth-first. Children whose lower-cased tag name is in
    ``ignore`` are removed from the tree and not descended into.

    Args:
        node: The BeautifulSoup tag to walk.
        tags: Names of the ancestors of ``node``, outermost first. Defaults to
            no ancestors.
        is_content: Label inherited from the ancestors. It is replaced by this
            node's ``__label`` attribute when the node has one.

    Returns:
        A list of ``(text_node, tag_path, is_content)`` tuples in document
        order, where ``tag_path`` is the list of tag names from the starting
        node down to the text node's parent.
    """
    new_tags = list(tags or []) + [node.name]
    if node.has_attr('__label'):
        # NOTE(review): the raw attribute value is propagated as-is. If labels
        # are stored as strings such as "0"/"1", a truthiness check on this
        # value treats "0" as content — confirm the label encoding.
        is_content = node['__label']

    result = []
    # Iterate over a snapshot of the children: extract() removes the child
    # from the node's live child list, and iterating that list directly would
    # silently skip the sibling that follows every removed tag.
    for c in list(node.children):
        if isinstance(c, NavigableString):
            # might be just whitespace
            if c.string is not None and c.string.strip():
                result.append((c, new_tags, is_content))
        elif c.name is not None:
            if c.name.lower() in ignore:
                c.extract()
            else:
                result.extend(leavs_handler(c, new_tags, is_content))

    return result

# Parse each available page into rows of [tag counts, text, label] and keep a
# corpus-wide tally of how often every tag appears on a path to a text leaf.
data = []
tags_list = defaultdict(int)
for i in range(800):
    try:
        with open(dir + f"/data/raw_html/{i}.html", "r", encoding='cp850') as file:
            soup = bs(file.read(), features='html5lib')

        page = []
        for node, tags, is_content in leavs_handler(soup.find_all('html')[0]):
            tags_dict = defaultdict(int)
            for tag in tags:
                tags_dict[tag] += 1
                tags_list[tag] += 1

            # The label is stored as 1/0 according to the truthiness of is_content.
            page.append([dict(tags_dict), node.string, 1 if is_content else 0])

        data.append(page)

    except FileNotFoundError:
        # Page numbers with no file on disk are skipped.
        continue

# The 50 most frequent tags form the vocabulary of the tag-count vector.
ttt = sorted(tags_list, key=lambda name: tags_list[name], reverse=True)[:50]

# Replace each row's tag-count dict by its fixed-length vector.
for page_rows in data:
    for row in page_rows:
        row[0] = get_tag_repr(row[0], ttt)


In [None]:
# Installs the package that provides the DistilBertModel / DistilBertTokenizer
# classes imported further down.
# NOTE(review): the version is not pinned, so a fresh runtime may resolve a
# different release; `%pip` would target the kernel's environment explicitly.
!pip install pytorch-transformers

In [6]:
import torch
from pytorch_transformers import DistilBertModel, DistilBertTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Load the pretrained DistilBERT tokenizer and encoder.
pretrained_weights = 'distilbert-base-uncased'
tokinaizer = DistilBertTokenizer.from_pretrained(pretrained_weights)
model = DistilBertModel.from_pretrained(pretrained_weights)
# Move the model to the device chosen earlier rather than calling .cuda(),
# which raises on a CPU-only runtime and bypasses the CPU fallback.
model.to(device)

In [None]:
# Tokenize the text of every row (page by page, in order) and collect the
# matching labels.
ids = [
    tokinaizer.encode(row[1], add_special_tokens=True)
    for page in data
    for row in page
]
labels = [row[2] for page in data for row in page]

# Bring every sequence to exactly 128 ids: longer ones lose their tail,
# shorter ones are right-padded (pad_sequences pads with 0 by default).
ids = pad_sequences(
    ids,
    maxlen=128,
    dtype="long",
    padding="post",
    truncating="post",
)

ids = torch.tensor(np.array(ids))
labels = torch.tensor(np.array(labels))

In [9]:
# Batches are drawn sequentially so that the extracted features can later be
# matched back to the rows of `data` by position.
data_for_bert = TensorDataset(ids, labels)
dataloader = DataLoader(
    data_for_bert,
    batch_size=32,
    sampler=SequentialSampler(data_for_bert),
)

In [10]:
%%time
# Embed every row: one feature vector per row, taken from the hidden state of
# the first token of each sequence, collected in dataloader order.
result = []
model = model.to(device)
model.eval()  # inference only: make sure dropout is disabled
for batch in dataloader:
    # Only the token ids are needed here; the labels stay on the CPU.
    b_input_ids = batch[0].to(device)

    # The sequences were right-padded with id 0. Without a mask the model
    # attends to that padding, so the first-token vector would depend on how
    # much padding a row received.
    b_attention_mask = (b_input_ids != 0).long()

    with torch.no_grad():
        last_hidden_states = model(b_input_ids, attention_mask=b_attention_mask)

    # Element 0 of the output is the last hidden state; slice out position 0.
    features = last_hidden_states[0][:, 0, :].cpu().numpy()
    result.extend(features)



CPU times: user 10min 49s, sys: 426 ms, total: 10min 49s
Wall time: 10min 49s


In [11]:
# Swap each row's text for its extracted feature vector and append the row's
# relative position within its page, (index + 1) / page length.
# NOTE: every run appends one more element to each row, so this cell is not
# idempotent.
i = 0
for page in data:
    page_len = len(page)
    for rank, row in enumerate(page, start=1):
        row[1] = result[i]
        row.append(rank / page_len)
        i += 1

In [12]:
# Add a sinusoidal signal derived from the row's relative position (row[3]) to
# its feature vector, in place. 384 (sin, cos) pairs give 768 values, which
# presumably matches the feature width — TODO confirm.
divisors = [100 ** (2 * k / 768) for k in range(1, 385)]

for page in data:
    for row in page:
        pos_v = []
        for divisor in divisors:
            angle = row[3] / divisor
            pos_v.extend([np.sin(angle), np.cos(angle)])
        row[1] += np.array(pos_v)

In [13]:
from sklearn.model_selection import train_test_split
import numpy as np

# One sample per row: the tag-count vector followed by the feature vector.
dataset = [
    np.hstack((np.array(row[0]), row[1]))
    for page in data
    for row in page
]
dataset_labels = [row[2] for page in data for row in page]

# Hold out 33% of the samples for evaluation with a fixed seed.
train_features, test_features, train_labels, test_labels = train_test_split(
    dataset,
    dataset_labels,
    test_size=0.33,
    random_state=111,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# The 'sag' solver shuffles the samples with a random generator, so the fit is
# seeded (same seed as the train/test split) to make the reported scores
# reproducible across runs.
lr_clf = LogisticRegression(solver='sag', max_iter=1000, random_state=111)
lr_clf.fit(train_features, train_labels)


In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score


def report(labels, preds):
    """Score binary predictions against the reference labels.

    Args:
        labels: Ground-truth labels.
        preds: Predicted labels, aligned with ``labels``.

    Returns:
        A dict with the keys ``"accuracy"``, ``"recall"``, ``"f1"`` and
        ``"precision"``; the last three use binary averaging.
    """
    scores = {"accuracy": accuracy_score(labels, preds)}

    binary_metrics = (
        ("recall", recall_score),
        ("f1", f1_score),
        ("precision", precision_score),
    )
    for name, metric in binary_metrics:
        scores[name] = metric(labels, preds, average="binary")

    return scores


# Score the fitted classifier on the held-out samples.
lr_preds = lr_clf.predict(test_features)
lr_result = report(labels=test_labels, preds=lr_preds)

print('lr: ', lr_result)