## Импорт библиотек

In [1]:
import pandas as pd
import numpy as np
import gzip
import os
import sys
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from multiprocessing.pool import Pool
import re
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import csr_matrix
from sklearn.preprocessing import Normalizer

## Загрузка файлов

In [3]:
def worker(file):
    try:
        with gzip.open('simwebhtml/%s'%file, 'rb') as f:
            page_source = f.read().decode('utf-8')
        soup = BeautifulSoup(page_source, 'html.parser')
        info = soup.find(id='demographics').text
        domain = file[:-3]
        good_file = 'wa-demographics__age-data-label' in page_source
        return (domain, soup.find_all('meta')[3].get('content', ''), info, good_file)
    except Exception as e:
        e = sys.exc_info()
        if isinstance(e[0], KeyboardInterrupt):
            return None
        return None


info = []
with Pool(8) as pool:
    files = os.listdir('simwebhtml')
    pbar = tqdm(total = len(files))
    for result in pool.imap(worker, files):
        if result is None: continue
        info.append(
            result
        )
        pbar.update(1)

  0%|          | 0/18845 [00:00<?, ?it/s]

## Получение категории и рейтинга сайта

In [4]:
regex = re.compile(
    r'^([\w\d-]*\.[\w\d-]*) is ranked #(\d+) in the ([\w\s\>-]+) category and #(\d+) Globally.*$'
)

meta_info = []
for x in tqdm(info):
    res = regex.findall(x[1].replace('&', 'and'))[0]
    assert len(res) == 4
    (domain, top_cat, cat, top_all) = res
    meta_info.append((domain, int(top_cat), cat.strip(), int(top_all)))

meta_info

  0%|          | 0/17923 [00:00<?, ?it/s]

[('turbopages.org', 1, 'News and Media Publishers', 34),
 ('apest.ru', 8122, 'Health', 2792966),
 ('med-tutorial.ru', 1925, 'Health > Health - Other', 737413),
 ('mister-office.ru',
  818,
  'Computers Electronics and Technology > Computers Electronics and Technology - Other',
  366845),
 ('animedub.ru', 58, 'Arts and Entertainment > Animation and Comics', 65687),
 ('vibir.ru',
  1047,
  'Computers Electronics and Technology > Telecommunications',
  630900),
 ('gp91.ru', 5023, 'Health > Health - Other', 2423137),
 ('4eva.ru', 3551, 'News and Media Publishers', 1487585),
 ('obzor.io', 9231, 'News and Media Publishers', 6174242),
 ('nntv.tv', 4528, 'News and Media Publishers', 2120224),
 ('raspisanie-autobusov.ru', 95, 'Reference Materials > Maps', 721972),
 ('anyamashka.ru',
  0,
  'Arts and Entertainment > Arts and Entertainment - Other',
  0),
 ('policeiskiisrublevki.ru',
  967,
  'Arts and Entertainment > Streaming and Online TV',
  484405),
 ('egida.by', 26, 'Pets and Animals > Pets

## Пример информации о сайте

In [4]:
info[2]

('med-tutorial.ru',
 'med-tutorial.ru is ranked #1925 in the Health > Health - Other category and #737413 Globally according to February 2023 data. Get the full med-tutorial.ru Analytics and market share drilldown here',
 "med-tutorial.ru Audience DemographicsAudience composition can reveal a site's current market share across various audiences. med-tutorial.ru's audience is 35.82% male and 64.18% female. The largest age group of visitors are 25 - 34 year olds (Desktop).  Age DistributionGender DistributionFemale64.18%Male35.82%",
 False)

## Выделение информации о поле

In [5]:
ismale = []
err_cnts = 0
for x in tqdm(info):
    try:
        ismale.append(float(x[2].split('audience is ')[-1].split('%')[0]))
    except Exception as e:
        err_cnts += 1
        ismale.append(50.)
err_cnts

  0%|          | 0/17923 [00:00<?, ?it/s]

2163

## Выделение информации о возрасте, мода

In [6]:
age_group = []
err_cnts = 0
for x in tqdm(info):
    try:
        age = str(x[2].split(
            'The largest age group of visitors are '
        )[-1].split(' year olds')[0]).replace(' ', '')
        assert len(age) < 10
        age_group.append(age)
    except Exception as e:
        err_cnts += 1
        age_group.append('25-34')
err_cnts

  0%|          | 0/17923 [00:00<?, ?it/s]

2163

## Выделение информации о возрасте, гистограма

In [7]:
age_probs = []
err_cnts = 0
for x in tqdm(info):
    try:
        probs =  list(map(float, 
            x[2].split('Highcharts 10.3.2')[-1].split('18 - 24')[0].split('%')[:-1]
                         ))
        assert len(probs) == 6
        age_probs.append(
           probs
        )
    except Exception as e:
        err_cnts += 1
        age_probs.append([0]*6)
age_probs = np.array(age_probs)
age_probs[age_probs[:, 0]==0] = age_probs[np.unique(age_probs.nonzero()[0])].mean(axis=0)
err_cnts

  0%|          | 0/17923 [00:00<?, ?it/s]

16651

## Векторизация категорий

In [8]:
topics = [x[2] for x in meta_info]

topics_cbag = CountVectorizer(min_df=10).fit_transform(pd.Series(topics))
topics_cbag

<17923x187 sparse matrix of type '<class 'numpy.int64'>'
	with 72707 stored elements in Compressed Sparse Row format>

## Векторизация возраста и пола

In [9]:
with gzip.open('files/url_host_cbag_v2.pickle.gz', 'rb') as f:
    datamap = pickle.load(f)
datamap

{'data': <415317x199684 sparse matrix of type '<class 'numpy.uint32'>'
 	with 32277669 stored elements in Compressed Sparse Row format>,
 'uids': array([     4,     16,     18, ..., 415276, 415288, 415293])}

In [13]:
map_df = pd.read_csv('auxilary/url_host_mapper_v2.tsv.gz', sep='\t')

reindex = \
pd.DataFrame(
    dict(url_host=[x[0] for x in info])
        ).merge(map_df).drop_duplicates('url_host').url_host_idx.values.copy()

In [None]:
all_data = datamap['data'][:, reindex]
all_data_sqrt = csr_matrix((all_data.data**0.25, all_data.nonzero()),
                     shape=all_data.shape,
                     dtype=np.float32)

sex_score = TfidfTransformer(sublinear_tf=True).fit_transform(
        all_data_sqrt
).dot(np.array(ismale)-50.)
sex_score = sex_score/sex_score.std()/3.

In [16]:
topics_score = TfidfTransformer(sublinear_tf=True).fit_transform(
        all_data_sqrt
).dot(topics_cbag.toarray() - topics_cbag.toarray().mean(axis=0))
topics_score = Normalizer().fit_transform(topics_score)
topics_score = topics_score / topics_score.std(axis=0)
topics_score = np.clip(topics_score, -5, 5)
topics_score

array([[-1.11097803,  3.28295232, -0.41728786, ...,  0.87511175,
        -0.43044166, -0.50503614],
       [-0.86775558, -0.18287171, -0.32593252, ..., -0.56362885,
        -0.33620661, -0.39447038],
       [-0.28819117, -0.21084747, -0.37579375, ...,  0.1942539 ,
        -0.38763957, -0.45481655],
       ...,
       [-0.36784412, -0.07751985, -0.13816375, ..., -0.23892391,
        -0.14251896, -0.16721714],
       [-0.44771531, -0.09435199, -0.16816369, ..., -0.29080224,
        -0.17346457, -0.20352555],
       [-0.37209047, -0.07841473, -0.13975869, ..., -0.24168203,
        -0.14416418, -0.16914748]])

In [None]:
top_rank = \
    np.array([x[1] for x in meta_info]).argsort().argsort()/len(meta_info) - 0.5
ranking_score = TfidfTransformer(sublinear_tf=True).fit_transform(
        all_data_sqrt
).dot(np.array(top_rank))
ranking_score = ranking_score/ranking_score.std()/3.

In [None]:
all_rank = \
    np.array([x[3] for x in meta_info]).argsort().argsort()/len(meta_info) - 0.5
ranking_score2 = TfidfTransformer(sublinear_tf=True).fit_transform(
        all_data_sqrt
).dot(np.array(all_rank))
ranking_score2 = ranking_score2/ranking_score2.std()/3.

In [None]:
age_score = \
TfidfTransformer(sublinear_tf=True).fit_transform(
        all_data_sqrt
).dot(pd.get_dummies(age_group).values[:, :-1])
age_score = Normalizer().fit_transform(age_score)

In [20]:
age_score2 = \
TfidfTransformer(sublinear_tf=True).fit_transform(
        all_data_sqrt
).dot(age_probs - age_probs.mean(axis=0))
age_score2 = Normalizer().fit_transform(age_score2)
age_score2

array([[ 0.75102459,  0.29370035, -0.28601228, -0.47237416, -0.18653866,
        -0.09982549],
       [ 0.54642513,  0.46524543,  0.11195845, -0.32237975, -0.5554918 ,
        -0.24480959],
       [ 0.65947033,  0.45207254, -0.19339324, -0.40583681, -0.37330662,
        -0.13880564],
       ...,
       [ 0.51839175,  0.00990952, -0.70853066, -0.37718109,  0.29465338,
         0.00836116],
       [ 0.85686198,  0.09953513, -0.29246893, -0.30998074, -0.253182  ,
        -0.10076234],
       [ 0.51839175,  0.00990952, -0.70853066, -0.37718109,  0.29465338,
         0.00836116]])

## Конкатенация признаков SimWeb

In [21]:
simweb_scores = \
np.concatenate([sex_score[:, None],
                topics_score,
                ranking_score[:, None],
                ranking_score2[:, None],
                age_score,
                age_score2
                ], axis=1)

simweb_scores

array([[ 0.43295181, -1.11097803,  3.28295232, ..., -0.47237416,
        -0.18653866, -0.09982549],
       [ 0.83631993, -0.86775558, -0.18287171, ..., -0.32237975,
        -0.5554918 , -0.24480959],
       [ 1.05223869, -0.28819117, -0.21084747, ..., -0.40583681,
        -0.37330662, -0.13880564],
       ...,
       [ 0.13573071, -0.36784412, -0.07751985, ..., -0.37718109,
         0.29465338,  0.00836116],
       [ 0.26681284, -0.44771531, -0.09435199, ..., -0.30998074,
        -0.253182  , -0.10076234],
       [-0.01174745, -0.37209047, -0.07841473, ..., -0.37718109,
         0.29465338,  0.00836116]])

## Сохранение признаков

In [26]:
with gzip.open('auxilary/simweb_domain.pickle.gz', 'wb') as f:
    pickle.dump(simweb_scores, f, protocol=-1)