In [10]:
from bs4 import BeautifulSoup
import requests
import codecs
from lxml import etree, html as lhtml
import numpy as np
import pandas as pd
from multiprocessing.dummy import Pool as ThreadPool
import os
import string
import pymorphy2

In [2]:
%%time
def get_page_text(filename):
    with codecs.open(path + filename, 'r', 'utf-8') as f:
        url = f.readline().strip()
        soup = BeautifulSoup(f, 'lxml')
        title = soup.title.text
        text = soup.get_text()
        return text

        
def get_title(filename):
    with codecs.open(path + filename, 'r', 'utf-8') as f:
        url = f.readline().strip()
        soup = BeautifulSoup(f, 'lxml')
        title = soup.title.text
        return title

    
def get_texts_and_titles_one_thread(filenames):
    texts, titles = [], []
    for i in range(1, n + 1):
        filename = str(i) + '.dat'
        try:
            with codecs.open(path + filename, 'r', 'utf-8') as f:
                url = f.readline().strip()
                soup = BeautifulSoup(f, 'lxml')
                title = soup.title.text
                text = soup.get_text()
                texts.append(text)
                titles.append(title)
        except Exception:
            continue
    return texts, titles


path = 'content/'
filenames = os.listdir(path)
titles = []
texts = []
n = 1000
# n - число файлов, которые будут прочитаны: 1.dat, 2.dat, ..., n.dat
# n_max = 28026

with ThreadPool(10) as pool:
    texts = list(pool.map(get_page_text, filenames[0:1001]))
pool.join()

with ThreadPool(10) as pool:
    titles = list(pool.map(get_title, filenames[0:1001]))
pool.join()

CPU times: user 11min 29s, sys: 5min 7s, total: 16min 36s
Wall time: 11min 40s


In [3]:
%%time
def normalize(text):
    punc = string.punctuation
    punc += '/n/a/b/f/r/t/v'
    morph = pymorphy2.MorphAnalyzer()
    text = ''.join([o if not o in punc else ' ' for o in text])
    text = list(morph.parse(word)[0].normal_form for word in text.split())
    return text


with ThreadPool(10) as pool:
    texts = list(pool.map(normalize, texts))
pool.join()

with ThreadPool(10) as pool:
    titles = list(pool.map(normalize, titles))
pool.join()

CPU times: user 1h 51min 49s, sys: 1min 25s, total: 1h 53min 15s
Wall time: 1h 55min 51s


In [4]:
%%time
from sklearn.feature_extraction.text import CountVectorizer

def choose_min_max_df(texts_w_norm, titles_w_norm):
    '''
    choose min_df & max_df parameters for CountVectorizer, TfIdfVectorizer.
    Isn't finished yet.
    '''
    
    min_df_arr = [0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1]
    max_df_arr = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    for min_df in min_df_arr:
        for max_df in max_df_arr:
            texts = [' '.join(text) for text in texts_w_norm]
            vectorizer = CountVectorizer(min_df=min_df, max_df=max_df)
            W = vectorizer.fit_transform(texts)
            words = vectorizer.get_feature_names()

            title_cntvect = CountVectorizer()
            titles = [' '.join(title) for title in titles_w_norm]
            title_W = title_cntvect.fit_transform(titles)
            title_words = title_cntvect.get_feature_names()

            print('min_df =', min_df, 'max_df =', max_df, 
                  'words_shape =', len(words), 
                  'title_words_shape =', len(title_words))
            # print(title_words)
            # print('\n\n', words, '\n\n')
            
            
min_df, max_df = 0.002, 0.5
texts = [' '.join(text) for text in texts]
vectorizer = CountVectorizer(min_df=min_df, max_df=max_df)
W = vectorizer.fit_transform(texts)
words = vectorizer.get_feature_names()

title_cntvect = CountVectorizer(min_df=min_df, max_df=max_df)
titles = [' '.join(title) for title in titles]
title_W = title_cntvect.fit_transform(titles)
title_words = title_cntvect.get_feature_names()

print('min_df =', min_df, 'max_df =', max_df, 
      'words_shape =', len(words), 
      'title_words_shape =', len(title_words))
            # print(title_words)
            # print('\n\n', words, '\n\n')

min_df = 0.03 max_df = 0.5 words_shape = 13165 title_words_shape = 3118
CPU times: user 26 s, sys: 4.11 s, total: 30.1 s
Wall time: 35.6 s


In [5]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df=min_df, max_df=max_df)

texts_W_tfidf = tfidf.fit_transform(texts)
titles_W_tfidf = tfidf.fit_transform(titles)
texts_W_mat = texts_W_tfidf.todense()
titles_W_mat = titles_W_tfidf.todense()

texts_W_vocab = tfidf.get_feature_names()
titles_W_vocab = tfidf.get_feature_names()

CPU times: user 23 s, sys: 312 ms, total: 23.4 s
Wall time: 23.7 s


In [14]:
from scipy.spatial.distance import pdist, squareform, cdist

def cos_dist(X):
    return squareform(pdist(X, metric='cosine'))

In [15]:
%%time
titles_cdist = cos_dist(titles_W_mat)

CPU times: user 1.56 s, sys: 31.2 ms, total: 1.59 s
Wall time: 1.69 s


In [7]:
%%time
texts_cdist = cos_dist(texts_W_mat)
texts_cdist

CPU times: user 9.64 s, sys: 0 ns, total: 9.64 s
Wall time: 9.78 s


array([[0.        , 0.9861884 , 0.99633716, ..., 0.98831151, 0.98478626,
        0.98684992],
       [0.9861884 , 0.        , 0.89828601, ..., 0.91741135, 0.92501084,
        0.96455643],
       [0.99633716, 0.89828601, 0.        , ..., 0.93168234, 0.98316882,
        0.98483372],
       ...,
       [0.98831151, 0.91741135, 0.93168234, ..., 0.        , 0.95407295,
        0.89713327],
       [0.98478626, 0.92501084, 0.98316882, ..., 0.95407295, 0.        ,
        0.98311164],
       [0.98684992, 0.96455643, 0.98483372, ..., 0.89713327, 0.98311164,
        0.        ]])

In [12]:
def texts_to_csv(texts_cdist, texts_W_mat):
    pd.DataFrame(texts_cdist).to_csv('texts_cdist.csv')
    pd.DataFrame(texts_W_mat).to_csv('texts_tfidf.csv')
    

def titles_to_csv(titles_cdist, titles_W_mat):
    pd.DataFrame(titles_cdist).to_csv('titles_cdist.csv')
    pd.DataFrame(titles_W_mat).to_csv('titles_tfidf.csv')
    

def get_from_csv(string):
    if string == 'titles':
        np_tfidf = np.array(pd.read_csv('titles_tfidf.csv'))
        np_cdist = np.array(pd.read_csv('titles_cdist.csv'))
    elif string == 'texts':
        np_tfidf = np.array(pd.read_csv('titles_tfidf.csv'))
        np_cdist = np.array(pd.read_csv('titles_cdist.csv'))
    return np_tfidf, np_cdist


test_title_idf, test_title_cdist = get_from_csv('titles')

In [13]:
test_title_idf


array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [9.98000000e+02, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 5.14251619e-01],
       [9.99000000e+02, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+03, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])