In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import string
from os.path import join 
import os
import pickle


# One record per article: (short key, article URL on snob.ru, full name).
# The full names are presumably the article authors -- confirm against the site.
# Keeping the three fields side by side makes the key/URL/name pairing explicit;
# `urls` and `full_names` below are derived from this table in the same order.
_ARTICLES = (
    ('Bowt',   'https://snob.ru/entry/185057/', 'Георгий Бовт'),
    ('Mlshtn', 'https://snob.ru/entry/185060/', 'Илья Мильштейн'),
    ('Kuval',  'https://snob.ru/entry/185010/', 'Станислав Кувалдин'),
    ('Dvdv',   'https://snob.ru/entry/184951/', 'Иван Давыдов'),
    ('Prav',   'https://snob.ru/entry/184852/', 'Ксения Праведная'),
    ('Mrz',    'https://snob.ru/news/184870/',  'Ольга Морозова'),
    ('Znam',   'https://snob.ru/entry/184780/', 'Анна Знаменская'),
    ('Mikol',  'https://snob.ru/entry/184642/', 'Дарья Миколайчук'),
    ('Inoz',   'https://snob.ru/entry/184764/', 'Владислав Иноземцев'),
    ('Mashk',  'https://snob.ru/entry/184611/', 'Диана Машкова'),
)

# Short key -> article URL (insertion order follows the table above).
urls = {key: link for key, link, _ in _ARTICLES}

# Full names, positionally aligned with the keys of `urls`.
full_names = [person for _, _, person in _ARTICLES]

# Directory (relative to the working directory) where the pickles are written.
DATA = 'transcripts'


In [2]:
# Make sure the output directory is there before any pickle is written,
# and report which of the two situations was encountered.
if os.path.exists(DATA):
    print("Directory " , DATA ,  " already exists")
else:
    os.makedirs(DATA)
    print("Directory " , DATA ,  " created ")

Directory  transcripts  already exists


In [3]:
# Scrape transcript data
def url_to_text(url, timeout=30):
    '''Fetch an article page from snob.ru and return its paragraphs.

    Parameters
    ----------
    url : str
        Address of the article page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` may block indefinitely.

    Returns
    -------
    list of str
        The text of every ``<p>`` found inside the article container,
        in document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx / 5xx).
    ValueError
        If the page does not contain the expected article container.
    '''
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 404/500 instead of parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")
    article = soup.find(class_="text entry__text js-mediator-article")
    if article is None:
        # Previously this surfaced as "'NoneType' object has no attribute 'find_all'".
        raise ValueError("article container not found at " + url)

    text = [p.text for p in article.find_all('p')]
    print(url+" --- DONE")
    return text




# Download every article once; keys are the short codes from `urls`.
scraped = {name: url_to_text(url) for name, url in urls.items()}

# Build the frame in a single step. The previous cell-by-cell fill used chained
# assignment (data.loc[name]['transcripts'] = ...), which writes to a temporary
# object under pandas Copy-on-Write and leaves the frame empty.
data = pd.DataFrame({'transcripts': list(scraped.values())}, index=list(scraped))

# One string per article: paragraphs joined with single spaces.
data['text'] = [' '.join(paragraphs) for paragraphs in data['transcripts']]
# `full_names` is positionally aligned with `urls`.
data['full_name'] = full_names
data.to_pickle(join(DATA, "transcripts.pkl"))

https://snob.ru/entry/185057/ --- DONE
https://snob.ru/entry/185060/ --- DONE
https://snob.ru/entry/185010/ --- DONE
https://snob.ru/entry/184951/ --- DONE
https://snob.ru/entry/184852/ --- DONE
https://snob.ru/news/184870/ --- DONE
https://snob.ru/entry/184780/ --- DONE
https://snob.ru/entry/184642/ --- DONE
https://snob.ru/entry/184764/ --- DONE
https://snob.ru/entry/184611/ --- DONE


In [4]:
## Clean data
def clean_round(text):
    '''Normalize raw article text before lemmatization.

    Steps, in order:
      1. lowercase and fold "ё" into "е";
      2. drop anything in square brackets;
      3. turn "/" into a space (must precede step 4, which would otherwise
         delete the slash and glue the neighbouring words together);
      4. strip ASCII punctuation;
      5. drop every token that contains a digit;
      6. drop em dashes and typographic quotes / ellipsis;
      7. turn newlines and non-breaking spaces into ordinary spaces;
      8. drop Latin letters;
      9. collapse runs of spaces into one.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned text. Leading/trailing single spaces are not stripped.
    '''
    # Lowercasing already maps "Ё" to "ё", so one replacement is enough.
    text = text.lower().replace('ё', 'е')
    text = re.sub(r'\[.*?\]', '', text)
    text = text.replace('/', ' ')
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = text.replace('—', '')
    text = re.sub('[‘’“”…»«]', '', text)
    # A newline separates words; replace it with a space rather than deleting it.
    text = text.replace('\n', ' ')
    text = re.sub('[a-z]+', '', text)
    text = text.replace('\xa0', ' ')
    text = re.sub(' +', ' ', text)
    return text


def cln(x):
    '''Element-wise adapter around clean_round for Series.apply.'''
    return clean_round(x)

# 1 -- Clean
# Single-column frame ('text') holding the cleaned article texts.
data_clean = data['text'].apply(cln).to_frame()

In [5]:
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation


In [6]:
# Stop-word list for Russian taken from the NLTK corpus.
russian_stopwords = stopwords.words("russian")
# Shared MyStem instance used for lemmatization.
mystem = Mystem()

def preprocess_text(text):
    '''Lemmatize text and drop stop words, blanks and punctuation.

    The lowercased text is passed to the shared ``mystem`` lemmatizer; every
    returned token is kept unless it is a Russian stop word, a single space,
    or (after stripping whitespace) a substring of ``string.punctuation``.
    The surviving tokens are joined with single spaces.
    '''
    kept = []
    for lemma in mystem.lemmatize(text.lower()):
        if lemma in russian_stopwords:
            continue
        if lemma == " ":
            continue
        if lemma.strip() in punctuation:
            continue
        kept.append(lemma)
    return " ".join(kept)


def prepro(x):
    '''Element-wise adapter around preprocess_text for Series.apply.'''
    return preprocess_text(x)


In [7]:
# 2 -- Lemmatization
# Lemmatize the cleaned texts, rebuild the single-column frame and attach the
# full names (positionally aligned with the rows) before saving.
data_clean = (
    data_clean['text']
    .apply(prepro)
    .to_frame()
    .assign(full_name=full_names)
)
data_clean.to_pickle(join(DATA, "clean.pkl"))


In [8]:
## Document-Term Matrix
from sklearn.feature_extraction.text import CountVectorizer


# Count vectorizer over the lemmatized texts, ignoring Russian stop words.
cv = CountVectorizer(stop_words=russian_stopwords)
data_cv = cv.fit_transform(data_clean.text)

# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() exists since 1.0. Fall back for older installs.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Dense document-term matrix: one row per article, one column per term.
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)

data_dtm.index = data_clean.index

In [9]:
# Save data
data_dtm.to_pickle(join(DATA, "dtm.pkl"))
# Use a context manager so the file is flushed and closed even if dump() fails;
# the bare open() used before left the handle open until garbage collection.
with open(join(DATA, "cv.pkl"), "wb") as cv_file:
    pickle.dump(cv, cv_file)