In [2]:
import requests
from bs4 import BeautifulSoup as bs
import re

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
headers = {'User-Agent': user_agent}
r = requests.get('https://meduza.io/rss/en/all', headers=headers)

soup = bs(r.text, 'xml')
links = [l.text for l in soup.findAll('link') if 'https://meduza.io/en/' in l.text]

In [8]:
def parse_meduza_page(link):
    r = requests.get(link, headers=headers)
    
    soup = bs(r.text, 'lxml')
    paragraphs = re.compile('^SimpleBlock-.*')
    titles = re.compile('.*Title-root$')
    
    pagetext = ' '.join([p.text for p in soup.findAll(True, paragraphs)])
    title = soup.find(True, titles).text
    timestamp = soup.find(True, 'Timestamp-root').text
    
    return ({
        'pagetext': pagetext,
        'title': title,
        'timestamp': timestamp,
        'url': r.url
    })

In [14]:
articles = [parse_meduza_page(l) for l in links] 

In [15]:
import pandas as pd

In [18]:
df = pd.DataFrame.from_records(articles)

In [19]:
df.to_csv('./meduza.csv')

In [129]:
df = pd.read_csv('./meduza.csv')
df = df.dropna()

In [130]:
from sklearn.feature_extraction.text import  TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,  stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(df['pagetext'])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()


In [131]:
from sklearn.decomposition import NMF
# nmf = NMF(n_components = 5, alpha=.1, l1_ratio=.5, init='nndsvd', random_state=42).fit(tfidf)
nmf = NMF(n_components = 7, random_state=84).fit(tfidf)

In [132]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
putin amendments vote changes constitution group constitutional kommersant reforms president
Topic 1:
cases moscow coronavirus china italy sobyanin countries new russia chinese
Topic 2:
turkish syrian russian syria erdogan conflict military city idlib troops
Topic 3:
news media service russian agency google tax fns federal company
Topic 4:
medical rospotrebnadzor case patient february 19 covid russia citizen infection
Topic 5:
year forum winter petersburg city group really st like annual
Topic 6:
meduza investigative case network committee treatment source drugs told ekaterina


In [143]:
# topics = ['Constitutional changes', 'COVID', 'Turkish-Russian Conflict', 'Taxes', 'COVID in Russia', 'Protests?', 
#          'Topic 6', 'Topic 7', 'Topic 8', 'Topic 9']
topics = ['Constitutional changes', 'COVID-19', 'Turkish-Russian Conflict', 'General?',
          'COVID-19 in Russia', 'St. Petersburg Forum / St. Petersburg', 'Investigations / Crime']
rates = pd.DataFrame(nmf.transform(tfidf), columns=topics)

In [144]:
rates.T.idxmax()

0                                  COVID-19
1                                  COVID-19
2                    Constitutional changes
3                                  COVID-19
4                                  COVID-19
5                  Turkish-Russian Conflict
6     St. Petersburg Forum / St. Petersburg
7                        COVID-19 in Russia
8                    Investigations / Crime
9                                  COVID-19
10                   Investigations / Crime
11                   Constitutional changes
12                                 COVID-19
13                                 General?
14                   Constitutional changes
15                                 General?
16                 Turkish-Russian Conflict
17    St. Petersburg Forum / St. Petersburg
18                       COVID-19 in Russia
19                   Constitutional changes
20    St. Petersburg Forum / St. Petersburg
21                   Constitutional changes
22                   Constitutio

In [145]:
df.title.dropna()

0     Why are there so few reported COVID-19 cases i...
1     Italian Embassy tells Italians not to travel t...
2     Putin prioritizes ‘stability’ over ‘alternatio...
3     Six new coronavirus cases confirmed in Russia....
4     Moscow institutes state of heightened prepared...
5         Russia and Turkey agree on ceasefire in Idlib
6     St. Petersburg International Economic Forum ca...
7           Seventh coronavirus case reported in Russia
8     Russian investigators order psychological exam...
10    On the border How the Russian region most inte...
11    Police reportedly discover remains of second m...
12    ‘Which god did Putin have in mind, exactly?’ I...
13    Moscow hospital set aside for coronavirus pati...
14    Russian media regulator accuses ‘BBC World New...
15    Russia's nationwide vote on Constitutional ref...
16    ‘Relax a bit behind bars’ Here’s a summary of ...
17    Russian military police enter Saraqib, Syrian ...
18    Feast your eyes on the craziest Maslenitsa

In [149]:
pd.DataFrame([df.title.values, rates.T.idxmax().values]).T

Unnamed: 0,0,1
0,Why are there so few reported COVID-19 cases i...,COVID-19
1,Italian Embassy tells Italians not to travel t...,COVID-19
2,Putin prioritizes ‘stability’ over ‘alternatio...,Constitutional changes
3,Six new coronavirus cases confirmed in Russia....,COVID-19
4,Moscow institutes state of heightened prepared...,COVID-19
5,Russia and Turkey agree on ceasefire in Idlib,Turkish-Russian Conflict
6,St. Petersburg International Economic Forum ca...,St. Petersburg Forum / St. Petersburg
7,Seventh coronavirus case reported in Russia,COVID-19 in Russia
8,Russian investigators order psychological exam...,Investigations / Crime
9,On the border How the Russian region most inte...,COVID-19


In [150]:
df.title.values

array(['Why are there so few reported COVID-19 cases in Russia?',
       'Italian Embassy tells Italians not to travel to Russia to avoid harsh Moscow quarantine measures',
       'Putin prioritizes ‘stability’ over ‘alternation of power,’ calls his position ‘fate,’ says he wants to avoid diarchy after 2024',
       "Six new coronavirus cases confirmed in Russia. They're all mild and linked to Italy.",
       'Moscow institutes state of heightened preparedness amid coronavirus spread',
       'Russia and Turkey agree on ceasefire in Idlib',
       'St. Petersburg International Economic Forum cancelled due to coronavirus outbreak',
       'Seventh coronavirus case reported in Russia',
       'Russian investigators order psychological examination for ‘Meduza’ journalist falsely charged with drug possession',
       'On the border How the Russian region most intertwined with China is coping economically amid the coronavirus outbreak',
       'Police reportedly discover remains of second m