In [1]:
import json, string, ufal.morphodita
from collections import Counter
from wordcloud import WordCloud

morpho = ufal.morphodita.Morpho.load("czech-morfflex-161115.dict")

with open("data.json", encoding="utf8") as file:
    data = json.load(file)

In [2]:
sixties = [entry for entry in data if int(entry["year"]) > 1960 and int(entry["year"]) < 1970]
seventies = [entry for entry in data if int(entry["year"]) >= 1970 and int(entry["year"]) < 1980]
eighties = [entry for entry in data if int(entry["year"]) >= 1980 and int(entry["year"]) < 1990]
nineties = [entry for entry in data if int(entry["year"]) >= 1990 and int(entry["year"]) < 2000]
oughts = [entry for entry in data if int(entry["year"]) >= 2000 and int(entry["year"]) < 2010]
tens = [entry for entry in data if int(entry["year"]) >= 2010]

In [3]:
def lemmatize(word):
    lemmas = ufal.morphodita.TaggedLemmas()
    morpho.analyze(word, morpho.GUESSER, lemmas)
    lemma = lemmas[0].lemma.split("`")[0].split("_")[0].split("-")[0]
    if lemma == "říci": return "říkat"
    if lemma == "žnout": return "žít"
    if lemma == "ránu": return "ráno"
    return lemma

In [4]:
def lemmatized_words(data):
    split_words = ''.join(entry["text"] for entry in data).translate(str.maketrans('', '', string.punctuation)).lower().split()
    return [lemmatize(word) for word in split_words]

In [5]:
def wordcloud(words):
    # ["Slovo", "být", "v", "a", "sebe", "na", "ten", "s", "z", "že", "který", "o", "mít", "i", "do", "on", "k", "pro", "tento", "za", "by", "moci", "svůj", "ale", "po", "rok", "jako", "však", "od", "všechen", "dva", "nebo", "tak", "u", "při", "jeden", "podle", "Praha", "jen", "další", "jeho", "aby", "co", "český", "jak", "veliký", "nový", "až", "už", "muset", "než", "nebýt", "člověk", "jenž", "léto", "firma", "první", "náš", "také", "my", "jejich", "když", "před", "doba", "chtít", "jiný", "mezi", "ještě", "já", "ani", "cena", "již", "jít", "strana", "či", "druhý"]
    stopwords = ["být", "v", "a", "sebe", "na", "ten", "s", "se", "Company", "taka", "le", "z", "že", "který", "o", "mít", "i", "do", "on", "k", "pro", "už", "za", "jak", "ala", "jen", "až", "po", "pak"]
    wordcloud = WordCloud(stopwords=stopwords,width=800,height=400,normalize_plurals=False,collocations=False,background_color="white").generate(" ".join(words))
    return wordcloud.to_image()

In [69]:
lemmas = ufal.morphodita.TaggedLemmas()
morpho.analyze("ránu", morpho.GUESSER, lemmas)
for lemma in lemmas: print(lemma.lemma)

rána
ráno-1
ráno-1


In [68]:
wordcloud(lemmatized_words(sixties)).save("wc_60.png")
wordcloud(lemmatized_words(seventies)).save("wc_70.png")
wordcloud(lemmatized_words(eighties)).save("wc_80.png")
wordcloud(lemmatized_words(nineties)).save("wc_90.png")
wordcloud(lemmatized_words(oughts)).save("wc_00.png")
wordcloud(lemmatized_words(tens)).save("wc_10.png")
wordcloud(lemmatized_words(data)).save("wc_total.png")

In [8]:
split_words = ''.join(entry["text"] for entry in data).translate(str.maketrans('', '', string.punctuation)).lower().split()
stopwords = ["být", "v", "a", "sebe", "na", "ten", "s", "se", "Company", "taka", "le", "z", "že", "který", "o", "mít", "i", "do", "on", "k", "pro", "už", "za", "jak", "ala", "jen", "až", "po", "pak"]
print(len(split_words))
print(Counter([lemmatize(word) for word in split_words if lemmatize(word) not in stopwords]).most_common())

54085
[('já', 1969), ('ty', 638), ('láska', 502), ('tvůj', 497), ('můj', 475), ('daleko', 429), ('svůj', 418), ('jít', 406), ('rád', 393), ('když', 373), ('chtít', 340), ('dát', 324), ('vědět', 321), ('sám', 287), ('znát', 283), ('říkat', 239), ('den', 238), ('kdo', 208), ('stát', 205), ('sen', 199), ('hodně', 198), ('všechen', 197), ('r', 196), ('teď', 188), ('čas', 181), ('spát', 179), ('svět', 170), ('žít', 170), ('píseň', 169), ('tam', 157), ('noc', 155), ('moci', 153), ('proč', 150), ('hrát', 148), ('ať', 134), ('jako', 130), ('zpívat', 129), ('chvíle', 128), ('snad', 126), ('jeden', 124), ('kde', 120), ('krásný', 118), ('rána', 117), ('pár', 106), ('zůstat', 106), ('zas', 103), ('kam', 103), ('oko', 99), ('tvář', 94), ('než', 93), ('málo', 92), ('tma', 91), ('každý', 91), ('2', 90), ('zdát', 87), ('muset', 86), ('brzy', 85), ('stále', 84), ('jenom', 84), ('dnes', 82), ('dívka', 80), ('dva', 79), ('1', 79), ('srdce', 78), ('mnout', 78), ('znít', 75), ('smět', 75), ('žádný', 74), (