# Counting consonant clusters

To explore possible consonant clusters in Russian, tabulate those in *War and Peace*.

## Import packages

In [86]:
import re
from collections import defaultdict

## Constants

In [87]:
cs = 'бвгджзйклмнпрстфхцчшщ' # consonants
ccpat = re.compile("[" + cs + "]{2,}") # two or more consecutive consonants

## Data

In [88]:
with open("voina-i-mir.txt") as f: # read in all of War and Peace
    text = f.read()
text = re.sub(r"\s+", " ", text.lower()) # whitespace and case normalize

## Count occurrences of each type of consonant cluster

In [89]:
ccs = ccpat.findall(text) # all consonant clusters (292016)
ccdict = defaultdict(int)
for i in ccs:
    ccdict[i] = ccdict[i] + 1
print("There are", len(ccdict), "unique clusters in the text")

There are 616 unique clusters in the text


## Sort and print results

In [90]:
sorted_ccdict = sorted(ccdict.items(), key = lambda x: len(x[0]), reverse=True)
sorted_ccdict[:10]
for i in reversed(range(1, max([len(str(i)) for i in sorted_ccdict[0]]) + 1)):
    hits = [item[0] for item in sorted_ccdict if len(item[0]) == i]
    print("\nThere are", len(hits), "unique consonant clusters of", i, "letters:", end=" ")
    print([sorted(hits, key=lambda x: x[0])][0])


There are 3 unique consonant clusters of 5 letters: ['дрств', 'нтств', 'рптск']

There are 60 unique consonant clusters of 4 letters: ['бств', 'бстр', 'вств', 'встр', 'взгл', 'вскр', 'вспр', 'вспл', 'вздр', 'всхл', 'взбр', 'врбн', 'вздв', 'врск', 'всхр', 'гств', 'дств', 'дстр', 'дтск', 'жств', 'здрж', 'йств', 'ймск', 'йнск', 'йхшт', 'ккск', 'мств', 'нстр', 'нств', 'нтск', 'ндшт', 'ндск', 'нстк', 'нтрб', 'нскр', 'рств', 'ргск', 'рнск', 'рстн', 'рррр', 'рдск', 'рщвл', 'рстг', 'рбск', 'рмск', 'ртсг', 'ствл', 'сстр', 'сств', 'сспр', 'сткл', 'стск', 'стпл', 'тств', 'тмст', 'тстр', 'тпрр', 'фств', 'шпрш', 'шшшш']

There are 248 unique consonant clusters of 3 letters: ['бст', 'бшл', 'бхв', 'бск', 'бгл', 'вск', 'вдр', 'всп', 'вст', 'всл', 'втр', 'взд', 'взм', 'взл', 'впр', 'взр', 'взв', 'впл', 'вкн', 'взб', 'вгл', 'вдв', 'вбл', 'взн', 'всх', 'взг', 'всм', 'вкл', 'вкр', 'гср', 'гсб', 'гск', 'дст', 'дхв', 'дск', 'дтв', 'дпр', 'дпл', 'дкр', 'дтр', 'двк', 'дсв', 'дсм', 'дгр', 'дсл', 'дзн', 'дкл',