In [143]:
import collections
import glob
import re

In [144]:
class SourceCode:
    def __init__(self, source_code, comment_prefix='//'):
        self.lines = []
        self.comment_prefix = comment_prefix
        self.format_lines(source_code)

    def format_lines(self, lines):
        number = 0
        for line in lines:
            if self.check_comment(line):
                continue
            number += 1
            code = {}
            code['number'] = number
            code['indent'] = self.count_spaces(line, left=True)

            cleaned = self.clean_code(line)
            code['code'] = cleaned

            chars = cleaned.replace(' ', '')
            code['letters'] = self.parse_chars(chars, '[^a-zA-Z]+')
            code['marks'] = self.parse_chars(chars, '[a-zA-Z]+')

            self.lines.append(code)

    def check_comment(self, code):
        strip = code.lstrip()
        return strip.find(self.comment_prefix) == 0
    
    def count_spaces(self, code, left=False, right=False):
        total = code.count(' ')
        strip = ''
        if left:
            strip = code.lstrip()
        if right:
            strip = code.rstrip()
        return total - strip.count(' ')
    
    def clean_code(self, code):
        return code.strip('\n')
    
    def parse_chars(self, code, pattern):
        chars = re.sub(pattern, '', code)
        return [char for char in chars]
    
    def get_frequencies(self):
        letters = []
        marks = []
        for line in self.lines:
            letters += line['letters']
            marks += line['marks']
        return collections.Counter(letters), collections.Counter(marks)

In [163]:
def frequency_percentages(counter):
    total = sum([counter[k] for k in counter])
    return [(k, round((counter[k] / total) * 100.0, 2)) for k in counter]

In [165]:
most_common_rust_letters = collections.Counter()
most_common_rust_marks = collections.Counter()

files = glob.glob('**/*.rs', recursive=True)

for filename in files:
    with open(filename, mode='rt', encoding="utf-8") as file:
        # print(file.name)
        lines = file.readlines()
        source_code = SourceCode(lines)
        frequencies = source_code.get_frequencies()
        most_common_rust_letters += frequencies[0]
        most_common_rust_marks += frequencies[1]

print('Rust')
print('\nFile count:', len(files))

print('\nMost common Rust letters')
print(frequency_percentages(most_common_rust_letters))
print('\nMost common Rust marks')
print(frequency_percentages(most_common_rust_marks))

Rust

File count: 169

Most common Rust letters
[('e', 11.96), ('x', 2.24), ('t', 5.15), ('r', 3.28), ('n', 2.54), ('c', 7.66), ('a', 8.78), ('g', 0.65), ('u', 2.61), ('s', 3.78), ('d', 7.26), ('v', 0.36), ('p', 1.77), ('h', 0.87), ('P', 0.16), ('f', 7.73), ('m', 1.28), ('i', 2.56), ('l', 2.44), ('T', 3.18), ('A', 3.46), ('R', 0.15), ('G', 2.63), ('E', 0.4), ('w', 0.55), ('o', 2.15), ('H', 0.03), ('O', 0.1), ('S', 0.33), ('C', 3.46), ('X', 0.04), ('N', 0.11), ('b', 7.0), ('y', 0.69), ('k', 0.39), ('B', 0.42), ('W', 0.03), ('V', 0.08), ('K', 0.06), ('z', 0.22), ('U', 0.02), ('D', 0.34), ('I', 0.16), ('M', 0.13), ('L', 0.09), ('q', 0.2), ('F', 0.44), ('j', 0.06), ('Y', 0.02), ('J', 0.0), ('Z', 0.01), ('Q', 0.01)]

Most common Rust marks
[(';', 1.51), (':', 1.79), ('(', 1.99), (')', 1.99), ('{', 0.77), ('=', 1.12), ('"', 0.86), ('.', 1.5), ('&', 0.54), ('/', 0.07), ('_', 2.0), ('8', 7.35), ('6', 7.23), ('4', 7.29), (',', 4.58), ('}', 0.77), ('[', 0.74), (']', 0.74), ('-', 0.9), ('>', 0.54