In [1]:
import collections
import glob
import re
from IPython.display import display, clear_output

In [2]:
class SourceCode:
    """Line-by-line lexical breakdown of a source file.

    Lines whose first non-whitespace text is ``comment_prefix`` are skipped.
    Every other line is stored in ``self.lines`` as a dict with the keys:

    * ``number``  - 1-based position among the retained lines
    * ``indent``  - number of space characters in the leading whitespace
    * ``code``    - the line with surrounding newlines stripped
    * ``words``   - runs of ASCII letters longer than one character
    * ``letters`` - every ASCII letter on the line, in order
    * ``marks``   - every non-letter, non-whitespace character, in order
                    (digits are therefore included)
    """

    def __init__(self, source_code, comment_prefix='//'):
        """Parse ``source_code``, an iterable of line strings.

        ``comment_prefix`` is the marker that identifies a full-line comment.
        """
        self.lines = []
        self.comment_prefix = comment_prefix
        self.format_lines(source_code)

    def format_lines(self, lines):
        """Append one parsed record to ``self.lines`` per non-comment line."""
        number = 0
        for line in lines:
            if self.check_comment(line):
                continue
            number += 1

            cleaned = self.clean_code(line)
            # Drop *all* whitespace, not only ' ': otherwise tabs and '\r'
            # (tab-indented or CRLF files) end up counted as marks.
            chars = ''.join(cleaned.split())

            self.lines.append({
                'number': number,
                'indent': self.count_spaces(line, left=True),
                'code': cleaned,
                'words': self.parse_words(cleaned),
                'letters': self.parse_chars(chars, '[^a-zA-Z]+'),
                'marks': self.parse_chars(chars, '[a-zA-Z]+'),
            })

    def check_comment(self, code):
        """Return True when ``code`` starts with the comment prefix after
        leading whitespace is ignored."""
        return code.lstrip().startswith(self.comment_prefix)

    def count_spaces(self, code, left=False, right=False):
        """Count space characters in the leading and/or trailing whitespace.

        With neither flag set, every space in ``code`` is counted. With both
        flags set, leading and trailing spaces are both counted.
        """
        kept = code
        if left:
            kept = kept.lstrip()
        if right:
            kept = kept.rstrip()
        if kept is code:
            # Neither side requested: report the total number of spaces.
            return code.count(' ')
        # Spaces removed by stripping = total minus those that survived.
        return code.count(' ') - kept.count(' ')

    def clean_code(self, code):
        """Strip newline characters from both ends of ``code``."""
        return code.strip('\n')

    def parse_words(self, code):
        """Return the runs of ASCII letters in ``code`` that are longer than
        one character."""
        words = re.sub('[^a-zA-Z]+', ' ', code).split(' ')
        return [w for w in words if len(w) > 1]

    def parse_chars(self, code, pattern):
        """Remove everything matching ``pattern`` from ``code`` and return the
        remaining characters as a list."""
        return list(re.sub(pattern, '', code))

    def get_frequencies(self):
        """Return ``(words, letters, marks)`` Counters aggregated over all
        parsed lines."""
        words = collections.Counter()
        letters = collections.Counter()
        marks = collections.Counter()
        for line in self.lines:
            words.update(line['words'])
            letters.update(line['letters'])
            marks.update(line['marks'])
        return words, letters, marks

In [3]:
def frequency_percentages(counts):
    """Turn ``(item, count)`` pairs into ``(item, percent)`` pairs.

    ``counts`` is a sequence whose entries hold the item at index 0 and its
    count at index 1. Each percentage is the entry's share of the summed
    counts, rounded to two decimal places. Input order is preserved.
    """
    grand_total = 0
    for entry in counts:
        grand_total += entry[1]

    shares = []
    for entry in counts:
        percent = round((entry[1] / grand_total) * 100.0, 2)
        shares.append((entry[0], percent))
    return shares

In [149]:
class ProgressBar:
    """Text progress bar that advances each time it is converted to a string.

    The bar has ``total // group`` cells followed by ``'|'`` and a percentage.
    Note that ``str(bar)`` (and therefore ``print(bar)``) has a side effect:
    it counts as one completed step. A cell is filled on the first step and
    on every step whose number is a multiple of ``group``.

    ``self.zero`` is a helper object whose ``str()`` renders the empty bar at
    0% without advancing the counter.
    """

    def __init__(self, total, group=1):
        """``total`` is the number of steps; ``group`` is how many steps one
        bar cell represents."""
        self.total = total
        self.group = group
        self.count = 1
        self.bar = self._empty_bar()
        self.zero = self._build_zero()

    def __str__(self):
        """Advance by one step and return the rendered bar with percentage."""
        if self.total <= 0:
            # Nothing to do: avoid dividing by zero and report completion.
            return self.bar + '100%'
        if self.count < self.total:
            count = self._update()
        else:
            count = self.total
        return self.bar + str(int((count / self.total) * 100)) + '%'

    def _empty_bar(self):
        """Return the bar with no cells filled."""
        return ' ' * (self.total // self.group) + '|'

    def _build_zero(self):
        """Build an object that renders the initial, empty bar at 0%."""
        bar = self.bar

        class Zero:
            def __str__(self):
                return bar + '0%'
        return Zero()

    def _update(self):
        """Record one step, fill a cell when due, and return the step number
        that was just recorded."""
        current_count = self.count
        due = (current_count == 1) or (current_count % self.group == 0)
        # Only swap an empty cell for a filled one while an empty cell is
        # left; otherwise the bar would lose filled cells or grow.
        if due and self.bar.endswith(' |'):
            self.bar = '░' + self.bar[:-2] + '|'
        self.count += 1
        # Always report the step just taken so the percentage is consistent
        # between fill and non-fill steps.
        return current_count

    def reset(self):
        """Rewind to the first step and clear every filled cell."""
        self.count = 1
        self.bar = self._empty_bar()

In [None]:
import time

# Running totals accumulated over every Rust file processed so far.
most_common_rust_words = collections.Counter()
most_common_rust_letters = collections.Counter()
most_common_rust_marks = collections.Counter()

# Recursively collect every .rs file below the current working directory.
files = glob.glob('**/*.rs', recursive=True)
file_count = len(files)

print('Rust')
print('\nFile count:', file_count)

progress = ProgressBar(file_count, group=3)
c = 0  # stays 0 when no files are found

for c, filename in enumerate(files, start=1):
    # Read the whole file first so the handle is closed before the
    # comparatively slow parsing, rendering and sleep below.
    with open(filename, mode='rt', encoding="utf-8") as file:
        lines = file.readlines()

    source_code = SourceCode(lines)
    frequencies = source_code.get_frequencies()
    most_common_rust_words += frequencies[0]
    most_common_rust_letters += frequencies[1]
    most_common_rust_marks += frequencies[2]

    # Redraw the cell output in place; wait=True defers clearing until the
    # new output is ready.
    clear_output(wait=True)
    print('Files: ' + str(c) + ' of ' + str(file_count))
    print(progress)  # printing the bar also advances it by one step
    print('\nMost common Rust words')
    print(frequency_percentages(most_common_rust_words.most_common(100)))
    print('\nMost common Rust letters')
    print(frequency_percentages(most_common_rust_letters.most_common()))
    print('\nMost common Rust marks')
    print(frequency_percentages(most_common_rust_marks.most_common()))
    # Short pause between refreshes - presumably to keep each update visible.
    time.sleep(0.2)

Files: 219 of 507
░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░                                                                                               |43%

Most common Rust words
[('self', 5.04), ('fn', 4.85), ('let', 3.91), ('assert', 3.19), ('pub', 2.9), ('use', 2.83), ('mut', 2.51), ('str', 2.2), ('uri', 1.84), ('rocket', 1.67), ('name', 1.64), ('string', 1.56), ('for', 1.51), ('eq', 1.51), ('test', 1.45), ('impl', 1.39), ('from', 1.36), ('inline', 1.36), ('Some', 1.35), ('to', 1.32), ('new', 1.27), ('crate', 1.27), ('path', 1.24), ('into', 1.21), ('is', 1.19), ('parse', 1.17), ('route', 1.13), ('response', 1.11), ('if', 1.09), ('get', 1.04), ('Result', 1.04), ('match', 1.01), ('type', 1.0), ('as', 0.95), ('None', 0.87), ('always', 0.87), ('request', 0.86), ('config', 0.81), ('macro', 0.81), ('Ok', 0.81), ('std', 0.8), ('fmt', 0.77), ('static', 0.77), ('req', 0.77), ('value', 0.74), ('key', 0.73), ('data', 0.73), ('Self', 0.73), ('Option', 0.72),

In [84]:
import collections
import re

chars = ''
most_common_words = []

# Each line is expected to hold a word and its occurrence count separated by
# whitespace - TODO confirm against the data file's actual layout.
with open('./data/google-books-common-words.txt', mode='rt', encoding='utf-8') as file:
    lines = file.readlines()

# Slicing (rather than indexing up to 100) tolerates files shorter than 100 lines.
for line in lines[:100]:
    # Collapse the line to "<word><count>" by removing all whitespace.
    line = ''.join(line.split())
    if not line:
        continue
    # '[A-Za-z]' instead of '[A-z]': the latter also matches [ \ ] ^ _ `
    occurrences = re.sub(r'[A-Za-z]+', '', line)
    # Raw string so '\d' is a regex escape, not an invalid string escape.
    word = re.sub(r'\d+', '', line)
    most_common_words.append((word, int(occurrences)))

# Concatenate the words so letter frequencies can be counted across them.
chars = ''.join(word for word, _ in most_common_words)

print('\nMost common book words')
print(frequency_percentages(most_common_words))

print('\nMost common book letters')
print(frequency_percentages(collections.Counter(chars).most_common()))


Most common book words
[('THE', 15.25), ('OF', 8.89), ('AND', 6.5), ('TO', 5.56), ('IN', 4.85), ('A', 4.4), ('IS', 2.41), ('THAT', 2.3), ('FOR', 1.88), ('IT', 1.65), ('AS', 1.64), ('WAS', 1.58), ('WITH', 1.49), ('BE', 1.38), ('BY', 1.35), ('ON', 1.32), ('NOT', 1.3), ('HE', 1.18), ('I', 1.12), ('THIS', 1.1), ('ARE', 1.06), ('OR', 1.05), ('HIS', 1.04), ('FROM', 1.0), ('AT', 0.98), ('WHICH', 0.9), ('BUT', 0.8), ('HAVE', 0.8), ('AN', 0.78), ('HAD', 0.75), ('THEY', 0.71), ('YOU', 0.67), ('WERE', 0.65), ('THEIR', 0.62), ('ONE', 0.62), ('ALL', 0.59), ('WE', 0.59), ('CAN', 0.48), ('HER', 0.47), ('HAS', 0.47), ('THERE', 0.47), ('BEEN', 0.46), ('IF', 0.45), ('MORE', 0.44), ('WHEN', 0.44), ('WILL', 0.43), ('WOULD', 0.42), ('WHO', 0.42), ('SO', 0.42), ('NO', 0.4), ('SHE', 0.4), ('OTHER', 0.4), ('ITS', 0.39), ('MAY', 0.38), ('THESE', 0.37), ('WHAT', 0.35), ('THEM', 0.34), ('THAN', 0.34), ('SOME', 0.34), ('HIM', 0.34), ('TIME', 0.34), ('INTO', 0.33), ('ONLY', 0.33), ('DO', 0.32), ('SUCH', 0.32), ('