# Vim Sensei

Make Vim discoverable!

In [1]:
from collections import Counter
from pprint import pprint

In [2]:
ascii_control_codes = {
    '\\x00': '^@',
    '\\x01': '^A',
    '\\x02': '^B',
    '\\x03': '^C',
    '\\x04': '^D',
    '\\x05': '^E',
    '\\x06': '^F',
    '\\x07': '^G',
    '\\x08': '^H',
    '\\x09': '^I',
    '\\x0a': '^J',
    '\\x0b': '^K',
    '\\x0c': '^L',
    '\\x0d': '^M',
    '\\x0e': '^N',
    '\\x0f': '^O',
    '\\x10': '^P',
    '\\x11': '^Q',
    '\\x12': '^R',
    '\\x13': '^S',
    '\\x14': '^T',
    '\\x15': '^U',
    '\\x16': '^V',
    '\\x17': '^W',
    '\\x18': '^X',
    '\\x19': '^Y',
    '\\x1a': '^Z',
    '\\x1b': '^[',
    '\\x1c': '^\\',
    '\\x1d': '^]',
    '\\x1e': '^^',
    '\\x1f': '^_',
    '\\x7f': '^?',
}

def human_readable(token):
    if token in ascii_control_codes:
        return ascii_control_codes[token]
    if token.startswith('\\x1b['):
        return token.replace('\\x1b[', '^')
    return token

In [3]:
frequencies_for_tokens = Counter()
frequencies_for_bigrams = Counter()
frequencies_for_trigrams = Counter()
previous_token = None
preprevious_token = None
with open('vim.log', 'r') as f:
    for line in f:
        token = line.replace('\n', '')
        token = human_readable(token)
        frequencies_for_tokens[token] += 1
        if previous_token is not None:
            frequencies_for_bigrams[previous_token+token] += 1
        if preprevious_token is not None:
            frequencies_for_trigrams[preprevious_token+previous_token+token] += 1
        preprevious_token = previous_token
        previous_token = token

In [4]:
pprint(frequencies_for_tokens.most_common(100))

[('j', 53763),
 ('k', 38341),
 ('w', 27019),
 ('n', 14119),
 (':', 13744),
 ('0', 11727),
 ('l', 11093),
 ('^O', 10630),
 ('^I', 10022),
 ('z', 10013),
 (' ', 9536),
 ('g', 8082),
 ('/', 7941),
 ('d', 7731),
 ('^D', 7480),
 ('i', 7157),
 ('b', 6528),
 ('^[', 6239),
 ('a', 5372),
 ('\\r', 5332),
 ('c', 4997),
 ('q', 4886),
 ('h', 4756),
 ('e', 4360),
 ('t', 4320),
 ('o', 4231),
 ('p', 3827),
 ('^W', 3709),
 ('f', 3642),
 ('u', 3486),
 ('_', 3205),
 ('r', 3001),
 ('v', 2733),
 (']', 2668),
 ('-', 2534),
 ('y', 2534),
 ('jj', 2348),
 ('*', 2267),
 ('^E', 2195),
 ('[', 2176),
 ('s', 2135),
 (',', 2131),
 ('V', 2098),
 ('.', 2051),
 ('?', 1825),
 ('%', 1794),
 ('^?', 1777),
 ('^U', 1603),
 ('^L', 1560),
 ('$', 1523),
 ('^H', 1506),
 ('kk', 1438),
 ("'", 1422),
 ('=', 1302),
 ('N', 1263),
 ('G', 1245),
 ('x', 1224),
 ('}', 1204),
 ('{', 1154),
 ('W', 1132),
 ('#', 1126),
 ('^T', 1046),
 ('A', 1015),
 (';', 880),
 (')', 858),
 ('\\t', 851),
 ('m', 747),
 ('^R', 651),
 ('^Q', 649),
 ('(', 628)

In [5]:
pprint(frequencies_for_bigrams.most_common(200))

[('jj', 31137),
 ('kk', 20962),
 ('^O^I', 9906),
 ('ww', 9170),
 ('nn', 7567),
 ('0w', 6985),
 ('^D^D', 5364),
 ('jk', 4649),
 ('zz', 3683),
 ('kj', 3608),
 ('^I^O', 3478),
 ('ll', 3047),
 ('jjj', 2968),
 ('jw', 2734),
 ('bb', 2655),
 ('gg', 2442),
 ('w:', 2396),
 ('_l', 2226),
 ('zt', 2157),
 ('wj', 2113),
 ('::', 1959),
 ('kkk', 1875),
 (':j', 1801),
 ('ld', 1755),
 ('^E^E', 1731),
 ('kw', 1714),
 ('/n', 1536),
 ('iw', 1518),
 ('hh', 1493),
 (' l', 1491),
 ('dd', 1464),
 (':^O', 1452),
 ('uu', 1322),
 ('wk', 1308),
 ('ci', 1263),
 ('wi', 1216),
 (' g', 1208),
 ('j0', 1206),
 ('0:', 1159),
 (':k', 1092),
 ('k0', 1017),
 ('^U^U', 987),
 ('jjjj', 932),
 ('*n', 921),
 ('^Ij', 900),
 ('ee', 874),
 ('j:', 859),
 ('^[k', 813),
 ('nz', 813),
 (':\\r', 805),
 ('//', 797),
 ('k:', 779),
 ('j\\r', 760),
 ('lr', 755),
 ('^I:', 744),
 ('^[j', 736),
 ('V^[', 735),
 ('w0', 717),
 ('gs', 714),
 ('lq', 711),
 (',j', 700),
 ('kd', 700),
 ('^[0', 692),
 ('dj', 689),
 ('^Wv', 679),
 ('Vj', 674),
 ('w*',

In [6]:
pprint(frequencies_for_trigrams.most_common(50))

[('jjj', 21131),
 ('kkk', 13712),
 ('nnn', 4986),
 ('www', 4910),
 ('^D^D^D', 3931),
 ('^I^O^I', 3461),
 ('^O^I^O', 3446),
 ('jjjjj', 2557),
 ('jjk', 1922),
 ('jjjj', 1802),
 ('0w:', 1786),
 ('jkk', 1770),
 ('lll', 1705),
 ('_ld', 1495),
 (':^O^I', 1443),
 ('kkkkk', 1403),
 ('kkj', 1343),
 ('kkkk', 1313),
 ('bbb', 1312),
 ('^E^E^E', 1300),
 ('kjj', 1284),
 ('jjw', 1085),
 ('wjj', 986),
 ('uuu', 977),
 (':jj', 957),
 ('jkj', 910),
 ('^O^Ij', 887),
 ('hhh', 813),
 ('j0w', 803),
 ('kjk', 799),
 ('jww', 784),
 ('/nn', 761),
 ('0wj', 752),
 ('^O^I:', 738),
 ('kkw', 725),
 ('ciw', 710),
 ('k0w', 683),
 (' lq', 683),
 ('^O^I^O^I', 670),
 ('^U^U^U', 653),
 ('jwj', 651),
 ('0ww', 644),
 ('kww', 640),
 ('gg/', 590),
 (':::', 590),
 (':kk', 589),
 ('nzz', 573),
 ('^O^Ik', 566),
 ('*nn', 561),
 ('wkk', 558)]


## TODO

1. Break up clumped tokens