In [49]:
import logging
import re
import os
import gensim
from gensim import corpora, models, similarities
# Configure the root logger to show timestamped messages at INFO level and above,
# so progress reported through `logging` by the libraries used below is visible.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [80]:
# Token -> integer-id mapping shared by the notebook. CodeWalker.iter_dictionaries
# extends it as files are read, and the LDA step uses it as `id2word`.
dictionary = corpora.dictionary.Dictionary()

# Captures each capital letter that starts a new word inside an identifier:
#   * a capital preceded by a lower-case letter or digit   (fooBar  -> foo|Bar)
#   * a non-leading capital followed by a lower-case letter (HTTPFoo -> HTTP|Foo)
CAMELCASE_TO_UNDERSCORE_RE = re.compile(
    '((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))'
)


class CodeWalker(object):
    """Stream cleaned identifier sub-words out of the ``.py`` files under a directory.

    Iterating over an instance yields one sub-word at a time across every
    file; ``iter_dictionaries`` yields one bag-of-words vector per file.

    Parameters
    ----------
    root : str, optional
        Directory to walk recursively. When omitted, the class-level ``root``
        (the original hard-coded location) is used, so ``CodeWalker()`` scans
        the same tree as before.
    """

    # Default tree to scan; overridden per instance through ``__init__``.
    root = '/Users/camerondavidson-pilon/code/PyconCanada2015/scrapers/'

    # Every maximal run of word characters; compiled once instead of per file.
    _WORD_RE = re.compile(r'(\w+)')

    # Sub-words that are too common in source code to be informative.
    stop_words = ['self', 'def', 'if', 'get', 'return', 'is', 'name', 'id', 'none', 'in', 'type', 'import', 'not', 'from', 'data', 'set', 'for', 'value', 'vehicle', 's', 'list', 'the', 'error', 'on', 'and', 'to', 'gui', 'else', 'key', 'letter', 'item', 'text', 'info', 'class', 'path', 'a', 'init', 'event', 'false', 'config', 'true', 'time', 'file', 'latin', 'view', 'result', 'settings', 'g', 'state', 'fort']

    def __init__(self, root=None):
        # Only shadow the class-level default when a directory is supplied.
        if root is not None:
            self.root = root

    def convert_camel_case_to_underscore(self, word):
        """Return ``word`` lower-cased with ``_`` inserted at camelCase boundaries."""
        return CAMELCASE_TO_UNDERSCORE_RE.sub(r'_\1', word).lower()

    def clean_and_split_codewords(self, word):
        """Split an identifier into lower-case sub-words and drop the noise.

        Sub-words that are empty, one character long, or listed in
        ``stop_words`` are skipped::

            HTTPResponse => http, response
            aNewWorld    => new, world    ('a' is too short to keep)
            a_new_world  => new, world
        """
        for subword in self.convert_camel_case_to_underscore(word).split('_'):
            # len() > 1 also rejects the empty strings produced by '__'.
            if len(subword) > 1 and subword not in self.stop_words:
                yield subword

    def _iter_python_files(self):
        """Yield the path of every ``.py`` file below ``self.root``, in ``os.walk`` order."""
        for dirpath, _dirnames, filenames in os.walk(self.root):
            for filename in filenames:
                if filename.endswith('.py'):
                    yield os.path.join(dirpath, filename)

    def _iter_file_words(self, path):
        """Yield the cleaned sub-words of one source file, in order of appearance."""
        # Read everything and close the file *before* yielding, so an abandoned
        # or interrupted iteration never leaves a file handle open.
        with open(path, 'r') as source_file:
            text = source_file.read()
        for raw_word in self._WORD_RE.findall(text):
            for clean_word in self.clean_and_split_codewords(raw_word):
                yield clean_word

    def __iter__(self):
        """Yield every cleaned sub-word of every ``.py`` file under ``self.root``."""
        for path in self._iter_python_files():
            for clean_word in self._iter_file_words(path):
                yield clean_word

    def iter_dictionaries(self, vocabulary=None):
        """Yield one bag-of-words vector per ``.py`` file under ``self.root``.

        Parameters
        ----------
        vocabulary : object with ``doc2bow(words, allow_update=...)``, optional
            Mapping used to turn each file's sub-words into a vector. Defaults
            to the module-level ``dictionary``.

        Note: ``allow_update=True`` is passed, so the vocabulary is asked to
        register previously unseen sub-words as a side effect of iterating.
        """
        if vocabulary is None:
            vocabulary = dictionary
        for path in self._iter_python_files():
            words = list(self._iter_file_words(path))
            yield vocabulary.doc2bow(words, allow_update=True)

In [81]:
from collections import Counter

# Tally how often each cleaned sub-word occurs across the whole source tree.
# Counter consumes the iterable directly: no throwaway one-element list per
# token, and IPython's special `_` (last output) is not rebound by a loop.
counter = Counter(CodeWalker())

KeyboardInterrupt: 

In [70]:
# Show the 50 most frequent sub-words, most common first (counts dropped).
print(list(entry[0] for entry in counter.most_common(50)))

['self', 'def', 'if', 'get', 'return', 'is', 'name', 'id', 'none', 'in', 'type', 'import', 'not', 'from', 'data', 'set', 'for', 'value', 'vehicle', 's', 'list', 'the', 'error', 'on', 'and', 'to', 'gui', 'else', 'key', 'letter', 'item', 'text', 'info', 'class', 'path', 'a', 'init', 'event', 'false', 'config', 'true', 'time', 'file', 'latin', 'view', 'result', 'settings', 'g', 'state', 'fort']


In [82]:
# Build the bag-of-words corpus: one vector per .py file walked.
# list() materialises the generator without the identity comprehension, which
# rebinds `_` (IPython's last-output variable) and leaks it under Python 2.
cw = CodeWalker()
matrix = list(cw.iter_dictionaries())

In [83]:
# Number of bag-of-words vectors, i.e. one per .py file that was read.
len(matrix)

3374

In [84]:
# Persist the in-memory corpus to disk in Matrix Market format.
# NOTE(review): the relative path resolves against the notebook's working directory.
corpora.MmCorpus.serialize('../scrapers/data/test_corpus.mm', matrix)

In [85]:
# Re-open the corpus that was just serialized; `mm` is what the LDA step trains on.
mm = corpora.MmCorpus('../scrapers/data/test_corpus.mm')

In [86]:
# Summarise the corpus on disk (document, feature and non-zero entry counts).
print(mm)

MmCorpus(3374 documents, 45452 features, 460630 non-zero entries)


In [95]:
# Fit a 60-topic LDA model on the serialized corpus, making 6 passes over it.
# `dictionary` maps the integer token ids back to sub-words for display.
# NOTE(review): chunksize (10000) is larger than the corpus reported above
# (3374 documents), so each pass presumably handles it as one chunk - confirm.
lda = gensim.models.ldamodel.LdaModel(
    corpus=mm,
    id2word=dictionary,
    num_topics=60,
    chunksize=10000,
    update_every=1,
    passes=6
)



In [97]:
# Display all 60 fitted topics as weighted term lists.
lda.print_topics(60)

[u'0.025*messenger + 0.025*iso8859 + 0.019*token + 0.011*arena + 0.011*contacts + 0.008*append + 0.008*current + 0.008*queue + 0.008*service + 0.007*servicechannelmessages',
 u'0.048*effect + 0.019*trigger + 0.017*chapter + 0.016*var + 0.014*section + 0.014*x0 + 0.013*flag + 0.013*target + 0.010*os + 0.010*conditions',
 u'0.042*fortifications + 0.014*building + 0.013*window + 0.012*shot + 0.011*turret + 0.010*clan + 0.010*buildings + 0.010*point + 0.010*matrix + 0.009*player',
 u'0.064*header + 0.054*tooltips + 0.041*body + 0.038*fortification + 0.017*battle + 0.016*tooltip + 0.012*cybersport + 0.010*common + 0.010*btn + 0.010*status',
 u'0.032*unit + 0.025*ctx + 0.020*battle + 0.020*request + 0.019*callback + 0.017*player + 0.014*clan + 0.011*cache + 0.011*building + 0.009*idx',
 u'0.022*items + 0.020*price + 0.016*xp + 0.015*make + 0.014*tankman + 0.013*credits + 0.013*cost + 0.013*veh + 0.013*string + 0.012*gold',
 u'0.065*filter + 0.018*filters + 0.016*length + 0.012*codec + 0.012*