In [1]:
import json
import glob
import nltk
import random

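- **most_common_by_category(string *category*, int *n*, boolean *do_print*)**
    Collects every unigram, bigram, and trigram from the projects in the 
    given category (pass 'all' to include every project). When *do_print* 
    is true, it also prints the *n* most common of each with their counts.
- **Return:**
    A dictionary containing all collected unigrams, bigrams, and trigrams 
    (every occurrence, not only the *n* most common), where the 
    corresponding keys are "uni", "bi" and "tri"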

In [2]:
def most_common_by_category(category, n=15, do_print=True, project_list=None):
    """Collect the unigrams, bigrams and trigrams of the projects in a category.

    Args:
        category: Category name to keep. A project is kept when this value is
            found in its ``'category'`` entry; the special value ``'all'``
            keeps every project.
        n: How many of the most frequent n-grams to print per section.
        do_print: When true, print the ``n`` most common unigrams, bigrams
            and trigrams together with their counts.
        project_list: Optional iterable of project dicts, each providing the
            ``'category'`` and ``'text_feats'`` keys. When omitted, the
            notebook-level ``projects`` list is used.

    Returns:
        A dict with the keys ``"uni"``, ``"bi"`` and ``"tri"``. Each value is
        a list holding every n-gram occurrence (duplicates included) in the
        order it was encountered. N-grams are space-joined strings, and the
        marker ``<SOS>`` stands in for the start of each project's text.
    """
    if project_list is None:
        # Fall back to the notebook-level list so existing calls keep working
        project_list = projects

    # Tokens skipped entirely: the possessive ending (straight and
    # typographic apostrophe) and the empty string. The escape sequence
    # avoids decoding a byte literal on every token and does not depend on
    # the encoding of the source.
    ignored_tokens = ("'s", u'\u2019s', '')

    all_words = []
    all_bigrams = []
    all_trigrams = []

    for project in project_list:
        # 'all' disables the category filter
        if category != 'all' and category not in project['category']:
            continue

        prev_prev = ''
        prev_word = '<SOS>'  # Start of sentence

        for w in project['text_feats']:
            if w in ignored_tokens:
                continue

            all_words.append(w)
            all_bigrams.append(prev_word + " " + w)

            # A trigram needs two predecessors; the first token only has <SOS>
            if prev_prev != '':
                all_trigrams.append(prev_prev + " " + prev_word + " " + w)

            prev_prev = prev_word
            prev_word = w

    grams = {"uni": all_words, "bi": all_bigrams, "tri": all_trigrams}

    if do_print:
        sections = (
            ("-- UNIGRAMS --", all_words),
            ("-- BIGRAMS --", all_bigrams),
            ("-- TRIGRAMS --", all_trigrams),
        )
        for index, (title, items) in enumerate(sections):
            if index:
                print("")  # blank line between sections
            print(title)
            for gram, count in nltk.FreqDist(items).most_common(n):
                print("%s \t%s" % (gram, count))

    return grams

In [3]:
projects = []

# Read in data. glob returns paths in arbitrary order, so sort them to make
# the order of `projects` reproducible between runs.
json_files = sorted(glob.glob("kickstarter_data/data*"))

for json_file in json_files:
    # The context manager closes each file as soon as it has been parsed
    with open(json_file, 'r') as handle:
        projects += json.load(handle)

# Number of projects read-in
print(len(projects))

208436


In [4]:
# Choose and print out a random sample from the set.
# randrange excludes its upper bound; randint(0, len(projects)) could return
# len(projects) itself and raise an IndexError.
i = random.randrange(len(projects))
sample = projects[i]

print(sample['text'])
print("")
print(sample['text_feats'])
print("")
print(sample['category'])
print("")
# Amount pledged versus the funding goal
print("%s $ /  %s $" % (sample['pledged'], sample['goal']))

Schatten - der Kurzfilm Als der Notrufdisponent Erik eines Tages einen dringenden Notruf entgegennimmt, ahnt er nicht was für Folgen dieser haben wird.

[u'schatten', u'der', u'kurzfilm', u'al', u'der', u'notrufdisponent', u'erik', u'eine', u'tage', u'einen', u'dringenden', u'notruf', u'entgegennimmt', u'ahnt', u'er', u'nicht', u'f\xfcr', u'folgen', u'dieser', u'haben', u'wird']

[u'film & video', u'shorts']

2306.08125827 $ /  2264.18256 $


In [5]:
# Gather the n-grams of the 'technology' category and print the 15 most
# frequent unigrams, bigrams, and trigrams
grams = most_common_by_category('technology', n=15, do_print=True)

-- UNIGRAMS --
app 	3460
world 	2451
make 	2287
use 	2100
3d 	1979
first 	1928
help 	1823
smart 	1769
new 	1740
create 	1662
design 	1548
build 	1543
system 	1327
device 	1321
easy 	1279

-- BIGRAMS --
world first 	804
3d printer 	744
raspberry pi 	577
open source 	533
3d print 	484
mobile app 	341
social medium 	287
easy use 	284
social network 	229
real time 	221
help us 	219
high quality 	196
make easy 	173
next generation 	160
virtual reality 	160

-- TRIGRAMS --
<SOS> world first 	77
world first smart 	56
<SOS> 3d print 	42
social medium platform 	41
arduino raspberry pi 	38
desktop 3d printer 	35
<SOS> raspberry pi 	31
raspberry pi arduino 	30
social medium app 	30
3d print filament 	29
raspberry pi zero 	29
app allow user 	28
board raspberry pi 	26
sla 3d printer 	26
<SOS> 3d printer 	26
