In [91]:
from preprocessor import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [92]:
projects = read_data()
category = "technology"

In [93]:
all_categories = []
for p in projects:
    for c in p['category']:
        all_categories.append(c)
        
all_categories = nltk.FreqDist(all_categories)
top_10_categories = [a[0] for a in all_categories.most_common(10)]
top_10_categories

['film & video',
 'music',
 'technology',
 'art',
 'publishing',
 'food',
 'games',
 'fashion',
 'design',
 'comics']

In [94]:
def create_model(projects, category, validate=False):
    # Find and print most common uni-, bi-, and trigrams in category
    grams = grams_by_category(projects, category, do_print=False)

    # Map grams to unique index for easy vectorization
    grams_to_idx = map_gram_to_idx(grams)

    # Map unique index to gram to quickly convert vectorization to txt
    idx_to_grams = [0] * len(grams_to_idx)

    for gram, idx in grams_to_idx.items():
        idx_to_grams[idx] = gram
        
    # Build feats + labels for model training
    feats = []
    labels = []

    for project in projects:
        if project['category'][0] == category or category == 'all':
            # Project encoding indicates which of the uni-, bi-, and 
            # trigrams in 'text_feats' are in the n-most common grams
            # for the category
            encoding = vectorize(project, grams_to_idx)

            # Label represents amt pledged
            label = project['pledged']

            feats.append(encoding)
            labels.append(label)
            
    # 90-10 split feats and labels; 90% training data and 10% test data
    feats_train = feats[:int(len(feats) * .9)]
    feats_test  = feats[int(len(feats) * .9):]

    labels_train = labels[:int(len(labels) * .9)]
    labels_test  = labels[int(len(labels) * .9):]
    
    LR = linear_model.Ridge(alpha=1000)
    LR.fit(feats_train, labels_train)
    
    if validate:
        predictions = LR.predict(feats_test)
        MSE = mean_squared_error(predictions,labels_test)
        print("MSE:", MSE)
        
    word_correlations = sorted(zip(idx_to_grams, LR.coef_), key=lambda t: -t[1])
        
    return LR, word_correlations

In [95]:
models = {}
pre_df = []
for category in top_10_categories:
    LR, corrs = create_model(projects,category)
    temp = [category, LR.intercept_ , LR.coef_[-1]] + corrs[:10] + corrs[-1:-10:-1]
    pre_df.append(temp)

In [96]:
df = pd.DataFrame.from_records(pre_df)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,film & video,10955.923719,0.258911,"(animate, 7229.730047285942)","(documentary, 5714.052116347978)","(bring, 5166.29364091792)","(episode, 3975.7493884791634)","(back, 3860.72222078205)","(need help, 3485.759006426724)","(new, 3485.4864020879977)",...,"(feature, 3016.3338214321375)","(cancel, -4703.069850922114)","(short, -3850.8004930011193)","(video, -3330.320169109004)","(de, -2267.5372761992885)","(student, -2144.397470671279)","(young, -2007.8846873405291)","(horror, -1890.2073236751055)","(web, -1885.0394063962096)","(pilot, -1866.823002613701)"
1,music,3729.431779,2.826989,"(new, 1789.7531034405279)","(new album, 1561.6907691142237)","(join, 1437.2805852879324)","(album, 1375.1395165652768)","(make, 1367.7003328538735)","(new studio, 1288.4135162132675)","(record, 1228.8285205078912)",...,"(part, 1016.0124035422992)","(cancel, -1726.669532279142)","(need, -1208.46421768457)","(look, -840.2693257320028)","(ep, -785.4229769898043)","(single, -716.7185123667435)","(money, -649.7810698543296)","(track, -618.3569667238083)","(fund, -561.9971141609083)","(music, -552.3857603594992)"
2,technology,33751.352663,0.908725,"(smart, 25175.11538673936)","(first, 19857.47414611077)","(camera, 18947.1491312961)","(affordable, 16643.27477805247)","(world first, 16273.255744297456)","(world, 14450.174480845932)","(power, 13952.153028433679)",...,"(3d printer, 13246.58717894954)","(cancel, -13611.479919771098)","(app, -12871.352256743465)","(project, -9155.182109661391)","(online, -8903.158915186925)","(website, -8370.935716177644)","(social, -7922.9662297420555)","(de, -7738.778868561689)","(people, -7714.339773780777)","(student, -7248.847948179168)"
3,art,3769.669883,0.062442,"(book, 1835.7850138781066)","(tarot, 1337.2876630200258)","(bring, 1334.9262076061607)","(build, 1321.1924593652861)","(deck, 1224.7146805056282)","(art book, 1208.38183782928)","(new, 1065.568218622071)",...,"(year, 847.8063948762682)","(cancel, -1204.8695143763825)","(need, -754.1648485115994)","(get, -724.6185080897635)","(make, -718.1993840971221)","(people, -675.2091317412156)","(show, -658.2275508802495)","(pin, -648.0406480618872)","(sticker, -643.8120156874489)","(love, -595.9819947931348)"
4,publishing,5627.475617,1.146637,"(art, 1841.0824868228626)","(great, 1816.5878789661103)","(artist, 1795.6466959661118)","(book, 1777.7837792825546)","(art book, 1621.0026674075577)","(fairy tale, 1415.972241174463)","(world, 1405.0174442284012)",...,"(guide, 1249.0010536702578)","(cancel, -2060.9924782481453)","(poetry, -2014.5236174320037)","(poem, -1220.3451700271796)","(young, -1139.3665464880894)","(novel, -1055.678264102214)","(publish, -998.0309267002945)","(write, -967.6773457319329)","(fund, -909.1263797670047)","(zine, -883.5039682583806)"
5,food,6872.90399,-0.04233,"(beer, 2786.841811436243)","(chef, 2583.612012426442)","(brewing, 2410.023292099982)","(first, 2143.049056936183)","(kitchen, 1992.4191719419937)","(craft, 1890.7827058497805)","(base, 1464.2902182179882)",...,"(build, 1114.743021275696)","(cancel, -2249.4068576799773)","(want, -1592.6833639238143)","(start, -1437.789669265388)","(truck, -1330.8184148924947)","(family, -1194.875981057306)","(food truck, -1056.272804729853)","(business, -977.1014116692149)","(project, -955.3863515105854)","(bbq, -953.0809722278077)"
6,games,28963.398328,5.439259,"(board game, 22016.526034408667)","(board, 20221.187524679328)","(1 4, 19082.176643505754)","(1, 17258.253737786024)","(set, 16912.72713954609)","(new, 15249.88983878687)","(4 player, 13856.52445501647)",...,"(4, 11851.656020598364)","(mobile, -10089.030549140001)","(fun, -10018.713787596924)","(card, -9131.538134566203)","(app, -7978.9295737979155)","(puzzle, -7045.758561344455)","(ios, -6573.157842673425)","(create, -5471.915976270632)","(mm, -5298.327500166351)","(way, -4576.818918890719)"
7,fashion,13416.765579,2.746983,"(jacket, 11323.034989461625)","(<SOS> world, 9365.3114945067)","(world, 8376.417023604346)","(world good, 7637.719504784307)","(feature, 7179.467214545411)","(good, 5780.89232500694)","(dress, 2961.599153019545)",...,"(shoe, 2207.7956983623594)","(pin, -3827.1038034953804)","(jewelry, -3543.3605182923525)","(cancel, -3511.094670176534)","(enamel, -2753.9303135784526)","(fashion, -2552.766699303274)","(clothing, -2459.4901630499203)","(enamel pin, -2451.780309343638)","(brand, -2390.202928451509)","(create, -1978.1868635567903)"
8,design,33889.059614,-0.081033,"(travel, 15482.146210541132)","(backpack, 12563.184072465112)","(system, 12482.36762484181)","(smart, 12042.388184882953)","(world, 11942.690610474092)","(carry, 10226.556201089497)","(ultimate, 7621.488860786829)",...,"(first, 5385.309149639385)","(poster, -9359.10209404803)","(cancel, -7679.5468700969495)","(print, -7458.237668718873)","(project, -6625.07710953252)","(need, -5700.46290860076)","(card, -5565.455361323658)","(help, -5131.760839633813)","(art, -4372.617998838419)","(pin, -4190.21349151643)"
9,comics,5062.352388,445.655804,"(hardcover, 2430.106053424853)","(collection, 1897.0825021844246)","(webcomic, 1859.0499893117408)","(volume, 1787.70327080149)","(book, 1710.6097227134742)","(death, 1524.8379134562242)","(print, 1453.4746825297907)",...,"(anthology, 1176.8042407781666)","(cancel, -1660.2013291997578)","(issue, -1329.764507960652)","(comic book, -1209.8572622912277)","(hero, -709.4136813051288)","(world, -699.8034665322297)","(horror, -686.2059689424366)","(action, -683.1554791069553)","(man, -663.7876594885148)","(want, -656.3252843269096)"


In [97]:
print( "Expected pledge amt. if given NO project txt: %.2f" % LR.intercept_)
print( "Increase in expected funding per dollar added to goal: %.3f" % (LR.coef_[-1] / 1000))
print() 

Expected pledge amt. if given NO project txt: 5062.35
Increase in expected funding per dollar added to goal: 0.446



In [None]:
df_top = pd.DataFrame(corrs[:10], columns=["Gram", "Monetary Impact"])
df_bottom = pd.DataFrame(corrs[-1:-10:-1], columns=["Gram", "Monetary Impact"])


In [None]:

plt.figure(figsize=(20,10))
plt.bar(df_top['Gram'], df_top['Monetary Impact']) 
plt.xlabel("Gram") 
plt.ylabel("Monetary Impact") 
plt.show() 

In [None]:

plt.figure(figsize=(20,10))
plt.bar(df_bottom['Gram'], df_bottom['Monetary Impact']) 
plt.xlabel("Gram") 
plt.ylabel("Monetary Impact")
plt.figure(figsize=(20,10))
plt.show() 

In [None]:
# Actual project finder. If you're confused by a word, check this out
word = 'camera'
category = 'games'
for i in range(len(projects)):
    if word in projects[i]['text_feats'] and category in projects[i]['category']:
        print(projects[i]['text'])
        print(projects[i]['pledged'], "$ / ", projects[i]['goal'], "$")
        print()

In [None]:
print(top_10_categories)