In [2]:
from collections import Counter

from preprocessor import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [3]:
def build_feats(projects, category):
    """Build the gram vocabulary, feature vectors and labels for one category.

    Args:
        projects: Iterable of project records. Each record is indexed with
            'category' (a sequence; only its first entry is compared against
            `category`) and 'pledged'.
        category: Category name to select, or 'all' to keep every project.

    Returns:
        A tuple ``(idx_to_grams, feats, labels)`` where
        ``idx_to_grams[i]`` is the gram mapped to index ``i``,
        ``feats`` holds one ``vectorize(project, grams_to_idx)`` result per
        selected project, and ``labels`` holds the matching
        ``project['pledged']`` values in the same order.
    """
    # Most common unigrams and bigrams in the category (printing disabled)
    grams = grams_by_category(projects, category, do_print=False)

    # Map grams to unique index for easy vectorization
    grams_to_idx = map_gram_to_idx(grams)

    # Invert the mapping so a vector position can be turned back into text
    idx_to_grams = [0] * len(grams_to_idx)
    for gram, idx in grams_to_idx.items():
        idx_to_grams[idx] = gram

    # Build feats + labels for model training
    feats = []
    labels = []

    for project in projects:
        # 'all' is checked first so that projects with an empty category
        # list are still included instead of raising IndexError.
        if category != 'all':
            project_categories = project['category']
            # Only the first listed category decides membership; a project
            # without any category cannot match a specific one.
            if not project_categories or project_categories[0] != category:
                continue

        feats.append(vectorize(project, grams_to_idx))
        # Label represents amt pledged
        labels.append(project['pledged'])

    return idx_to_grams, feats, labels

In [4]:
def create_model(projects, category, validate=False, alpha=1000, train_frac=0.9):
    """Fit a Ridge regression on the gram features of one category.

    Args:
        projects: Project records, forwarded unchanged to ``build_feats``.
        category: Category name (or 'all'), forwarded to ``build_feats``.
        validate: When True, predict on the held-out tail of the data and
            print the mean squared error.
        alpha: Regularisation strength passed to ``linear_model.Ridge``.
        train_frac: Fraction of the samples (taken from the front, in the
            order ``build_feats`` returns them) used for training; the rest
            is the hold-out split. No shuffling is performed.

    Returns:
        ``(model, word_corrs)`` where ``model`` is the fitted Ridge estimator
        and ``word_corrs`` is a list of ``(gram, coefficient)`` pairs sorted
        by coefficient, largest first. ``zip`` stops at the shorter input, so
        coefficients beyond ``len(idx_to_grams)`` are not paired with a gram.

    Raises:
        ValueError: If the training split would be empty.
    """
    idx_to_grams, feats, labels = build_feats(projects, category)

    # Single split point shared by features and labels keeps them aligned
    split = int(len(feats) * train_frac)
    feats_train, feats_test = feats[:split], feats[split:]
    labels_train, labels_test = labels[:split], labels[split:]

    if not feats_train:
        raise ValueError(
            "not enough projects to train a model for category %r" % (category,)
        )

    model = linear_model.Ridge(alpha=alpha)    # Initialize model
    model.fit(feats_train, labels_train)       # Train model

    # If validate=True, then validate model using the hold-out split
    if validate:
        if feats_test:
            predictions = model.predict(feats_test)

            # sklearn's signature is (y_true, y_pred)
            MSE = mean_squared_error(labels_test, predictions)
            print("MSE:", MSE)
        else:
            print("MSE: n/a (hold-out split is empty)")

    word_corrs = sorted(zip(idx_to_grams, model.coef_), key=lambda t: -t[1])

    return model, word_corrs

In [5]:
# Load the project records consumed by the category-counting and model cells.
# read_data is not defined in this notebook; presumably it is provided by the
# `from preprocessor import *` star import — TODO confirm.
projects = read_data()

In [6]:
# Count how often each category appears across all projects (every entry of
# a project's 'category' list is counted, not just the first one).
# collections.Counter is used so the cell does not depend on `nltk` leaking
# in through the preprocessor star import; most_common() behaves the same.
all_categories = Counter(
    category
    for project in projects
    for category in project['category']
)

# Get top-10 categories, most frequent first
top_10_categories = [name for name, _count in all_categories.most_common(10)]

In [7]:
# Per-category results: `grams` maps category -> DataFrame of ranked grams,
# `coefs` collects [category, intercept, last coefficient] rows.
grams, coefs = {}, []

for category in top_10_categories:
    # Fit one model per category; corrs is the (gram, coefficient) ranking
    LR, corrs = create_model(projects, category)

    # Keep the intercept and the final coefficient of the fitted model
    coefs.append([category, LR.intercept_, LR.coef_[-1]])

    # Split the ranked pairs into two parallel columns
    temp = {
        'grams': [gram for gram, _ in corrs],
        'monetary_impact': [impact for _, impact in corrs],
    }
    grams[category] = pd.DataFrame(temp)

In [16]:
# Side-by-side table: one (grams, monetary_impact) column pair per category,
# ordered as in top_10_categories.
grams_df = pd.concat(grams, keys=top_10_categories, axis=1)

# One row per category, indexed by category name.
coefs_df = pd.DataFrame.from_records(
    coefs,
    index='category',
    columns=['category', 'intercept', 'goal_v_raised'],
)

In [19]:
# Display the per-category gram table through its pandas Styler
# (column pairs: category -> grams / monetary_impact).
grams_df.style

Unnamed: 0_level_0,film & video,film & video,music,music,technology,technology,art,art,publishing,publishing,food,food,games,games,fashion,fashion,design,design,comics,comics
Unnamed: 0_level_1,grams,monetary_impact,grams,monetary_impact,grams,monetary_impact,grams,monetary_impact,grams,monetary_impact,grams,monetary_impact,grams,monetary_impact,grams,monetary_impact,grams,monetary_impact,grams,monetary_impact
0,animate,6882.5,new,1835.47,smart,28450.0,book,2194.01,100,2036.07,beer,2772.42,board game,23233.7,jacket,12214.9,versatile,17964.7,hardcover,3059.71
1,documentary,5837.72,new album,1583.49,first,21695.8,public art,1486.76,book,1848.76,brewing,2465.26,board,21967.6,travel,11035.3,travel,14472.1,volume,2276.57
2,bring,5132.63,join,1396.85,affordable,18691.0,public,1421.85,art,1833.42,chef,2131.37,1,19524.9,pocket,10377.2,pack,14362.1,collection,2016.12
3,big,4795.25,album,1388.37,world first,17252.8,bring,1336.66,art book,1647.73,first,2119.54,set,18530.0,world,9772.51,system,13438.2,year,1860.65
4,episode,3774.64,make,1303.57,camera,16507.8,art book,1302.92,game,1599.52,craft,1916.01,new,15865.0,world,8381.03,backpack,12407.0,webcomic,1829.93
5,back,3725.69,record,1302.32,printer,14542.9,deck,1297.47,great,1598.09,kitchen,1903.03,4 player,14602.3,feature,8166.54,world,10894.3,print,1822.17
6,new,3675.99,experience,1082.82,3d printer,14447.3,present,1250.69,girl,1533.22,american,1731.94,world,13549.7,build,8085.0,smart,10646.2,book,1660.0
7,...,3617.85,play,1028.37,power,14425.7,tarot,1245.5,world,1490.25,fresh,1500.58,survival,13263.8,good,5739.62,carry,9977.6,new,1527.54
8,need help,3388.9,part,1013.81,world,14250.2,black,1169.24,inspire,1457.56,home,1410.85,4,12833.3,hoodie,3178.4,line,8405.3,death,1498.99
9,star,2992.76,studio album,993.861,experience,14017.9,feature,1153.42,artist,1427.26,base,1357.36,expansion,10124.1,performance,3076.47,watch,7354.6,anthology,1336.92


In [40]:
# Display the per-category intercept / goal_v_raised table through its pandas Styler.
coefs_df.style

Unnamed: 0_level_0,intercept,goal_v_raised
category,Unnamed: 1_level_1,Unnamed: 2_level_1
film & video,11264.9,0.000232958
music,3753.9,0.00232471
technology,31611.7,0.00092937
art,3846.83,6.21405e-05
publishing,5630.16,0.000637451
food,6801.81,-4.17032e-05
games,29319.9,0.00566904
fashion,12938.7,0.0027301
design,33900.5,-0.000113414
comics,6558.99,0.238874


In [42]:
# Reading one category's intercept (use 'goal_v_raised' for the other column):
# a single .loc lookup by row label and column name
print(coefs_df.loc['art', 'intercept'])

# Reading a gram: row label 0 is the top-ranked entry, and the column is
# addressed by its (category, field) pair
print(grams_df.loc[0, ('art', 'grams')])

3846.829415258346
book



# Bar chart of the 'Gram' column of df_top against its 'Monetary Impact' column.
# NOTE(review): df_top is not defined in the visible cells — confirm it is
# built before this cell runs.
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(df_top['Gram'], df_top['Monetary Impact'])
ax.set_xlabel("Gram")
ax.set_ylabel("Monetary Impact")
plt.show()


# Bar chart of the 'Gram' column of df_bottom against its 'Monetary Impact' column.
# NOTE(review): df_bottom is not defined in the visible cells — confirm it is
# built before this cell runs.
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(df_bottom['Gram'], df_bottom['Monetary Impact'])
ax.set_xlabel("Gram")
ax.set_ylabel("Monetary Impact")
# Exactly one figure is created, so plt.show() does not render an extra
# blank 20x10 canvas after the chart.
plt.show()