In [1]:
# NOTE: the star import stays first so the explicit imports below take
# precedence if preprocessor happens to export a clashing name.
from preprocessor import *
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [2]:
def build_feats(projects, category):
    """Build the gram vocabulary, feature vectors and labels for one category.

    Args:
        projects: Iterable of project records. Each record is indexed with
            ``'category'`` (a sequence; only its first entry is compared
            against ``category``) and ``'pledged'``.
        category: Category name to select, or ``'all'`` to keep every project.

    Returns:
        A ``(idx_to_grams, feats, labels)`` tuple. ``idx_to_grams[i]`` is the
        gram that ``map_gram_to_idx`` assigned to index ``i``; ``feats`` holds
        one ``vectorize(project, grams_to_idx)`` result per selected project;
        ``labels`` holds the matching ``project['pledged']`` values.
    """
    # Most common unigrams and bigrams in the category (printing disabled)
    grams = grams_by_category(projects, category, do_print=False)

    # Map grams to unique index for easy vectorization
    grams_to_idx = map_gram_to_idx(grams)

    # Inverse lookup (index -> gram) to convert a vectorization back to text
    idx_to_grams = [0] * len(grams_to_idx)
    for gram, idx in grams_to_idx.items():
        idx_to_grams[idx] = gram

    # Build feats + labels for model training
    use_all = category == 'all'
    feats = []
    labels = []

    for project in projects:
        if not use_all:
            categories = project['category']
            # A project without any category can never match a specific one;
            # skip it instead of raising IndexError on categories[0].
            if len(categories) == 0 or categories[0] != category:
                continue

        feats.append(vectorize(project, grams_to_idx))
        # Label represents amt pledged
        labels.append(project['pledged'])

    return idx_to_grams, feats, labels

In [3]:
def create_model(projects, category, validate=False, alpha=1000, train_frac=.9):
    """Fit a ridge regression of pledged amount on gram features.

    Args:
        projects: Project records, forwarded to ``build_feats``.
        category: Category name (or ``'all'``), forwarded to ``build_feats``.
        validate: When true, predict on the held-out tail of the data and
            print the mean squared error.
        alpha: Regularisation strength passed to ``linear_model.Ridge``.
        train_frac: Fraction of the samples used for training. The split is
            positional (first part trains, remainder is held out); nothing is
            shuffled.

    Returns:
        ``(model, word_corrs)`` where ``model`` is the fitted Ridge estimator
        and ``word_corrs`` is a list of ``(gram, coefficient)`` pairs sorted by
        coefficient, largest first.

    Raises:
        ValueError: If the training split is empty, or if ``validate`` is true
            and the held-out split is empty.
    """
    idx_to_grams, feats, labels = build_feats(projects, category)

    # Compute the cut point once so feats and labels are split identically
    split = int(len(feats) * train_frac)
    feats_train, feats_test = feats[:split], feats[split:]
    labels_train, labels_test = labels[:split], labels[split:]

    if len(feats_train) == 0:
        raise ValueError(
            "no training samples for category %r (%d matching projects)"
            % (category, len(feats))
        )

    model = linear_model.Ridge(alpha=alpha)    # Initialize model
    model.fit(feats_train, labels_train)       # Train model

    # If validate=True, then validate model using the held-out data
    if validate:
        if len(feats_test) == 0:
            raise ValueError("validate=True requires a non-empty held-out split")

        predictions = model.predict(feats_test)

        # Documented argument order is (y_true, y_pred)
        MSE = mean_squared_error(labels_test, predictions)
        print("MSE:", MSE)

    # zip stops at the shorter input, so coefficients beyond the gram
    # vocabulary (if any) are not paired with a gram here.
    word_corrs = sorted(
        zip(idx_to_grams, model.coef_),
        key=lambda pair: pair[1],
        reverse=True,
    )

    return model, word_corrs

In [4]:
# Load the project records used by every later cell. read_data is not defined
# in this notebook; presumably it comes from the preprocessor star import —
# TODO confirm.
projects = read_data()

In [5]:
# Count how often each category appears across all projects. Every entry of
# project['category'] is counted, not only the first one. Counter is used
# instead of nltk.FreqDist so the cell does not depend on `nltk` leaking in
# through the preprocessor star import; most_common() behaves the same.
all_categories = Counter(
    tag
    for project in projects
    for tag in project['category']
)

# Get top-10 categories (names only, most frequent first)
top_10_categories = [name for name, _count in all_categories.most_common(10)]

In [21]:
# Per-category results: `grams` maps category -> DataFrame of ranked grams,
# `coefs` collects one [category, intercept, last coefficient] row each.
grams = {}
coefs = []

for category in top_10_categories:
    ridge, ranked = create_model(projects, category)

    # Keep only the model summary numbers; the last coefficient is stored
    # separately from the per-gram ranking.
    coefs.append([category, ridge.intercept_, ridge.coef_[-1]])

    # ranked is a list of (gram, coefficient) pairs, largest coefficient first
    grams[category] = pd.DataFrame({
        'grams': [pair[0] for pair in ranked],
        'monetary_impact': [pair[1] for pair in ranked],
    })

In [39]:
# Side-by-side table: one (grams, monetary_impact) column pair per category,
# in top-10 order.
grams_df = pd.concat(
    grams,
    axis=1,
    keys=top_10_categories,
)

# One row per category, indexed by category name.
coefs_df = pd.DataFrame.from_records(
    coefs,
    columns=['category', 'intercept', 'goal_v_raised'],
    index='category',
)

In [43]:
# Display the per-category grams / monetary_impact table through the pandas Styler.
grams_df.style

Unnamed: 0_level_0,film & video,film & video,music,music,technology,technology,art,art,publishing,publishing,food,food,games,games,fashion,fashion,design,design,comics,comics
Unnamed: 0_level_1,grams,monetary_impact,grams,monetary_impact,grams,monetary_impact,grams,monetary_impact,grams,monetary_impact,grams,monetary_impact,grams,monetary_impact,grams,monetary_impact,grams,monetary_impact,grams,monetary_impact
0,animate,7135.24,new,1822.89,smart,27435.9,book,2367.05,art,1873.88,brewing,2554.74,board game,21557.3,jacket,12315.0,versatile,17811.5,hardcover,2816.9
1,documentary,5493.64,new album,1546.52,first,23702.4,museum,1970.46,100,1857.52,chef,2339.0,board,19698.8,travel,11158.3,travel,14145.7,volume,2062.72
2,bring,4630.89,join,1444.77,world first,18467.7,public,1492.97,book,1809.41,craft,2158.72,1,18777.7,pocket,10565.9,system,14012.9,webcomic,1876.21
3,big,4168.83,album,1416.66,camera,18423.4,public art,1456.45,world,1708.61,first,2070.67,set,17632.4,world,9592.25,pack,12650.4,collection,1846.6
4,back,4035.19,studio album,1248.74,affordable,18288.9,art book,1440.0,great,1673.66,beer,1816.63,new,14920.3,world,8642.41,smart,12282.7,year,1752.31
5,episode,3877.66,make,1247.32,world,15365.5,tarot,1431.07,game,1647.68,american,1742.31,4 player,14615.8,feature,8441.22,backpack,11381.5,death,1749.56
6,new,3685.78,record,1231.91,3d printer,15121.0,present,1424.81,art book,1643.33,kitchen,1647.4,world,13549.3,build,7788.33,world,10589.6,print,1625.98
7,...,3535.25,fan,1130.82,printer,15025.8,deck,1257.12,girl,1541.82,restaurant,1505.53,survival,13500.9,good,6125.06,carry,9761.11,book,1624.07
8,need help,3450.23,play,1081.51,3d,14962.3,bring,1256.71,artist,1531.49,home,1462.18,4,13179.2,dress,3369.0,line,8380.4,new,1534.95
9,star,3091.23,experience,1070.95,power,14859.3,feature,1169.65,coffee table,1321.95,base,1405.53,open,10224.8,back,3346.64,watch,7630.33,creator,1204.89


In [40]:
# Display intercept and goal_v_raised per category through the pandas Styler.
# goal_v_raised is the model's last coefficient; presumably the weight of a
# goal feature appended by vectorize — TODO confirm.
coefs_df.style

Unnamed: 0_level_0,intercept,goal_v_raised
category,Unnamed: 1_level_1,Unnamed: 2_level_1
film & video,11264.9,0.000232958
music,3753.9,0.00232471
technology,31611.7,0.00092937
art,3846.83,6.21405e-05
publishing,5630.16,0.000637451
food,6801.81,-4.17032e-05
games,29319.9,0.00566904
fashion,12938.7,0.0027301
design,33900.5,-0.000113414
comics,6558.99,0.238874


In [42]:
# How to access intercept or goal_v_raised: one .loc lookup by (row, column)
art_intercept = coefs_df.loc['art', 'intercept']
print(art_intercept)

# How to access grams: row label first, then the (category, field) column pair
art_top_gram = grams_df.loc[0, ('art', 'grams')]
print(art_top_gram)

3846.829415258346
book



def plot_monetary_impact(frame):
    """Show a bar chart of the 'Monetary Impact' column against 'Gram'.

    Args:
        frame: Table indexed by column name that provides a 'Gram' column
            (bar labels) and a 'Monetary Impact' column (bar heights).
    """
    _, ax = plt.subplots(figsize=(20, 10))
    ax.bar(frame['Gram'], frame['Monetary Impact'])
    ax.set_xlabel("Gram")
    ax.set_ylabel("Monetary Impact")
    # Exactly one figure is created per call; the original second snippet
    # opened an extra empty figure right before plt.show().
    plt.show()


# NOTE(review): df_top and df_bottom are not defined anywhere in the visible
# notebook, so these calls raise NameError on a fresh kernel. They must be
# built first, with 'Gram' and 'Monetary Impact' columns.
plot_monetary_impact(df_top)
plot_monetary_impact(df_bottom)