In [1]:
from preprocessor import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model

In [2]:
# Category whose projects the rest of the notebook analyses.
category = "games"

# Load the project records (read_data comes from the preprocessor star import).
projects = read_data()

In [3]:
# Choose and print out a random sample from the set.
# NOTE(review): `random` is not imported explicitly in this notebook; it is
# presumably re-exported by `from preprocessor import *` -- confirm.
# The standard-library random.randint includes BOTH endpoints, so the upper
# bound has to be len(projects) - 1; passing len(projects) can produce an
# index one past the end of the list and raise IndexError.
i = random.randint(0, len(projects) - 1)

sample = projects[i]

# Raw text, extracted text features, category path, then pledged vs. goal.
print(sample['text'])
print()
print(sample['text_feats'])
print()
print(sample['category'])
print()
print(sample['pledged'], "$ / ", sample['goal'], "$")

The Crypto Tracker - The Ultimate Portfolio Tracker The Crypto Tracker is an Intelligent, Modern, and Advanced Cryptocurrency portfolio tracker.

['crypto', 'tracker', 'ultimate', 'portfolio', 'tracker', 'crypto', 'tracker', 'intelligent', 'modern', 'advanced', 'cryptocurrency', 'portfolio', 'tracker']

['technology', 'software']

940.0 $ /  15000.0 $


In [4]:
# Collect the most common uni-, bi-, and trigrams for the chosen category
# (do_print=False, so nothing is printed here).
grams = grams_by_category(projects, category, do_print=False)

# Forward map: gram -> unique index, used to vectorize project text.
grams_to_idx = map_gram_to_idx(grams)

# Reverse map: position -> gram, so a vector slot can be turned back into text.
# Slots start out as the placeholder 0 and are filled in from the forward map.
idx_to_grams = [0 for _ in range(len(grams_to_idx))]

for gram_text, slot in grams_to_idx.items():
    idx_to_grams[slot] = gram_text

In [5]:
# Build feats + labels for model training.
feats, labels = [], []

for project in projects:
    # Only projects whose first category entry matches the chosen category.
    if project['category'][0] != category:
        continue

    # Feature row: encoding of which of the category's most common uni-, bi-,
    # and trigrams appear in this project's 'text_feats'.
    feats.append(vectorize(project['text_feats'], grams_to_idx))

    # Target: the amount pledged to the project.
    labels.append(project['pledged'])

In [6]:
# 90-10 split of feats and labels: the first 90% of rows (in their existing
# order) are training data, the remaining 10% are test data.
n_train_feats = int(len(feats) * .9)
n_train_labels = int(len(labels) * .9)

feats_train, feats_test = feats[:n_train_feats], feats[n_train_feats:]
labels_train, labels_test = labels[:n_train_labels], labels[n_train_labels:]

In [7]:
# Train model: ridge regression on the training split.
ridge_alpha = 1000  # regularization strength passed to Ridge
LR = linear_model.Ridge(alpha=ridge_alpha)

# Left as a bare expression so the notebook displays the fitted estimator.
LR.fit(feats_train, labels_train)

Ridge(alpha=1000, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [8]:
# The intercept is the model's prediction when every feature is zero.
baseline_msg = "Expected pledge amt. if given NO project txt: %.2f" % LR.intercept_

# One print with a doubled newline emits the message followed by a blank line.
print(baseline_msg, end="\n\n")

Expected pledge amt. if given NO project txt: 29249.95



In [9]:
# Pair each gram with its learned coefficient and rank from largest to
# smallest coefficient (the sort is stable, so ties keep their index order).
gram_coef_pairs = zip(idx_to_grams, LR.coef_)
zipped = sorted(gram_coef_pairs, key=lambda pair: pair[1], reverse=True)

# Tabulate the ranking; the trailing expression renders the styled table.
df = pd.DataFrame(zipped, columns=["Gram", "Monetary Impact"])
df.style

Unnamed: 0,Gram,Monetary Impact
0,board game,20916.1
1,creator,19637.3
2,board,19110.6
3,1 4,17584.9
4,co,17320.2
5,set,16447.8
6,1 4 player,16238.1
7,1,15944.2
8,new,15052.1
9,co op,13170.1
