In [None]:
from preprocessor import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [None]:
def build_feats(projects, category):
    """Build the vocabulary lookup, feature vectors and labels for one category.

    Parameters
    ----------
    projects : iterable of dict
        Project records. Each one is indexed here with 'category'
        (first element compared against ``category``) and 'pledged'.
    category : str
        Category to keep, or 'all' to keep every project.

    Returns
    -------
    idx_to_grams : list
        Gram stored at the position given by its index in the gram -> index map.
    feats : list
        One ``vectorize`` encoding per kept project.
    labels : list
        The 'pledged' value of each kept project, aligned with ``feats``.
    """
    # Most common grams for the category (printing disabled), mapped to indices.
    vocab = map_gram_to_idx(grams_by_category(projects, category, do_print=False))

    # Invert the mapping so a vector position can be turned back into its gram.
    idx_to_grams = [0] * len(vocab)
    for gram, position in vocab.items():
        idx_to_grams[position] = gram

    feats, labels = [], []
    for project in projects:
        # Only a project's first listed category is compared.
        in_scope = project['category'][0] == category or category == 'all'
        if not in_scope:
            continue

        feats.append(vectorize(project, vocab))
        labels.append(project['pledged'])  # label = amount pledged

    return idx_to_grams, feats, labels

In [None]:
def create_model(projects, category, validate=False, alpha=1000, train_frac=0.9):
    """Fit a ridge regression of amount pledged on gram features for one category.

    Parameters
    ----------
    projects : iterable of dict
        Project records, passed straight to ``build_feats``.
    category : str
        Category to model ('all' keeps every project, see ``build_feats``).
    validate : bool, default False
        If True, print the mean squared error on the held-out rows.
    alpha : float, default 1000
        Regularization strength handed to ``linear_model.Ridge``.
    train_frac : float, default 0.9
        Fraction of rows (taken from the front) used for training; the rest
        is held out. Must lie in (0, 1].

    Returns
    -------
    model : linear_model.Ridge
        The fitted model.
    word_corrs : list of (gram, coefficient)
        Pairs sorted by descending coefficient.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside (0, 1].
    """
    if not 0 < train_frac <= 1:
        raise ValueError("train_frac must be in (0, 1], got %r" % (train_frac,))

    idx_to_grams, feats, labels = build_feats(projects, category)

    # feats and labels are built in lockstep, so one split index serves both.
    # The split is positional: rows are not shuffled before splitting.
    split = int(len(feats) * train_frac)
    feats_train, feats_test = feats[:split], feats[split:]
    labels_train, labels_test = labels[:split], labels[split:]

    model = linear_model.Ridge(alpha=alpha)    # Initialize model
    model.fit(feats_train, labels_train)       # Train model

    # If validate=True, score the model on the held-out rows
    if validate:
        if len(feats_test):
            predictions = model.predict(feats_test)

            # Documented argument order is (y_true, y_pred).
            MSE = mean_squared_error(labels_test, predictions)
            print("MSE:", MSE)
        else:
            # Only reachable when train_frac leaves nothing to hold out.
            print("MSE: n/a (no held-out rows)")

    # zip stops at the shorter input, so any coefficients beyond the gram
    # vocabulary are not paired with a gram.
    word_corrs = sorted(zip(idx_to_grams, model.coef_), key=lambda t: -t[1])

    return model, word_corrs

In [None]:
# Load the project records that every later cell iterates over.
# read_data is not defined in this notebook; presumably it comes from
# `from preprocessor import *` — TODO confirm.
projects = read_data()

In [None]:
# Flatten every project's category list into one long list of mentions
category_mentions = []
for project in projects:
    category_mentions.extend(project['category'])

# Count how often each category occurs
all_categories = nltk.FreqDist(category_mentions)

# Keep the names of the 10 most frequent categories, most frequent first
top_10_categories = [name for name, _count in all_categories.most_common(10)]

In [None]:
# How many grams to keep from each end of the coefficient ranking.
N_TOP_GRAMS = 10

# One summary frame per category: strongest positive grams first, then the
# strongest negative grams (most negative first).
models = {}

for category in top_10_categories:
    LR, corrs = create_model(projects, category)

    # corrs is sorted by descending coefficient, so the head holds the most
    # positive grams and the reversed tail the most negative ones.
    top = corrs[:N_TOP_GRAMS]
    # Stop at -(N + 1) so exactly N items are taken; stopping at -N would
    # yield only N - 1. With fewer than 2 * N grams the two ends overlap.
    bottom = corrs[:-(N_TOP_GRAMS + 1):-1]
    ranked = top + bottom

    models[category] = pd.DataFrame({
        'grams': [gram for gram, _ in ranked],
        'monetary_impact': [coef for _, coef in ranked],
        # Scalars below are broadcast to every row of the frame.
        'intercept': LR.intercept_,
        # Presumably the last feature column is the project goal — verify
        # against vectorize() in preprocessor.
        'goal_vs_raised': LR.coef_[-1],
    })

In [None]:
# Place the per-category frames side by side (axis=1). The category names
# become the outer column level, above each frame's own columns
# ('grams', 'monetary_impact', 'intercept', 'goal_vs_raised').
df = pd.concat(models, axis=1, keys=top_10_categories)

In [None]:
# Bar chart of monetary impact per gram.
# NOTE(review): `df_bottom` is not defined in any visible cell, so this cell
# fails on a fresh kernel unless it is created elsewhere — confirm where a
# frame with 'Gram' and 'Monetary Impact' columns is supposed to come from.
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(df_bottom['Gram'], df_bottom['Monetary Impact'])
ax.set_xlabel("Gram")
ax.set_ylabel("Monetary Impact")
# A second plt.figure() call here would open an extra, empty figure before
# show(), so the chart is drawn on the single figure created above.
plt.show()

In [None]:
# Project lookup: print every project in `category` whose text features
# contain `word`. Useful when a gram in the results looks surprising.
word = 'camera'
category = 'games'
for project in projects:
    if word not in project['text_feats'] or category not in project['category']:
        continue

    print(project['text'])
    print(project['pledged'], "$ / ", project['goal'], "$")
    print()

In [None]:
# Line plot of monetary impact per gram for the first 10 rows of df.
# NOTE(review): no visible cell gives df 'Gram' / 'Monetary Impact' columns
# (the per-category frames use 'grams' / 'monetary_impact' under a category
# level) — confirm df is rebuilt elsewhere before relying on this cell.
sns.lineplot(x= 'Gram', y='Monetary Impact',data=df.head(10))

In [None]:
# Bar plot of monetary impact per gram for the first 10 rows of df.
# NOTE(review): no visible cell gives df 'Gram' / 'Monetary Impact' columns
# (the per-category frames use 'grams' / 'monetary_impact' under a category
# level) — confirm df is rebuilt elsewhere before relying on this cell.
sns.barplot(x = 'Gram', y = 'Monetary Impact', data = df.head(10))