# DATA CLEANING - Initial steps


The code below reads in the raw Kickstarter CSV data and then:
* Removes stopwords and punctuation
* Lemmatizes all words (eg mice -> mouse, running -> run)
* Writes the processed data to JSON format

Lemmatization is computationally intensive: with a dataset of over 200,000 projects, this cell needed to run overnight. 

The recommended usage is to skip this cell and work off of the already lemmatized JSON outputs included in the repository. 


In [2]:
import pandas as pd
import json
import glob
import random
import string
import spacy
import nltk
from nltk.corpus import stopwords
# spaCy's small English pipeline; text_features() below runs every project text through it.
nlp = spacy.load("en_core_web_sm")
# NOTE: this rebinds `stopwords` from the imported NLTK corpus reader to a plain set
# holding NLTK's English stopwords plus each individual punctuation character.
stopwords = set(stopwords.words('english')).union(string.punctuation)

# Read in all of the CSV files and concatenate them into one dataset.
# The paths are sorted because glob returns them in arbitrary order; sorting keeps
# the row order of `dset` the same from one run to the next.
csv_files = sorted(glob.glob("kickstarter_data/Kickstarter*"))
subsets = [pd.read_csv(csv_file) for csv_file in csv_files]
# ignore_index=True gives every row a unique 0..n-1 label. Without it each file
# keeps its own labels, so the same label shows up once per CSV file.
dset = pd.concat(subsets, ignore_index=True)

# Convert raw text into a list of word features: the text is lower-cased, tokenized
# with spaCy, lemmatized, and stripped of stopwords and punctuation tokens.
def text_features(text):
    # Trim the ends, turn line breaks into spaces, and normalise case before tokenizing
    cleaned = text.strip().replace("\n", " ").replace("\r", " ").lower()

    def as_feature(tok):
        # A lemma of "-PRON-" marks a pronoun; keep the lower-cased token text instead
        # (NOTE(review): this placeholder looks specific to older spaCy releases - confirm)
        if tok.lemma_ == "-PRON-":
            return tok.lower_
        return tok.lemma_.lower().strip()

    candidates = (as_feature(tok) for tok in nlp(cleaned))
    # `stopwords` is the module-level set of stopwords and punctuation characters
    return [feat for feat in candidates if feat not in stopwords]

# Goes through every kickstarter project in the dataset, and writes it back to disk in json format.
# Projects are written in batches of BATCH_SIZE to kickstarter_data/data0.json, data1.json, ...
BATCH_SIZE = 1000


def _write_batch(batch, batch_number):
    # Serialise one batch of processed projects to its own numbered JSON file
    with open('kickstarter_data/data' + str(batch_number) + '.json', 'w') as outfile:
        json.dump(batch, outfile)


dump = 0        # number of JSON files written so far; doubles as the next file's suffix
projects = []   # processed projects waiting to be written
# The row label is deliberately ignored: batching counts processed rows instead, because
# labels repeat across the concatenated CSV files unless the index was reset.
for _label, item in dset.iterrows():
    # Name and blurb together form the project text; str() also covers non-string cells
    text = str(item['name']) + " " + str(item['blurb'])
    # Monetary fields are scaled by the row's fx_rate
    # (presumably a conversion to one common currency - TODO confirm against the dataset docs)
    projects.append({'pledged' : item['pledged'] * item['fx_rate'],
                     'goal'    : item['goal'] * item['fx_rate'],
                     'category': json.loads(item['category'])['slug'].split("/"),
                     'text'    : text,
                     'text_feats'    : text_features(text)})
    if len(projects) == BATCH_SIZE:
        _write_batch(projects, dump)
        dump += 1
        projects = []

# Flush the final partial batch; without this the last (< BATCH_SIZE) projects are lost
if projects:
    _write_batch(projects, dump)
    dump += 1
    projects = []

ModuleNotFoundError: No module named 'spacy'

# Data cleaning - more interesting stuff

These are all the functions needed to convert the lemmatized/tokenized word features into a format usable by scikit-learn's regression models.

The intended usage is to start running from here, after downloading the JSON-formatted features that are included in the repository. 

In [None]:
import json
import glob
import nltk
import random
# How many of the most frequent uni-, bi-, and trigrams are used as features.
# Raise or lower a count to use more or fewer grams of that type.
# NOTE: map_gram_to_idx() takes these values as default arguments, which Python evaluates
# once when the function is defined - re-run that definition after editing this.
most_useful = dict(uni=200, bi=100, tri=0)

In [None]:
# **read_data()**
# - **Func Desc:**<br>
#     This function reads in the entire Kickstarter dataset from the json files matching "kickstarter_data/data*".
# - **Return:**<br>
#     A list with one entry per project, in sorted-file-name order. Each entry is whatever was stored in the json files
#     (presumably a dictionary with the category, text, pledged amount, goal amount, and text_feats of a project - verify against the files).
def read_data():
    projects = []

    # glob returns paths in arbitrary order; sorting makes the project order
    # (and anything that slices the list by position) reproducible across runs.
    json_files = sorted(glob.glob("kickstarter_data/data*"))

    for json_file in json_files:
        # The context manager closes each file as soon as it has been parsed
        with open(json_file, 'r') as infile:
            projects += json.load(infile)

    return projects

# **grams_by_project(*list text*)**
# - **Func Desc:**<br>
#     Builds the unigrams, bigrams, and trigrams of one project from its word features *text*.
#     Empty strings and possessive endings ("'s" / "’s") are dropped first. The first kept word
#     is paired with a '<SOS>' start marker, so k kept words give k bigrams and max(k - 1, 0) trigrams.
# - **Return:**<br>
#     A dictionary with the keys "uni", "bi" and "tri", each holding a list of grams in text order;
#     the words of a bigram or trigram are joined by single spaces.
def grams_by_project(text):
    ignored = ("'s", '’s', '')
    kept = [tok for tok in text if tok not in ignored]

    # Shift the kept words right by one behind the start-of-sentence marker, so that
    # zipping the shifted and unshifted lists lines up neighbouring words.
    shifted = ['<SOS>'] + kept

    bigrams = [first + " " + second for first, second in zip(shifted, kept)]
    trigrams = [first + " " + second + " " + third
                for first, second, third in zip(shifted, kept, kept[1:])]

    return {"uni": kept, "bi": bigrams, "tri": trigrams}

# **grams_by_category(*list projects*, *string category*, **[optional]** *int n*, **[optional]** *boolean do_print*)**
# - **Func Desc:**<br>
#     This function counts the unigrams, bigrams, and trigrams of every project in *projects* whose category list
#     contains *category* (pass 'all' to use every project). If *do_print* is set, the *n* most common
#     unigrams, bigrams, and trigrams are printed together with their counts.
# - **Return:**<br>
#     A dictionary with the keys "uni", "bi" and "tri", each holding an nltk.FreqDist of the grams of that type.
def grams_by_category(projects, category, n=15, do_print=True):
    all_words = []
    all_bigrams = []
    all_trigrams = []

    for project in projects:
        # 'all' selects every project; otherwise the category may match at any
        # position of the project's category list
        if category != 'all' and category not in project['category']:
            continue

        proj_grams = grams_by_project(project['text_feats'])

        all_words += proj_grams["uni"]
        all_bigrams += proj_grams["bi"]
        all_trigrams += proj_grams["tri"]

    # Count each gram type once; the same distributions are printed and returned
    grams = {
        "uni": nltk.FreqDist(all_words),
        "bi": nltk.FreqDist(all_bigrams),
        "tri": nltk.FreqDist(all_trigrams),
    }

    if do_print:
        sections = (("-- UNIGRAMS --", "uni"),
                    ("-- BIGRAMS --", "bi"),
                    ("-- TRIGRAMS --", "tri"))

        for position, (title, kind) in enumerate(sections):
            # Blank line between sections, but not before the first one
            if position:
                print()
            print(title)

            for gram, count in grams[kind].most_common(n):
                print(gram, "\t", count)

    return grams


# **map_gram_to_idx(*dictionary grams_dict*, **[optional]** num_uni, **[optional]** num_bi, **[optional]** num_tri)**
# - **Func Desc:**<br>
#     Given the frequency distributions stored under "uni", "bi" and "tri" in *grams_dict*, this function assigns
#     consecutive integer indices, starting at 0, to the *num_uni* most common unigrams, then the *num_bi* most common
#     bigrams, then the *num_tri* most common trigrams. The mapping is later used to vectorize projects.
# - **Return:**<br>
#     A dictionary mapping each selected gram to its integer index.
def map_gram_to_idx(grams_dict, num_uni=most_useful["uni"], 
                      num_bi=most_useful["bi"], 
                      num_tri=most_useful["tri"]):
    gram_to_idx = {}
    next_idx = 0

    # Walk the gram types in a fixed order so the index ranges are laid out as
    # unigrams, then bigrams, then trigrams.
    for kind, limit in (("uni", num_uni), ("bi", num_bi), ("tri", num_tri)):
        for gram, _freq in grams_dict[kind].most_common(limit):
            gram_to_idx[gram] = next_idx
            next_idx += 1

    return gram_to_idx


# **vectorize(*dictionary project*, *dictionary gram_to_idx*)**
# - **Func Desc:**<br>
#     Encodes one project as a feature list. For every uni-, bi-, and trigram of the project's 'text_feats'
#     that appears in *gram_to_idx*, the slot at that gram's index is set to 1; all other gram slots stay 0.
#     One extra slot at the end holds the project's 'goal'.
# - **Return:**<br>
#     A list of length len(gram_to_idx) + 1: 0/1 presence flags followed by the goal amount.
def vectorize(project, gram_to_idx):
    proj_grams = grams_by_project(project['text_feats'])

    # Presence flags for every known gram, with the goal stored in the trailing slot
    feats = [0] * len(gram_to_idx) + [project['goal']]

    hits = (gram_to_idx[gram]
            for gram_list in proj_grams.values()
            for gram in gram_list
            if gram in gram_to_idx)

    for slot in hits:
        feats[slot] = 1

    return feats



# Data Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.metrics import mean_squared_error

Load in all of the data, and then find the top 10 most common Kickstarter categories to analyze. 
We decided to analyze each category separately, so that we could get a more fine-grained description of which kinds of projects are the most appealing within each category. 

In [None]:
def build_feats(projects, category):
    """Build the training features and labels for one category.

    Args:
        projects: list of project dictionaries with the keys 'category',
            'text_feats', 'goal' and 'pledged'.
        category: category name to select, or 'all' for every project.

    Returns:
        A tuple (idx_to_grams, feats, labels). idx_to_grams[i] is the gram
        encoded in feature column i; feats holds one vectorize() encoding per
        selected project; labels holds the matching pledged amounts.
    """
    # Count the grams of the category (without printing them)
    grams = grams_by_category(projects, category, do_print=False)

    # Map grams to unique index for easy vectorization
    grams_to_idx = map_gram_to_idx(grams)

    # Map unique index to gram to quickly convert vectorization to txt
    idx_to_grams = [0] * len(grams_to_idx)

    for gram, idx in grams_to_idx.items():
        idx_to_grams[idx] = gram

    # Build feats + labels for model training
    feats = []
    labels = []

    for project in projects:
        # Select projects the same way grams_by_category does: the category may
        # appear at any position of the project's category list. Comparing only
        # the first entry would leave sub-categories with no rows at all.
        if category == 'all' or category in project['category']:
            feats.append(vectorize(project, grams_to_idx))

            # Label represents amt pledged
            labels.append(project['pledged'])

    return idx_to_grams, feats, labels

In [None]:
def create_model(projects, category, validate=False):
    """Fit a Ridge regression from gram features to the pledged amount.

    Args:
        projects: list of project dictionaries, as accepted by build_feats().
        category: category name to model, or 'all' for every project.
        validate: when True, print the mean squared error on the held-out
            last 10% of the selected projects.

    Returns:
        A tuple (model, word_corrs). model is the fitted Ridge estimator;
        word_corrs is a list of (gram, coefficient) pairs sorted from the
        largest coefficient to the smallest. zip() stops at the shorter input,
        so the coefficient of the trailing goal feature is not included.
    """
    idx_to_grams, feats, labels = build_feats(projects, category)

    # 90-10 split feats and labels; 90% training data and 10% test data.
    # The split is positional (no shuffling): the first 90% are used for training.
    split = int(len(feats) * .9)
    feats_train, feats_test = feats[:split], feats[split:]
    labels_train, labels_test = labels[:split], labels[split:]

    model = linear_model.Ridge(alpha=1000)     # Initialize model
    model.fit(feats_train, labels_train)       # Train model

    # If validate=True, then validate model using 10% of data
    if validate:
        predictions = model.predict(feats_test)

        # scikit-learn's documented argument order is (y_true, y_pred)
        MSE = mean_squared_error(labels_test, predictions)
        print("MSE:", MSE)

    word_corrs = sorted(zip(idx_to_grams, model.coef_), key=lambda t: -t[1])

    return model, word_corrs

In [None]:
# Load every processed project from the json files read by read_data()
projects = read_data()

In [None]:
# Count how often each category name occurs across all projects; every entry of a
# project's category list is counted.
all_categories = nltk.FreqDist(
    label for project in projects for label in project['category']
)

# Keep only the names of the 10 most frequent categories
top_10_categories = [name for name, _count in all_categories.most_common(10)]

In [None]:
# Fit one model per top category and collect, for each category:
#   grams[category] - a table of grams with their fitted coefficients
#   coefs           - rows of [category, intercept, coefficient of the goal feature]
grams = {}
coefs = []

for category in top_10_categories:
    LR, corrs = create_model(projects, category)

    # corrs holds (gram, coefficient) pairs; split them into two table columns
    grams[category] = pd.DataFrame({
        'grams': [gram for gram, _weight in corrs],
        'monetary_impact': [weight for _gram, weight in corrs],
    })

    # The last coefficient belongs to the goal feature, which vectorize() stores last
    coefs.append([category, LR.intercept_, LR.coef_[-1]])

In [None]:
# Place the per-category gram tables side by side (axis=1), keyed by category name
grams_df = pd.concat(grams, axis=1, keys=top_10_categories)
# One row per category, indexed by category: the model intercept and the goal coefficient
coefs_df = pd.DataFrame.from_records(coefs, columns=['category', 'intercept', 'goal_v_raised'], index='category')

In [None]:
# Show the per-category gram table (as the cell's last expression, presumably rendered by the notebook)
grams_df.style

In [None]:
# Show the per-category intercept / goal-coefficient table (presumably rendered by the notebook)
coefs_df.style