# Text Modeling

In [24]:
import sys
import os
sys.path.append(os.path.abspath('../data'))
import pathlib
import json
from datetime import datetime

import numpy as np
import pandas as pd
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
nltk.download('punkt')

import seaborn as sns
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to C:\Users\Han-chung
[nltk_data]     Lee\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


# 1. Data Loading

In [10]:
# Directory that is searched for the Kickstarter JSON snapshot files
# (relative to the notebook's working directory).
data_root = pathlib.Path('../data')

In [11]:
# Collect every JSON snapshot under `data_root` as plain string paths.
# Path.glob() yields entries in whatever order the file system returns them,
# and a later cell reads `all_json_paths[0]`, so sort to make "the first
# snapshot" the same file on every machine. The file names shown in this
# notebook's output embed a timestamp, so lexicographic order is also
# chronological for them.
all_json_paths = sorted(str(path) for path in data_root.glob('*.json'))

In [12]:
# Display the discovered snapshot paths to confirm which files are available.
all_json_paths

['..\\data\\Kickstarter_2019-01-17T03_20_02_630Z.json',
 '..\\data\\Kickstarter_2019-02-14T03_20_04_734Z.json',
 '..\\data\\Kickstarter_2019-03-14T03_20_12_200Z.json',
 '..\\data\\Kickstarter_2019-04-18T03_20_02_220Z.json',
 '..\\data\\Kickstarter_2019-05-16T03_20_20_822Z.json',
 '..\\data\\Kickstarter_2019-06-13T03_20_35_801Z.json',
 '..\\data\\Kickstarter_2019-07-18T03_20_05_009Z.json',
 '..\\data\\Kickstarter_2019-08-15T03_20_03_022Z.json']

In [13]:
%%time
# Load the first snapshot file. Each non-empty line is parsed as one JSON
# object, and the project record itself lives under that object's 'data' key.
# The `with` block guarantees the file handle is closed even if a line fails
# to parse (the bare `open()` in a for-loop never closed it explicitly).
with open(all_json_paths[0], 'r', encoding='utf8') as snapshot_file:
    data = [
        json.loads(line)['data']
        for line in snapshot_file
        if line.strip()  # tolerate blank lines instead of crashing in json.loads
    ]

# One row per project record.
df = pd.DataFrame.from_records(data)

Wall time: 27 s


In [14]:
# Preview the first rows of the loaded records as a quick sanity check.
df.head()

Unnamed: 0,id,photo,name,blurb,goal,pledged,state,slug,disable_communication,country,...,location,category,profile,spotlight,urls,source_url,friends,is_starred,is_backing,permissions
0,320897621,{'key': 'assets/013/884/141/51d672393d941b28a9...,A Date in 2025 - A sci-fi comedy short film,"In the year 2025, a young man's superintellige...",4000.0,5265.82,successful,a-date-in-2025-a-sci-fi-comedy-short-film,False,US,...,"{'id': 2442047, 'name': 'Los Angeles', 'slug':...","{'id': 32, 'name': 'Shorts', 'slug': 'film & v...","{'id': 2696093, 'project_id': 2696093, 'state'...",True,{'web': {'project': 'https://www.kickstarter.c...,https://www.kickstarter.com/discover/categorie...,,,,
1,1504102568,{'key': 'assets/012/061/827/c4a4d3b58093abe6a7...,Replacement Place | New York Live Arts,Replacement Place needs your support to combat...,5000.0,5225.0,successful,replacement-place-new-york-live-arts,False,US,...,"{'id': 2459115, 'name': 'New York', 'slug': 'n...","{'id': 254, 'name': 'Performances', 'slug': 'd...","{'id': 1755245, 'project_id': 1755245, 'state'...",True,{'web': {'project': 'https://www.kickstarter.c...,https://www.kickstarter.com/discover/categorie...,,,,
2,284827571,{'key': 'assets/018/209/038/4d46d6bed9b83d4e9a...,Pumpkin Spice Love!,Pumpkin Spice Love - A Pumpkin Spice 4 Life 30...,250.0,362.0,successful,pumpkin-spice-love,False,US,...,"{'id': 2406949, 'name': 'Franklin', 'slug': 'f...","{'id': 262, 'name': 'Accessories', 'slug': 'fa...","{'id': 3132385, 'project_id': 3132385, 'state'...",True,{'web': {'project': 'https://www.kickstarter.c...,https://www.kickstarter.com/discover/categorie...,,,,
3,1384659155,{'key': 'assets/012/230/880/ae49c9a5a737288f0f...,Pirate Mama,Single mama and her 4-year old sail with all-f...,15000.0,15678.0,successful,pirate-mama-setting-sail-with-her-little-boy,False,US,...,"{'id': 2356566, 'name': 'Asheville', 'slug': '...","{'id': 13, 'name': 'Journalism', 'slug': 'jour...","{'id': 2079775, 'project_id': 2079775, 'state'...",True,{'web': {'project': 'https://www.kickstarter.c...,https://www.kickstarter.com/discover/categorie...,,,,
4,1244590413,{'key': 'assets/014/686/681/f58b7c12e1896e532c...,SparKit - Miniature Electrostatic Generator,SparKit wishes to produce kit-set Wimshurst Ma...,2500.0,11412.0,successful,sparkit-miniature-electrostatic-generator,False,NZ,...,"{'id': 2348327, 'name': 'Christchurch', 'slug'...","{'id': 334, 'name': 'DIY Electronics', 'slug':...","{'id': 2688334, 'project_id': 2688334, 'state'...",True,{'web': {'project': 'https://www.kickstarter.c...,https://www.kickstarter.com/discover/categorie...,,,,


In [15]:
# Materialise the DataFrame's column labels as a plain Python list.
cols = list(df.columns)

In [16]:
# Show every column name available in the loaded DataFrame.
cols

['id',
 'photo',
 'name',
 'blurb',
 'goal',
 'pledged',
 'state',
 'slug',
 'disable_communication',
 'country',
 'currency',
 'currency_symbol',
 'currency_trailing_code',
 'deadline',
 'state_changed_at',
 'created_at',
 'launched_at',
 'staff_pick',
 'is_starrable',
 'backers_count',
 'static_usd_rate',
 'usd_pledged',
 'converted_pledged_amount',
 'fx_rate',
 'current_currency',
 'usd_type',
 'creator',
 'location',
 'category',
 'profile',
 'spotlight',
 'urls',
 'source_url',
 'friends',
 'is_starred',
 'is_backing',
 'permissions']

# 2. Text Data Transformation

There are two ways to tackle the `name` and `blurb` text fields. First, we can train two separate supervised learning regression models that map from preprocessed text to the `state` of the campaign, with successful being 1 and failed being 0. Those two models would each output a value with respect to the state of the campaign. We would then be able to use those values in the overall model, together with the other features, to make an inference. This, in our opinion, would take a significant amount of time to train.

The proposed approach is to transform both `name` and `blurb` into feature vectors using a word2vec-style (in this case, doc2vec) deep learning NLP model. The gensim library has pre-built word2vec, sentence2vec, and doc2vec models. So, instead of building our own model to output some score, we choose to first experiment with gensim's built-in functionality to shorten the research time.

In [19]:
# Pull the blurb column out of the DataFrame as a plain Python list.
blurbs = df['blurb'].tolist()

In [21]:
# Peek at the first ten blurbs to see what the raw text looks like.
blurbs[:10]

["In the year 2025, a young man's superintelligent AI system tells him that he must go on a date or face certain suicide from loneliness.",
 'Replacement Place needs your support to combat some unexpected costs. Help us finish this exciting new performance project!',
 'Pumpkin Spice Love - A Pumpkin Spice 4 Life 30mm Gold Plated Enamel Pin!',
 'Single mama and her 4-year old sail with all-female crew, writing and photographing about environmental stewardship along the way.',
 'SparKit wishes to produce kit-set Wimshurst Machines (Electrostatic generator) for educational use which are simple, quick and reliable',
 'This funding is esential to completing a graduate thesis about the potential for eco-tourism to protect rainforests in Guyana',
 "Everyday for the next 40 days I will be creating a new digital portrait, I've done 50 so far & the feedbacks been absolutely amazing!",
 'Me& Magazine is a quarterly world culture/arts magazine featuring four different cities each issue. Politics, 

In [26]:
%time tagged_blurbs = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(blurbs)]

Wall time: 1min 2s


In [31]:
%%timeit
max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_blurbs)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    %time model.train(tagged_blurbs, total_examples=model.corpus_count, epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

    model.save("d2v.model")

iteration 0
Wall time: 2min 8s
iteration 1
Wall time: 2min 9s
iteration 2
Wall time: 2min 7s
iteration 3
Wall time: 2min 5s
iteration 4
Wall time: 2min 12s
iteration 5
Wall time: 2min 6s
iteration 6
Wall time: 2min 6s
iteration 7
Wall time: 2min 8s
iteration 8
Wall time: 1min 59s
iteration 9
Wall time: 2min
iteration 10
Wall time: 1min 59s
iteration 11
Wall time: 2min
iteration 12
Wall time: 2min 1s
iteration 13
Wall time: 1min 57s
iteration 14
Wall time: 1min 58s
iteration 15
Wall time: 2min 3s
iteration 16
Wall time: 2min 4s
iteration 17
Wall time: 2min 1s
iteration 18
Wall time: 2min 8s
iteration 19
Wall time: 2min 12s
iteration 20
Wall time: 2min 9s
iteration 21
Wall time: 2min 6s
iteration 22
Wall time: 2min 4s
iteration 23
Wall time: 2min 4s
iteration 24
Wall time: 2min 1s
iteration 25
Wall time: 2min
iteration 26
Wall time: 2min 1s
iteration 27
Wall time: 1min 59s
iteration 28
Wall time: 2min 1s
iteration 29
Wall time: 2min 8s
iteration 30
Wall time: 2min 6s
iteration 31
Wall ti

KeyboardInterrupt: 

iteration 4


KeyboardInterrupt: 

iteration 5
Wall time: 1min 56s
iteration 6
Wall time: 1min 52s
iteration 7
Wall time: 1min 56s
iteration 8
Wall time: 1min 55s
iteration 9
Wall time: 1min 55s
iteration 10
Wall time: 1min 53s
iteration 11
Wall time: 2min 12s
iteration 12
Wall time: 2min 42s
iteration 13
Wall time: 2min 53s
iteration 14


KeyboardInterrupt: 

iteration 15


KeyboardInterrupt: 

KeyboardInterrupt: 