# Text Modeling

In [1]:
import sys
import os
sys.path.append(os.path.abspath('../data'))
import pathlib
import json
from datetime import datetime

import numpy as np
import pandas as pd
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
nltk.download('punkt')

import seaborn as sns
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to C:\Users\Han-chung
[nltk_data]     Lee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 1. Data Loading

In [2]:
# Directory holding the Kickstarter JSON snapshots, relative to this notebook.
data_root = pathlib.Path('..', 'data')

In [3]:
# Collect every JSON snapshot under data_root as plain strings.
# glob() yields entries in filesystem order, which is not guaranteed to be
# stable; sorting makes all_json_paths[0] refer to the same file on every run.
all_json_paths = [str(path) for path in sorted(data_root.glob('*.json'))]

In [4]:
# Display the discovered snapshot paths to confirm what is available to load.
all_json_paths

['..\\data\\Kickstarter_2019-01-17T03_20_02_630Z.json',
 '..\\data\\Kickstarter_2019-02-14T03_20_04_734Z.json',
 '..\\data\\Kickstarter_2019-03-14T03_20_12_200Z.json',
 '..\\data\\Kickstarter_2019-04-18T03_20_02_220Z.json',
 '..\\data\\Kickstarter_2019-05-16T03_20_20_822Z.json',
 '..\\data\\Kickstarter_2019-06-13T03_20_35_801Z.json',
 '..\\data\\Kickstarter_2019-07-18T03_20_05_009Z.json',
 '..\\data\\Kickstarter_2019-08-15T03_20_03_022Z.json']

In [5]:
%%time
data = []
for line in open(all_json_paths[0], 'r', encoding='utf8'):
    data.append(json.loads(line))
    
data = [record['data'] for record in data]
df = pd.DataFrame.from_records(data)

Wall time: 22.7 s


In [6]:
# Keep the column names as a plain Python list for later reference.
cols = list(df.columns)

# 2. Text Data Transformation

There are two ways to tackle the `name` and `blurb` text fields. First, we can train two separate supervised learning regression models that map from preprocessed text to the `state` of the campaign, with successful being 1 and failed being 0. Those two models would then each output a value with respect to the state of the campaign. We would then be able to use those values in the overall model, together with the other features, to make an inference. This, in our opinion, would take a significant amount of time to train.

The proposed way is to transform both `name` and `blurb` into feature vectors using a word2vec (or, in this case, doc2vec) deep learning NLP model. The gensim library has pre-built word2vec, sentence2vec, and doc2vec models. So, instead of building our own model to output some score, we choose to first experiment with gensim's built-in functionality to shorten the research time.

In [7]:
# Pull the campaign blurbs out of the frame as a plain list of strings.
blurbs = df['blurb'].tolist()

In [8]:
# Preview the first ten blurbs.
blurbs[:10]

["In the year 2025, a young man's superintelligent AI system tells him that he must go on a date or face certain suicide from loneliness.",
 'Replacement Place needs your support to combat some unexpected costs. Help us finish this exciting new performance project!',
 'Pumpkin Spice Love - A Pumpkin Spice 4 Life 30mm Gold Plated Enamel Pin!',
 'Single mama and her 4-year old sail with all-female crew, writing and photographing about environmental stewardship along the way.',
 'SparKit wishes to produce kit-set Wimshurst Machines (Electrostatic generator) for educational use which are simple, quick and reliable',
 'This funding is esential to completing a graduate thesis about the potential for eco-tourism to protect rainforests in Guyana',
 "Everyday for the next 40 days I will be creating a new digital portrait, I've done 50 so far & the feedbacks been absolutely amazing!",
 'Me& Magazine is a quarterly world culture/arts magazine featuring four different cities each issue. Politics, 

In [9]:
%time tagged_blurbs = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(blurbs)]

Wall time: 49.6 s


In [10]:
# NOTE(review): scratch cell that only prints 0 and 1; nothing else in the
# visible notebook depends on it, so it looks like leftover debugging.
for a in range(2):
    print(a)

0
1


In [11]:
# fixed learning rate alpha. set vec_size to 10 to reduce the number of the parameters
# for next layer model input.

max_epochs = 50
vec_size = 10
alpha = 0.025


model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                workers=8,
                epochs=max_epochs,
                min_alpha=alpha,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_blurbs)

for epoch in range(max_epochs):
    print(f'iteration {epoch}')
    %time model.train(tagged_blurbs, total_examples=model.corpus_count, epochs=model.epochs)
    
    model.save("d2v.model")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11


KeyboardInterrupt: 

In [12]:
# Reload the model from the "d2v.model" file written by the training cell.
model=Doc2Vec.load("d2v.model")

In [13]:
# Tokenize a lowercased sample sentence, using the same lower() + word_tokenize
# preprocessing that was applied to the training blurbs.
test1 = word_tokenize("I love kickstarter".lower())

In [14]:
# Infer an embedding vector for the sample token list with the loaded model.
v1 = model.infer_vector(test1)

In [15]:
# Display the inferred vector.
v1

array([ 0.16322716, -0.20133977,  0.689475  , -0.5065823 ,  0.06438633,
       -0.82866377, -0.8632626 , -1.7119482 ,  0.54101616,  1.1054113 ],
      dtype=float32)

# NOTE(review): this loads from ../models/, whereas the training cell saves to
# "d2v.model" in the working directory; confirm which file is intended.
model= Doc2Vec.load("../models/d2v.model")
# Infer a vector for a document that is not in the training data.
test_data = word_tokenize("I love chatbots".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

# Find the documents most similar to the training document tagged '1'.
similar_doc = model.docvecs.most_similar('1')
print(similar_doc)


# Print the stored vector of the training document tagged '1'
# (tags are the stringified blurb positions assigned when tagged_blurbs was built).
print(model.docvecs['1'])