In [409]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec

from gensim.models import Doc2Vec
from scipy.spatial import distance
from nltk.corpus import stopwords
from gensim.models.doc2vec import LabeledSentence

import xgboost as xgb
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import OneHotEncoder
from scipy.spatial import distance
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score

from jyquickhelper import add_notebook_menu
add_notebook_menu(first_level=1)

# Load and clean data

In [394]:
data = pd.read_csv("/Users/eulalieformery/Desktop/Datacamp/kickstarter-projects/ks-projects-201801.csv")
data = data.dropna(subset=['name'])

In [395]:
# create binary label = 1 -> has reached the goal 0 -> did not reach it
data['Target'] = np.where( data['usd_pledged_real']>=data['usd_goal_real'], 1, 0)

In [396]:
data.index = np.arange(0, len(data))

# Create features

From Date

In [397]:
data['launched_date'] = pd.to_datetime(data['launched'], format='%Y-%m-%d %H:%M:%S')
data['deadline_date'] = pd.to_datetime(data['deadline'], format='%Y-%m-%d %H:%M:%S')

In [398]:
#length of project
data['length'] = data['deadline_date'] - data['launched_date']
data['length'] = [d.days for d in data['length']]

In [382]:
#features with month and year of launch
data['year'] = [d.year for d in data['launched_date']]
data['month'] = [d.month for d in data['launched_date']]
data['day'] = [d.day for d in data['launched_date']]

 From name

In [399]:
#length of name
data['name_length'] = [len(name) for name in data['name']]

#number of words
data['word_number'] = [len(name.split(' ')) for name in data['name']]
    
#ponctuation
data['question'] = (data.name.str[-1] == '?').astype(int)
data['exclamation'] = (data.name.str[-1] == '!').astype(int)

#upper
data['uppercase'] = data.name.str.isupper().astype(float)

In [408]:
#number of competitors 
#make categories for the goal

In [400]:
#create dummies for categorical features
main_category = pd.get_dummies(data['main_category'],prefix='mc')
category = pd.get_dummies(data['category'], prefix = 'cat')
country = pd.get_dummies(data['country'], prefix = 'country')
currency = pd.get_dummies(data['currency'], prefix = 'currency')

data_modified = pd.concat([data, main_category, category, country, currency], axis=1)

In [401]:
data.shape

(378657, 24)

Features to drop before classification:

    usd_pledged_real, deadline, launched, pledged, P>G, backers, ID

# Binary classification

In [402]:
y = data['Target']
X = data_modified[data_modified.columns.difference(['main_category','category','country','currency',
                                  'Target','name','deadline','deadline_date','launched_date',
                                  'launched','P>G','backers','pledged','state', 'usd pledged', 
                                  'usd_goal_real', 'usd_pledged_real','ID'])]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [388]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


## XGBoost

In [390]:
xgboost_model = xgb.XGBRegressor(booster = 'gbtree',objective = 'binary:logistic', colsample_bytree = 0.9, learning_rate = 0.1,
                max_depth = 5, alpha =10, n_estimators = 50 , eval_metric = 'auc')

In [404]:
xgboost_model.fit(X_train, y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9, eval_metric='auc', gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
       n_estimators=50, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

**Results without embeddings**


In [405]:
y_pred = xgboost_model.predict(X_test)

print('accuracy', accuracy_score(y_test, [1 if a>0.5 else 0 for a in y_pred]))
print('log_loss',log_loss(y_test, y_pred))
print('roc',roc_auc_score(y_test, y_pred))
print('f1 Score',f1_score(y_test, [1 if a>0.5 else 0 for a in y_pred]))
print('precision', precision_score(y_test, [1 if a>0.5 else 0 for a in y_pred]))
print('recall', recall_score(y_test,[1 if a>0.5 else 0 for a in y_pred]))

accuracy 0.6938226750002001
log_loss 0.5751381063020411
roc 0.7350788832254196
f1 Score 0.45714204634135963
precision 0.6432022359752445
recall 0.3545738686388449


**Results with generated word2vec embeddings**

In [366]:
y_pred = xgboost_model.predict(X_test)

print('accuracy', accuracy_score(y_test, [1 if a > 0.5 else 0 for a in y_pred]))
print('log_loss',log_loss(y_test, y_pred))
print('roc',roc_auc_score(y_test, y_pred))
print('f1 Score',f1_score(y_test, [1 if a > 0.5 else 0 for a in y_pred]))
print('precision', precision_score(y_test, [1 if a > 0.5 else 0 for a in y_pred]))
print('recall', recall_score(y_test,[1 if a > 0.5 else 0 for a in y_pred]))

accuracy 0.7015213233352273
log_loss 0.5652065911418699
roc 0.7511580264024725
f1 Score 0.4769077572544564
precision 0.6572345278132127
recall 0.37422961789047365


**Results with google word2vec embeddings**

In [392]:
y_pred = xgboost_model.predict(X_test)

print('accuracy', accuracy_score(y_test, [1 if a > 0.5 else 0 for a in y_pred]))
print('log_loss',log_loss(y_test, y_pred))
print('roc',roc_auc_score(y_test, y_pred))
print('f1 Score',f1_score(y_test, [1 if a > 0.5 else 0 for a in y_pred]))
print('precision', precision_score(y_test, [1 if a > 0.5 else 0 for a in y_pred]))
print('recall', recall_score(y_test,[1 if a > 0.5 else 0 for a in y_pred]))

accuracy 0.6961834871195691
log_loss 0.5713749077191866
roc 0.7408835605723564
f1 Score 0.4604627366266841
precision 0.6497673672388897
recall 0.35657686212361334


In [65]:
names = data['name'].tolist()

# Word2vec embeddings

In [316]:
# remove dashes and apostrophes from punctuation marks 
punct = string.punctuation.replace('-', '').replace("'",'')
# regex to match intra-word dashes and intra-word apostrophes
my_regex = re.compile(r"(\b[-']\b)|[\W_]")

def clean_string(string, punct=punct, my_regex=my_regex, to_lower=False):
    if to_lower:
        string = string.lower()
    # remove formatting
    str = re.sub('\s+', ' ', string)
     # remove punctuation
    str = ''.join(l for l in str if l not in punct)
    # remove dashes that are not intra-word
    str = my_regex.sub(lambda x: (x.group(1) if x.group(1) else ' '), str)
    # strip extra white space
    str = re.sub(' +',' ',str)
    # strip leading and trailing white space
    str = str.strip()
    return str

cleaned_project_names = []
for idx, doc in enumerate(names):
    # clean
    doc = clean_string(doc, punct, my_regex, to_lower=True)
    # tokenize (split based on whitespace)
    tokens = doc.split(' ')

    # remove digits
    tokens = [''.join([elt for elt in token if not elt.isdigit()]) for token in tokens]
    # remove tokens shorter than 3 characters in size
    tokens = [token for token in tokens if len(token)>1]
    # remove tokens exceeding 25 characters in size
    tokens = [token for token in tokens if len(token)<=25]
    cleaned_project_names.append(tokens)
    

In [72]:
%%time 

model = Word2Vec(cleaned_project_names, min_count=1, size=100, workers=8)

CPU times: user 35.3 s, sys: 354 ms, total: 35.6 s
Wall time: 16.1 s


In [73]:
print('Vocab size: %d' %len(model.wv.vocab))

Vocab size: 169317


In [326]:
name_matrix = np.zeros((len(cleaned_project_names),100),dtype="float32")

for i in range(len(cleaned_project_names)):
    try:
        name_matrix[i,]=model.wv[cleaned_project_names[i]].sum(0)/len(cleaned_project_names[i]) 
    except:
        #print(cleaned_project_names[i])
        name_matrix[i,]=np.zeros((1,100),dtype="float32")
        #print(name_matrix[i,])


In [328]:
name_embeddings = pd.DataFrame(name_matrix)

In [359]:
#new X with embeddings
X = pd.concat([X, name_embeddings], axis = 1)

# Google embeddings

In [370]:
my_q = 300 # to match dim of GNews word vectors
mcount = 5
w2v = Word2Vec(size=my_q, min_count=mcount)
w2v.build_vocab(cleaned_project_names)
w2v.intersect_word2vec_format('/Volumes/HIPPEULA 1/GoogleNews-vectors-negative300.bin.gz', binary=True)


In [374]:
name_matrix = np.zeros((len(cleaned_project_names),100),dtype="float32")

for i in range(len(cleaned_project_names)):
    try:
        name_matrix[i,]=w2v.wv[cleaned_project_names[i]].sum(0)/len(cleaned_project_names[i]) 
    except:
        name_matrix[i,]=np.zeros((1,100),dtype="float32")
   


In [387]:
name_embeddings = pd.DataFrame(name_matrix)
X = pd.concat([X, name_embeddings], axis = 1)

# Doc2Vec

In [124]:
id_list = [a for a in range(len(cleaned_project_names))]

In [126]:
labeled_comments = [LabeledSentence(words=[cleaned_project_names[i]], tags=[i]) for i in  range(len(cleaned_project_names))]

  """Entry point for launching an IPython kernel.


In [130]:
model_d2v = Doc2Vec(labeled_comments)

TypeError: unhashable type: 'list'

In [78]:
model.most_similar('love')

  """Entry point for launching an IPython kernel.


[('hope', 0.8702516555786133),
 ('god', 0.8322815895080566),
 ('words', 0.8144711852073669),
 ('heart', 0.8121780157089233),
 ('being', 0.7969840168952942),
 ('happiness', 0.7956464290618896),
 ('living', 0.7927098870277405),
 ('faith', 0.7927083969116211),
 ('mind', 0.7913191318511963),
 ('jesus', 0.7850102186203003)]

In [None]:
centroids = np.mean(, axis=0).reshape(1,-1)