In [1]:
import pandas as pd
import numpy as np
import altair, seaborn

In [2]:
import json
import pprint
import time
import re
from datetime import datetime

In [3]:
import gensim
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package stopwords to /Users/mike/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import sklearn

In [5]:
%load_ext sql

In [6]:
%sql postgresql://mike:@localhost/bounties

  """)


'Connected: mike@bounties'

In [7]:
bounties_data_raw = %sql select * from std_bounties_bounty
fufillment_data_raw = %sql select distinct bounty_id, accepted from std_bounties_fulfillment where accepted 

 * postgresql://mike:***@localhost/bounties
955 rows affected.
 * postgresql://mike:***@localhost/bounties
458 rows affected.


In [8]:
bounties_df = bounties_data_raw.DataFrame()
fufillment_df = fufillment_data_raw.DataFrame()

In [9]:
print(len(fufillment_df))
print(len(set(fufillment_df.bounty_id)))

458
458


In [10]:
bounties_df.columns

Index(['id', 'bounty_id', 'created', 'modified', 'deadline', 'data', 'issuer',
       'arbiter', 'fulfillmentAmount', 'paysTokens', 'bountyStage',
       'old_balance', 'balance', 'title', 'description', 'bounty_created',
       'tokenSymbol', 'tokenDecimals', 'tokenContract', 'usd_price',
       'issuer_name', 'issuer_email', 'issuer_githubUsername',
       'issuer_address', 'sourceFileName', 'sourceFileHash',
       'sourceDirectoryHash', 'webReferenceURL', 'platform', 'schemaVersion',
       'schemaName', 'data_categories', 'data_issuer', 'data_json', 'token_id',
       'tokenLockPrice', 'calculated_balance', 'calculated_fulfillmentAmount',
       'user_id'],
      dtype='object')

In [11]:
set(bounties_df["platform"])

{'bounties-network', 'colorado', 'gitcoin', 'hiring'}

In [12]:
def get_skill_levels(row, md_fieldname):
    # TODO: this is very dangerous and needs to be fixed by changing the table schema/API but we'll tolerate it for now
    decoded = json.loads(eval(row))
    payload = decoded.get('payload', None)
    if payload == None:
        return None
    result = payload.get('metadata', {md_fieldname:''})[md_fieldname]
    if result == '':
        return None
    else:
        return result

In [13]:
def get_days_to_deadline(row):
    # implement some cleanup logic in time windows
    days_to_deadline = row/(60.0*60.0*24.0)
    # negative deadlines are mistakes due to API changes
    if days_to_deadline < 0:
        return 0
    # assume that anything > 5y is infinte
    if days_to_deadline > 5*365:
        return 0
    return days_to_deadline


In [14]:
bounties_df['experience_level'] = bounties_df.loc[:,'data_json'].apply(get_skill_levels, md_fieldname="experienceLevel")
bounties_df['bounty_type'] = bounties_df.loc[:,'data_json'].apply(get_skill_levels, md_fieldname="bountyType")
bounties_df['created_ts'] = bounties_df.loc[:,'bounty_created'].apply(lambda x: int(time.mktime(x.timetuple())))
bounties_df['deadline_ts'] = bounties_df.loc[:,'deadline'].apply(lambda x: int(time.mktime(x.timetuple())))
bounties_df['seconds_to_deadline'] = (bounties_df['deadline_ts'] - bounties_df['created_ts'])
bounties_df['days_to_deadline'] = bounties_df['seconds_to_deadline'].apply(get_days_to_deadline)

bounties_df = bounties_df.merge(fufillment_df[['accepted', 'bounty_id']], how='left', left_on='id', right_on='bounty_id')

raw_training_data = bounties_df[['id', 'days_to_deadline', 'description', 'data_categories', 
                                 'experience_level', 'bounty_type', 'platform', 'accepted', 'usd_price', 'title', 
                                'tokenSymbol']]

In [15]:
#?pd.Categorical

In [16]:
urls.findall("https://www.google.com/foobar?param=1&param=2")

NameError: name 'urls' is not defined

In [17]:
nonwords = re.compile('[^a-zA-Z \n]')
double_space = re.compile('\ {2,}')
double_newline = re.compile("\n{2,}")
urls = re.compile("https?://[0-9a-zA-Z\-\.\/\?\&\=]+")
def clean_text(row):
    urls_removed = urls.sub('', row).lower()
    non_markdown = nonwords.sub(' ', urls_removed)
    single_space = double_space.sub(' ', non_markdown)
    single_newline = double_newline.sub('\n', single_space)
    description_list = single_newline.split(" ")
    filtered_words = [word for word in description_list if word not in stopwords.words('english')]
    return " ".join(filtered_words)
    

In [19]:
# encode categorical features into ints (I think there might be a function to do this better)
platforms = list(set(raw_training_data.platform.values))
bounty_types = list(set(raw_training_data.bounty_type.values))
experience_levels = list(set(raw_training_data.experience_level.values))
tokens = list(set(raw_training_data.tokenSymbol.values))
print(platforms, bounty_types, experience_levels, tokens)

raw_training_data['platform_enc'] = raw_training_data.loc[:,'platform'].apply(lambda x: platforms.index(x))
raw_training_data['bounty_type_enc'] = raw_training_data.loc[:,'bounty_type'].apply(lambda x: bounty_types.index(x))
raw_training_data['experience_level_enc'] = raw_training_data.loc[:,'experience_level'].apply(lambda x: experience_levels.index(x))
raw_training_data['token_enc'] = raw_training_data.loc[:,'tokenSymbol'].apply(lambda x: tokens.index(x))

['colorado', 'hiring', 'bounties-network', 'gitcoin'] ['Code Review', None, 'Documentation', 'Bug', 'Improvement', 'Feature', 'Other', 'Security', 'Andere'] ['Beginner', None, 'Intermediate', 'Mittlere', 'Advanced'] ['LOVE', 'GIFT', 'GEN', 'WYV', 'DAI', 'ADT', 'AION', 'ETH', 'COLO', 'ANT', 'BNFT', 'CLN', 'ZRX', 'KIWI', 'LPT', 'TRX', 'MANA', 'AVO']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.

In [20]:
# print(raw_training_data.description.apply(lambda x: len(x.split())))

In [21]:
raw_training_data['description_clean'] = raw_training_data.loc[:,'description'].apply(clean_text)
raw_training_data['data_categories_clean'] = raw_training_data.loc[:,'data_categories'].apply(lambda x: [_.strip().lower() for _ in x])
raw_training_data['title_clean'] = raw_training_data.loc[:,'title'].apply(clean_text)
raw_training_data['description_length'] = raw_training_data.loc[:,'description_clean'].apply(lambda x: len(x.split()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [22]:
# raw_training_data.description_length.value_counts()

In [23]:
print(len(raw_training_data))
# filter out bounties with no fulfillment
fulfilled_bounties_raw = raw_training_data[
    (raw_training_data.accepted == True) & 
    (raw_training_data.usd_price > 10)
]
# print(len(fulfilled_bounties))
print(len(fulfilled_bounties_raw))

955
435


In [24]:
categories_vocab = set()
for c in fulfilled_bounties_raw.data_categories_clean.values:
    [categories_vocab.add(_.strip()) for _ in c]
    
titles_vocab = set()
for c in fulfilled_bounties_raw.title_clean.values:
    [titles_vocab.add(_.strip()) for _ in c.split()]
    
description_vocab = set()
for c in fulfilled_bounties_raw.description_clean.values:
    [description_vocab.add(_.strip()) for _ in c.split()]

In [25]:
def tag_docs(docs, col):
    tagged = docs.apply(lambda r: TaggedDocument(words=simple_preprocess(r[col]), tags=[r.platform]), axis=1)
    return tagged

In [26]:
def train_doc2vec_model(tagged_docs, window, size):
    sents = tagged_docs.values
    doc2vec_model = Doc2Vec(sents, size=size, window=window, iter=20, dm=1)
    return doc2vec_model

In [27]:
tg_docs = tag_docs(fulfilled_bounties_raw, 'description_clean')

In [70]:
fulfilled_bounties_raw.head()
#titles_vocab

Unnamed: 0,id,days_to_deadline,description,data_categories,experience_level,bounty_type,platform,accepted,usd_price,title,tokenSymbol,platform_enc,bounty_type_enc,experience_level_enc,token_enc,description_clean,data_categories_clean,title_clean,description_length
1,656,0.0,### We need a Travis-CI / Docker expert to hel...,"[MARKET.js, MARKETProtocol, TypeScript, Jav...",Advanced,Feature,gitcoin,True,160.857,[Travis-CI] Set up travis and testing w/ truff...,ETH,3,5,4,7,need travis ci docker expert help one please ...,"[market.js, marketprotocol, typescript, javasc...",travis ci set travis testing w truffle ethere...,178
3,339,0.0,<!--\r\nHello!\r\n\r\nPlease use the template ...,"[website, MARKETProtocol, TypeScript, JavaS...",Beginner,Feature,gitcoin,True,16.969,[Layout] Overflow issue on macOS latest Chrome...,ETH,3,5,0,7,\nhello \n \nplease use template issue ideas ...,"[website, marketprotocol, typescript, javascri...",layout overflow issue macos latest chrome fir...,202
4,179,29.999572,"## Description\r\nCurrently, we have a notific...","[Python, Django, API, Backend, BountiesAPI...",Intermediate,Feature,gitcoin,True,85.4743,Build Slack Client Managing Slack Notifications,ETH,3,5,2,7,description \ncurrently notification channel ...,"[python, django, api, backend, bountiesapi, bo...",build slack client managing slack notifications,241
5,345,0.0,I found a curious result when upgrading Truffl...,"[solidity, ethereum, C++, Solidity, Shell,...",,,gitcoin,True,95.99,Optimizer seems to produce larger bytecode whe...,DAI,3,1,1,4,found curious result upgrading truffle solidit...,"[solidity, ethereum, c++, solidity, shell, cma...",optimizer seems produce larger bytecode run lo...,228
9,348,0.0,### Why Is this Needed?\r\n*Summary*: We have ...,"[website, MARKETProtocol, TypeScript, JavaS...",Intermediate,Feature,gitcoin,True,46.839,[Team / Advisors] Add new advisors to /team page.,ETH,3,5,2,7,needed \n summary added new advisors project ...,"[website, marketprotocol, typescript, javascri...",team advisors add new advisors team page,93


In [29]:
def vec_for_learning(doc2vec_model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], doc2vec_model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [30]:
description_trained_model = train_doc2vec_model(tg_docs, 300, 300)



In [31]:
description_feature_vectors = vec_for_learning(description_trained_model, tg_docs)

In [32]:
categories_w2vmodel = gensim.models.Word2Vec([x for x in fulfilled_bounties_raw.data_categories_clean], size=100, 
                                             min_count=3, workers=4)
titles_w2vmodel = gensim.models.Word2Vec([x.split() for x in fulfilled_bounties_raw.title_clean], size=350, 
                                             min_count=3, workers=4)
description_w2vmodel = gensim.models.Word2Vec([x.split() for x in fulfilled_bounties_raw.description_clean], size=350, 
                                             min_count=3, workers=4)

In [33]:
# prints the top 10 most common words
print(categories_w2vmodel.wv.index2word[0:10])
# and then the bottom 10
print(categories_w2vmodel.wv.index2word[-30:])

['javascript', 'css', 'html', 'python', 'other', 'gitcoinco', 'shell', 'web', 'marketprotocol', 'typescript']
['django', 'codesponsor', 'assembly', 'solium', 'parity', 'paritytech', 'abis', 'trust-wallet-ios', 'uport-project', 'ethereumbook', 'remix-ide', 'augur-core', 'bigchaindb', 'blockchain', 'decentralization', 'backend', 'duaraghav8', 'frontend', 'market', 'truffle', '', 'casper', 'uport', 'translation', 'content creation', 'java', 'zos-cli', 'zeppelinos', 'docker', 'databrokerdao']


In [36]:
# prints the top 10 most common words
print(titles_w2vmodel.wv.index2word[0:10])
# and then the bottom 10
print(titles_w2vmodel.wv.index2word[-30:])

['page', 'gitcoin', 'create', 'add', 'design', 'user', 'contract', 'code', 'want', 'test']
['deposit', 'copy', 'move', 'know', 'getting', 'started', 'around', 'hard', 'ensure', 'date', 'beta', 'suite', 'guidelines', 'addresses', 'accounts', 'creating', 'building', 'position', 'redux', 'cli', 'logic', 'mock', 'links', 'understand', 'stop', 'document', 'simplified', 'glossary', 'terms', 'existing']


In [35]:
# prints the top 10 most common words
print(description_w2vmodel.wv.index2word[0:10])
# and then the bottom 10
print(description_w2vmodel.wv.index2word[-30:])

['work', 'please', 'issue', 'comment', 'user', 'bounty', 'bounties', 'would', 'like', 'gitcoin']
['aruba', 'minimum', 'debugging', 'craftabletoken', 'xcf', 'fb', 'dfd', 'rainbow', 'emj', 'rnbw', 'serpent', 'navbar', 'canvaselem', 'coala', 'hiding', 'documented', 'presskit', 'intake', 'crowdfunding', 'errorevent', 'expose', 'consolidate', 'talking', 'coming', 'convenience', 'callerallocationstartblock', 'dropped', 'soft', 'year', 'registered']


In [39]:
# most similar words fun
categories_w2vmodel.most_similar(positive=['javascript'], topn=5)

  


[('solidity', 0.526717483997345),
 ('typescript', 0.5257327556610107),
 ('ethereum', 0.5242995023727417),
 ('react', 0.47901347279548645),
 ('marketprotocol', 0.47395414113998413)]

In [40]:
titles_w2vmodel.most_similar(positive=['contract'], topn=5)

  """Entry point for launching an IPython kernel.


[('member', 0.13865996897220612),
 ('medium', 0.1322459876537323),
 ('translation', 0.12758858501911163),
 ('brand', 0.1251121312379837),
 ('move', 0.11978220194578171)]

In [65]:
# take a look at Keyword vectors
print(len(categories_w2vmodel.wv.vocab))
print(len(titles_w2vmodel.wv.vocab))
print(len(description_w2vmodel.wv.vocab))

86
227
2128


In [42]:
# averaging two feature vectors:
js = categories_w2vmodel.wv["javascript"]
css = categories_w2vmodel.wv["css"]
res = (js + css)/2

In [43]:
def average_categories_w2v_array(words):
#     print("Getting average w2v vector on %s words" % len(words))
    try:
        running_total = categories_w2vmodel.wv[words[0]]
    except (KeyError, IndexError):
        return np.zeros((100,))
    for w in words[1:]:
        try:
            this_w = categories_w2vmodel.wv[w]
        except KeyError:
            continue
        running_total = (running_total + this_w)/2
    return running_total

def average_titles_w2v_array(words):
    s = words.split()
    try:
        running_total = titles_w2vmodel.wv[s[0]]
    except (KeyError, IndexError):
        return np.zeros((350,))
    for w in s[1:]:
        try:
            this_w = titles_w2vmodel.wv[w]
        except KeyError:
            continue
        running_total = (running_total + this_w)/2
    return running_total

def average_description_w2v_array(words):
    s = words.split()
    try:
        running_total = description_w2vmodel.wv[s[0]]
    except (KeyError, IndexError):
        return np.zeros((350,))
    for w in s[1:]:
        try:
            this_w = description_w2vmodel.wv[w]
        except KeyError:
            continue
        running_total = (running_total + this_w)/2
    return running_total

def get_one_hot(targets, nb_classes):
    res = np.eye(nb_classes)[np.array(targets).reshape(-1)]
    return res.reshape(list(targets.shape)+[nb_classes])

In [44]:
X = np.empty((0, 1138))
Y = np.empty((0, 1))
#generating the actual feature matrices
counter = 0
for row in fulfilled_bounties_raw.iterrows():
    new_row = description_feature_vectors[1][counter]
#     new_row = np.empty((0,0))
    counter += 1
    stuff = row[1][['days_to_deadline','description_length']]
    new_row = np.concatenate((new_row, stuff.values), axis = None)
    el_one_hot = get_one_hot(np.array(row[1]['experience_level_enc']), len(experience_levels))
    bt_one_hot = get_one_hot(np.array(row[1]['bounty_type_enc']), len(bounty_types))
    pl_one_hot = get_one_hot(np.array(row[1]['platform_enc']), len(platforms))
    ts_one_hot = get_one_hot(np.array(row[1]['token_enc']), len(tokens))
    new_row = np.concatenate((new_row, el_one_hot, bt_one_hot, pl_one_hot, ts_one_hot), axis = None)
    new_row = np.concatenate((new_row, average_categories_w2v_array(row[1].data_categories_clean)), axis = None)
    new_row = np.concatenate((new_row, average_titles_w2v_array(row[1].title_clean)), axis = None)
    new_row = np.concatenate((new_row, average_description_w2v_array(row[1].description_clean)), axis = None)
    new_row.shape = (1,1138)
    new_Y = np.array([row[1].usd_price])
    new_Y.shape = (1,1)
    X = np.append(X, new_row, axis = 0)
    Y = np.append(Y, new_Y, axis = 0)

In [45]:
# for doc in raw_training_data.description_clean.values:
#     print(doc)
#     print("===")

In [64]:
# data is looking good! Let's convert split into test/train 
print(X.shape)
print(X[0])

(435, 1138)
[-0.1228170171380043 -0.11847779154777527 -0.22355018556118011
 -0.20592086017131805 -0.08221616595983505 0.1582321673631668
 -0.13416950404644012 -0.05251750722527504 0.06564129143953323
 -0.18456360697746277 0.20086140930652618 -0.2309207171201706
 -0.18878433108329773 -0.17173802852630615 -0.05997597053647041
 0.17925330996513367 -0.22005946934223175 0.08375915139913559
 -0.04582930728793144 0.025870664045214653 0.07130413502454758
 0.040442582219839096 -0.09598144143819809 -0.04740315303206444
 0.010337989777326584 -0.08824590593576431 0.05427797883749008
 -0.06480484455823898 0.0778065100312233 0.06687324494123459
 -0.010444937273859978 -0.05579565092921257 -0.03883872553706169
 -0.027383137494325638 0.08277624100446701 -0.16991154849529266
 0.16189944744110107 0.09718974679708481 -0.3596910536289215
 -0.06174356862902641 -0.0423104502260685 -0.13040348887443542
 0.09067187458276749 -0.10460947453975677 0.1353980302810669
 -0.020712224766612053 -0.018094651401042938 -0

In [47]:
test_size = 0.2 # retain 20% of the data for testing the model quality 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=int(time.time()))

In [48]:
print(X_train.shape)
print(X_test.shape)

(348, 1138)
(87, 1138)


In [49]:
# first, let's try XGB
# fit model no training data
model = XGBRegressor(max_depth=6, 
                     learning_rate = 0.1, 
                     n_estimators=35,
                     nthreads=6, 
                     subsample=0.95,
#                      colsample_bytree=0.95,
#                      colsample_bylevel=0.95,
                     reg_lambda = 0.1,
                     objective="reg:linear")
model.fit(X_train, Y_train)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = mean_squared_error(Y_test, predictions)
print("MSE: %.2f" % (accuracy))

MSE: 2083408.63


In [101]:
from sklearn.model_selection import GridSearchCV
cv_params = {
    'max_depth':range(5,10),
    'learning_rate':[0.1,0.05,0.01], # , 0.05, 0.01
    'n_estimators':[10,15,20,25,30,35,40], #,60,70,80,90,100,150,250,500,750,1000],
#     'subsample': [x / 100.0 for x in range(5, 100, 5)],
#     'reg_lambda': [x / 100.0 for x in range(5, 100, 5)],
#     'reg_alpha': [x / 100.0 for x in range(5, 100, 5)]
    'subsample':[0.1, 0.95, 1],
    'reg_lambda':[0.1, 0.95, 1],
    'reg_alpha':[0.1, 0.95, 1]
}
constant_params = {
    'objective':"reg:linear"    
}

optimized_GBM = GridSearchCV(XGBRegressor(**constant_params), 
                            cv_params, 
                             scoring = 'neg_mean_squared_error', cv = 5, n_jobs = -1)
optimized_GBM.fit(X, Y)

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': range(5, 10), 'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 15, 20, 25, 30, 35, 40], 'subsample': [0.1, 0.95, 1], 'reg_lambda': [0.1, 0.95, 1], 'reg_alpha': [0.1, 0.95, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [98]:
# first one
optimized_GBM.best_score_, optimized_GBM.best_estimator_, optimized_GBM.best_params_

(-279813.86841828667,
 XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
        max_depth=8, min_child_weight=1, missing=None, n_estimators=15,
        n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
        reg_alpha=0.95, reg_lambda=0.95, scale_pos_weight=1, seed=None,
        silent=True, subsample=0.1),
 {'learning_rate': 0.1,
  'max_depth': 8,
  'n_estimators': 15,
  'reg_alpha': 0.95,
  'reg_lambda': 0.95,
  'subsample': 0.1})

In [100]:
# second one
optimized_GBM.best_score_, optimized_GBM.best_estimator_, optimized_GBM.best_params_

(-332754.96577928879,
 XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
        max_depth=5, min_child_weight=1, missing=None, n_estimators=10,
        n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
        reg_alpha=1, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=True, subsample=0.95),
 {'learning_rate': 0.1,
  'max_depth': 5,
  'n_estimators': 10,
  'reg_alpha': 1,
  'reg_lambda': 1,
  'subsample': 0.95})

In [102]:
# third one
optimized_GBM.best_score_, optimized_GBM.best_estimator_, optimized_GBM.best_params_

(-275517.76728744293,
 XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
        max_depth=5, min_child_weight=1, missing=None, n_estimators=35,
        n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
        reg_alpha=0.1, reg_lambda=0.95, scale_pos_weight=1, seed=None,
        silent=True, subsample=0.1),
 {'learning_rate': 0.05,
  'max_depth': 5,
  'n_estimators': 35,
  'reg_alpha': 0.1,
  'reg_lambda': 0.95,
  'subsample': 0.1})

In [50]:
# 168,034.5047 # all data, no tweaks
# 166,242.61 # all features, n_estimators= 100, objective=reg:linear, learning rate = 0.1, max_Depth = 10
# 162,728.84 # all features, learning rate 0.01
# 45,333.62 # No description, all other features
# 160,996.68 # no other features, only description + categories
# 28,957.50 # No descriptions, but titles instead, all other features, same settings as above
# 56752.40 # titles only, no categories, no descriptions, but all other features, same settings as above
# 20467.29 # 300 estimators, ALL data 
# 1,082,565.00 # 1000 estimators, learning rate 0.1, ALL data, XGBRegressor
# 1,115,145.64 # 1000 estimators, learning rate 0.1, ALL data
# # 500 estimators, all data, learning rate 0.1
# 175,561.24 # 500 estimator, XGBregressor, LR 0.01, 500 estimators, all data, depth 25
# 147,259.43 # 300 estimators, XGBRegressor, LR 0.01 all data, depth 25
# 119,705.72 # 200 estimators, XGBRegressor, LR 0.01 all data, depth 25
# 70187.55 # 100 estimators, XGBRegressor, LR 0.01 all data, depth 200
# 26543.79 # 10 estimators, XGBRegressor, LR 0.01 all data, depth 200
# 42073.34 # 50 estimators, XGBRegressor, LR 0.01 all data, depth 200
# 149,431.26 # 400 estimators, XGBClassifier, LR 0.01 all data, depth 200, no URLs
# 154,883.44 # 300 estimators, LR 0.1, max_depth 10, all data, no URLs
# 155,146.20 # 50 estimators, LR 0.1, max_depth 10, all data, no URLs
# 132,117.82 # Regressor, 50 estimators, LR 0.1, max_depth 10, all data, no URLs
# 131,624.80 # Regressor, 100 estimators, LR 0.1, max_depth 10, all data, no URLs
# 11,957.48 # Regressor, 100 estimators, LR 0.1, max_depth 10, all data, no URLs, nan for days
# 71,874.78 # Regressor, 100 estimators, LR 0.1, max_depth 10, all data, no URLs, nan for days, subsample 0.8 everyhwere
#  73305.78 # Regressor, 300 estimators, LR 0.1, max_depth 10, all data, no URLs, nan for days, subsample 0.8 everyhwere
#  73305.78 # Regressor, 600 estimators, LR 0.1, max_depth 10, all data, no URLs, nan for days, subsample 0.8 everyhwere
#  34130.07 # Regressor, 20 estimators, LR 0.1, max_depth 10, all data, no URLs, nan for days, subsample 0.8 everyhwere
#  23789.68 # Regressor, 10 estimators, LR 0.1, max_depth 10, all data, no URLs, nan for days, subsample 0.8 everyhwere
#  38841.94 # Regressor, 10 estimators, LR 0.1, max_depth 10, all data, no URLs, nan for days, subsample 0.8 everyhwere, L2 regularization 0.2
# 15,695.23 # Regressor, 10 estimators, LR 0.1, max_depth 10, all data, no URLs, nan for days, L2 regularization 0.2
# 12054.98 # Regressor, 50 estimators, LR 0.1, max_depth 10, all data, no URLs, nan for days, L2 regularization 0.2
# 12098.32 # Regressor, 100 estimators, LR 0.1, max_depth 10, all data, no URLs, nan for days, L2 regularization 0.2
# 11972.58 # Regressor, 35 estimators, LR 0.1, max_depth 9, all data, no URLs, nan for days, L2 regularization 0.2
# 17985.67 # Regressor, 35 estimators, LR 0.1, max_depth 9, all data, no URLs, 0 for days, L2 regularization 0.2
# 16395.58 # Regressor, 35 estimators, LR 0.1, max_depth 10, all data, no URLs, 0 for days, L2 regularization 0.2
# 1776342.76 # Regressor, 35 estimators, LR 0.1, max_depth 10, all data, no URLs, 0 for days, L2 regularization 0.2, min 50 words
# 1243293.09 # Regressor, 35 estimators, LR 0.1, max_depth 10, all data, no URLs, 0 for days, L2 regularization 0.2, min 10 words
# 1230813.40 # Regressor, 35 estimators, LR 0.1, max_depth 10, all data, no URLs, 0 for days, L2 regularization 0.2, min 1 words
# 38424.45 # # Regressor, 35 estimators, LR 0.1, max_depth 10, all data, no URLs, 0 for days, L2 regularization 0.2, description length
# 138534.83 # Regressor, 35 estimators, LR 0.1, max_depth 10, all data, no URLs, 0 for days, L2 regularization 0.2, description length, w2v on descriptions
# 22349.71 # Regressor, 35 estimators, LR 0.1, max_depth 10, all data, no URLs, 0 for days, L2 regularization 0.2, description length, w2v and doc2vec on descriptions
# 37264.06 # same as above, but with token symbol

In [80]:
np.set_printoptions(threshold=np.nan)
derp = y_pred.astype(int).reshape(Y_test.shape)
derp2 = np.concatenate((derp,Y_test), axis=1)
print(derp2[:5])

[[-247.       182.1072 ]
 [ 203.        92.667  ]
 [ 181.        11.4338 ]
 [ 464.       139.10107]
 [   7.       153.56   ]]


In [52]:
# Random Forest
# fit model no training data
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(
    n_jobs = 3,
    n_estimators = 400
)
model.fit(X_train, Y_train)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = mean_squared_error(Y_test, predictions)
print("MSE: %.2f" % (accuracy))

  


MSE: 208707.85


In [53]:
# 116343.36 # Defaults, all data, no URLs, 0 for days
# 78561.29 # 35 estimators, all data, no URLs, 0 for days
# 64177.63 # 300 estimators, all data, no URLs, 0 for days
# 81268.38 # 500 estimators, all data, no URLs, 0 for days
# 60672.96 # 400 estimators, all data, no URLs, 0 for days
# 1119551.45 # 400 estimators, all data, no URLs, 0 for days, min 1 words
# 1117895.17 # 500 estimators, all data, no URLs, 0 for days, min 1 words
# 154576.73 # 500 estimators, all data, no URLs, 0 for days, description length feature
# 154576.73 # 400 estimators, all data, no URLs, 0 for days, description length feature
# 57271.13 # 400 estimators, all data, no URLs, 0 for days, description length feature, both doc2vec and w2v on description
# 83522.60 # same as above, but with token symbol added

In [79]:
np.set_printoptions(threshold=np.nan)
derp = y_pred.astype(int).reshape(Y_test.shape)
derp2 = np.concatenate((derp,Y_test), axis=1)
print(derp2[:5])

[[-247.       182.1072 ]
 [ 203.        92.667  ]
 [ 181.        11.4338 ]
 [ 464.       139.10107]
 [   7.       153.56   ]]


In [78]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
# predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = mean_squared_error(Y_test, y_pred)
print("MSE: %.2f" % (accuracy))

MSE: 314852.41


In [56]:
# 275,618.60 # LR defaults,  all data, no URLs, 0 for days,
# 1,563,278.20 # LR defaults,  all data, no URLs, 0 for days, min 1 words
# 249669.04 # LR defaults, all data, no urls, 0 for days, word length feature, d2v and w2v on descriptions
# 180143.59 # same as above, but with token symbol added

In [76]:
from sklearn.linear_model import Ridge, RidgeCV
model = RidgeCV(cv=5)
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
# predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = mean_squared_error(Y_test, y_pred)
print("MSE: %.2f" % (accuracy))

MSE: 176667.52


In [58]:
# 44317.58 # Ridge defaults, all data, no urls, 0 for days, word length feature, d2v and w2v on descriptions
# 69031.11 # same as above, with tokensymbol

In [75]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
model = LassoCV(cv=5)
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
# predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = mean_squared_error(Y_test, y_pred)
print("MSE: %.2f" % (accuracy))

  y = column_or_1d(y, warn=True)


MSE: 156791.91


In [60]:
# 36993.80 # Lasso defaults, all data, no urls, 0 for days, word length feature, d2v and w2v on descriptions
# 64628.29 # same but with tokensymbol

In [103]:
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV

model = ElasticNetCV(cv=5) # cross validating Elastic Net for optimal parameter search
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
print(model.alpha_)
# predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = mean_squared_error(Y_test, y_pred)
print("MSE: %.2f" % (accuracy))

  y = column_or_1d(y, warn=True)


3018.55966106
MSE: 156833.92


In [62]:
# 24858.35 # Elastic net defaults, all data, no urls, 0 for days, word length feature, d2v and w2v on descriptions
# 24530.13 # same, with tokensymbol

In [71]:
np.set_printoptions(threshold=np.nan)
derp = y_pred.astype(int).reshape(Y_test.shape)
derp2 = np.concatenate((derp,Y_test), axis=1)
print(derp2[:5])

[[ 193.       182.1072 ]
 [ 189.        92.667  ]
 [ 147.        11.4338 ]
 [ 192.       139.10107]
 [  93.       153.56   ]]


In [542]:
categories_w2vmodel.save("/Users/mike/src/notebooks/categories_w2v.pkl")

In [544]:
titles_w2vmodel.save("/Users/mike/src/notebooks/titles_w2v.pkl")

In [545]:
description_w2vmodel.save("/Users/mike/src/notebooks/description_w2v.pkl")

In [546]:
description_trained_model.save("/Users/mike/src/notebooks/description_d2v.pkl")

In [None]:
gensim