In [1]:
###: Importing H2O 

In [2]:
import h2o

In [3]:
###: Initalizing H2O cluster

In [4]:
h2o.init()

ERROR:h2o:Key init.version_check is not a valid config key


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,2 days 10 hours 17 mins
H2O cluster version:,3.14.0.7
H2O cluster version age:,1 month and 6 days
H2O cluster name:,H2O_started_from_R_avkashchauhan_ppg732
H2O cluster total nodes:,1
H2O cluster free memory:,2.974 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


In [5]:
###: Importing craigslist jobs title and description dataset into H2O cluster memory

In [6]:
jobs_data= h2o.import_file(col_names=['category', 'jobtitle'], header=1, col_types = ["enum", "string"],
                        path="https://raw.githubusercontent.com/h2oai/sparkling-water/rel-1.6/examples/smalldata/craigslistJobTitles.csv")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [7]:
###: Understanding the dataset

In [8]:
jobs_data.summary()

Unnamed: 0,category,jobtitle
type,enum,string
mins,,
mean,,
maxs,,
sigma,,
zeros,,0
missing,0,0
0,education,After School Supervisor
1,education,"*****TUTORS NEEDED - FOR ALL SUBJECTS, ALL AGES*****"
2,education,Bay Area Family Recruiter


In [9]:
###: Definition of stop words which will be removed from the list of words in out job details

In [10]:
STOP_WORDS = ["ax","i","you","edu","s","t","m","subject","can","lines","re","what",
               "there","all","we","one","the","a","an","of","or","in","for","by","on",
               "but","is","in","a","not","with","as","was","if","they","are","this","and","it","have",
               "from","at","my","be","by","not","that","to","from","com","org","like","likes","so"]

In [11]:
###: Local Tokenize function to tokenize all the words in a list

In [12]:
def tokenize(sentences, stop_word = STOP_WORDS):
    tokenized = sentences.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]",invert=True,output_logical=True),:]
    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(STOP_WORDS)),:]
    return tokenized_words

In [13]:
###: Calling the tokenize function to tokenize all the words in the job details 

In [14]:
##words_token = local_tokenize(all_sentences)
words_token = tokenize(jobs_data['jobtitle'])

In [15]:
###: Getting total number of tokens

In [16]:
print(len(words_token))

78924


In [17]:
###: The tokens type

In [18]:
type(words_token)

h2o.frame.H2OFrame

In [19]:
###: Understanding Words Token Data as H2O Dataframe

In [20]:
words_token.describe()

Rows:78924
Cols:1




Unnamed: 0,C1
type,string
mins,
mean,
maxs,
sigma,
zeros,0
missing,13845
0,after
1,school
2,supervisor


In [21]:
###: Importing H2O Word2VEC Estimator

In [22]:
from h2o.estimators.word2vec import H2OWord2vecEstimator

In [23]:
###: Setting Word2Vec estimator configuration

In [24]:
h2o_w2v_model =  H2OWord2vecEstimator(sent_sample_rate = .9, epochs = 10, model_id = "h2o_w2v_model_python")

In [25]:
###: Training Word2Vec Model

In [26]:
h2o_w2v_model.train(training_frame = words_token)

word2vec Model Build progress: |██████████████████████████████████████████| 100%


In [27]:
###: Understanding Word2Vec Model

In [28]:
h2o_w2v_model

Model Details
H2OWord2vecEstimator :  Word2Vec
Model Key:  h2o_w2v_model_python
No model summary for this model





In [29]:
###: Testing simlar word Experiment from the model with the word "money"

In [30]:
h2o_w2v_model.find_synonyms( "money", count = 5)

OrderedDict([(u'rides', 0.7279952764511108),
             (u'fares', 0.7047621011734009),
             (u'more', 0.6669892072677612),
             (u'enjoy', 0.5241783857345581),
             (u'judgment', 0.4911399185657501)])

In [31]:
###: Testing simlar word Experiment from the model with the word "account"

In [32]:
h2o_w2v_model.find_synonyms( "account", count = 5)

OrderedDict([(u'liaison', 0.5699155330657959),
             (u'rockstar', 0.5654222369194031),
             (u'dispatch', 0.565384566783905),
             (u'emergency', 0.5629708170890808),
             (u'relations', 0.5588938593864441)])

In [33]:
###: Calculating vector for every job title by passing original job title category data 

In [34]:
job_title_vecs = h2o_w2v_model.transform(words_token, aggregate_method = "AVERAGE")

In [35]:
job_title_vecs.shape

(13845, 100)

In [36]:
job_title_vecs

C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26,C27,C28,C29,C30,C31,C32,C33,C34,C35,C36,C37,C38,C39,C40,C41,C42,C43,C44,C45,C46,C47,C48,C49,C50,C51,C52,C53,C54,C55,C56,C57,C58,C59,C60,C61,C62,C63,C64,C65,C66,C67,C68,C69,C70,C71,C72,C73,C74,C75,C76,C77,C78,C79,C80,C81,C82,C83,C84,C85,C86,C87,C88,C89,C90,C91,C92,C93,C94,C95,C96,C97,C98,C99,C100
0.0179551,-0.00797802,0.0815321,0.349933,-0.0178768,0.0500604,0.318329,-0.159813,0.277998,-0.219391,0.192949,0.297739,0.0722817,-0.0560711,0.153139,-0.112632,0.192903,0.253662,0.103432,0.212026,-0.175511,0.192271,-0.172411,0.0183183,0.186244,-0.29612,0.307857,0.0347665,0.110348,-0.0591083,-0.296392,0.309013,-0.290719,-0.146808,-0.0632934,-0.19099,0.126214,-0.0523888,0.0522938,0.0643614,0.0384917,0.34891,-0.134788,-0.0729285,-0.314501,-0.0133686,-0.0478583,-0.20319,-0.413155,0.258811,-0.0735133,-0.0568653,-0.241278,0.314083,0.328764,-0.223345,0.257641,-0.247055,-0.0732505,-0.217212,0.496074,0.0379671,-0.0622384,0.150624,0.40383,-0.212764,0.00386726,-0.226638,0.175959,0.409737,0.0130717,0.105451,0.375842,0.078988,-0.0604978,0.0613854,0.285663,-0.21703,0.317409,0.257375,-0.158201,0.256061,-0.19042,0.137072,-0.213676,0.184386,0.177599,-0.463842,-0.270951,-0.201231,-0.282052,0.0816493,0.305745,-0.164701,0.0476012,0.128511,-0.19867,0.270144,-0.158304,-0.191211
0.198496,0.245932,0.182466,0.590663,0.530774,-0.389897,-0.00173687,-0.272299,-0.0296782,-0.131025,-0.140057,0.300605,-0.562641,-0.139242,-0.0978478,0.195237,0.249979,-0.147094,-0.141088,0.0572662,-0.0697324,-0.414618,0.0451773,0.0123859,-0.136952,-0.0700904,0.250318,0.191079,0.180666,0.0583874,-0.376978,0.0461544,-0.699312,0.173983,0.0955267,0.0935854,0.167072,-0.101795,0.116582,0.291421,0.204701,0.126056,0.400053,-0.0490436,-0.174219,0.448838,0.261596,0.0429197,-0.551901,-0.326154,-0.106464,-0.144055,0.0895844,0.395096,-0.248494,-0.207155,-0.0530967,-0.396468,0.289681,-0.164359,0.462974,0.13167,-0.45906,-0.322112,-0.0669346,-0.0275884,-0.00550211,-0.00746805,-0.102739,0.505109,0.0656397,0.295207,0.541028,-0.552472,0.239445,0.204306,0.0301934,-0.456681,0.322662,-0.00547394,0.321132,-0.265258,-0.335424,0.366437,-0.0687196,0.143206,-0.299311,0.0606533,-0.096279,-0.22924,-0.0533951,-0.108387,0.248719,0.114425,0.670337,-0.462584,-0.108296,-0.0541149,0.533516,0.135194
-0.279512,-0.0311573,0.0299129,0.139332,-0.188786,-0.162513,0.138589,0.0292393,0.210391,-0.0854794,-0.151968,0.352379,-0.0874313,-0.238089,-0.0754317,0.280004,-0.135037,-0.0122419,0.224876,-0.101882,-0.184141,-0.187821,0.167805,-0.0573033,-0.0675215,-0.115679,-0.000559647,0.221218,-0.0624837,0.367284,-0.288968,0.0154089,-0.0160931,0.054836,-0.156388,0.0201588,-0.122521,-0.116863,-0.198888,0.210559,0.114788,0.0229043,0.0972436,-0.170387,0.0957396,0.154114,-0.00425321,0.0699815,-0.177799,-0.121043,0.132781,-0.182174,-0.139764,0.00117804,-0.0180625,-0.121897,-0.218152,-0.373024,0.370585,-0.0274652,-0.154936,0.248075,0.0508673,-0.114537,0.00839755,-0.0361308,0.180554,0.21724,-0.0615257,-0.19364,0.0877388,-0.164708,0.155988,-0.220329,0.168677,0.0529324,0.163873,-0.135629,0.214686,0.0698141,-0.0667781,0.076631,0.0362696,0.383407,-0.0370228,-0.0298729,0.116152,0.056563,-0.0529871,-0.101769,0.0833967,-0.0436793,-0.176527,0.140998,0.141348,0.127693,-0.0852296,0.216663,-0.0940938,-0.130074
-0.18584,0.0589173,-0.0209236,0.32223,-0.328547,-0.0292025,0.141208,-0.113125,0.42417,0.123439,0.00357873,0.0485908,-0.169337,-0.160606,-0.0947159,-0.00983132,-0.224349,0.105742,-0.2903,0.158163,0.173661,0.115932,0.0255834,-0.0720259,0.0558388,-0.126619,0.122635,0.0349655,-0.156523,0.0614013,-0.267123,0.0665683,-0.0753392,-0.187512,-0.210097,-0.236664,-0.0322936,0.0496863,0.169302,-0.0962571,0.232475,0.180473,0.0783525,0.0707957,-0.118951,0.122508,0.0507294,0.19402,-0.179745,0.0475672,-0.0552222,-0.142431,-0.0151779,-0.104452,-0.0661279,0.0224244,0.0312314,-0.375251,0.245219,-0.142491,0.0860295,0.0130573,0.165486,0.0456134,0.0480496,0.00788409,0.248232,0.0814388,0.118093,0.0560418,-0.0970279,0.232667,0.0923072,0.0270373,-0.0979416,-0.0652559,0.24502,-0.0761419,0.377078,0.0946181,0.154758,0.32353,-0.0764646,0.195065,-0.152544,0.162256,0.127673,-0.276349,-0.227517,-0.207073,-0.190367,0.217468,0.223551,0.11492,0.184903,-0.0880004,-0.219669,0.0458589,-0.110105,-0.102114
-0.277419,-0.188838,-0.120025,0.436367,-0.203508,-0.0127781,0.0372628,-0.0647706,-0.0353567,0.0639184,-0.0988843,0.105484,0.0708833,-0.0285482,-0.0324635,0.126827,-0.3059,0.222194,0.00157749,0.109845,-0.110494,0.326039,-0.0237437,-0.088954,0.0761097,0.00970035,0.0827787,-0.20139,-0.112345,-0.0860417,-0.420042,0.351167,0.253074,0.235557,-0.448511,-0.0476538,-0.273188,0.106895,-0.0351234,0.106686,0.306161,0.201133,0.0164421,-0.373106,-0.233002,-0.227771,-0.010907,-0.14317,-0.277223,0.0389231,0.142493,-0.36876,-0.023672,0.0699159,0.228254,-0.139418,0.183171,-0.41104,0.031216,-0.015824,0.130081,0.164182,0.0984419,-0.114311,-0.0103982,-0.0850711,0.0845731,0.29402,0.151418,0.193019,-0.180165,-0.0385596,0.293083,-0.287285,-0.0978478,-0.0506869,-0.00784257,-0.116928,0.0141002,-0.135356,-0.0679999,0.0573933,-0.00119584,-0.103944,-0.180585,-0.0567569,0.0101651,-0.035179,0.183764,-0.38203,-0.0687278,0.0302355,0.156293,0.214381,0.0213643,0.274187,-0.346777,0.152309,-0.165817,-0.21063
-0.0968212,0.294803,0.108441,0.24607,0.320892,-0.199162,0.0234883,-0.0825039,0.38038,-0.199475,0.0980913,0.0933711,-0.421758,-0.227256,-0.0483002,0.138216,-0.0442202,0.235719,0.0267283,0.223356,-0.147654,0.0544773,-0.05055,-0.196758,-0.00339076,-0.209793,0.161366,-0.041358,-0.0114659,0.00848143,-0.248748,-0.103031,-0.391771,-0.185089,-0.245407,-0.0849143,-0.178004,-0.0888006,-0.222098,-0.0222894,0.0506169,0.186504,0.133255,-0.0352811,-0.0442291,0.13697,0.121009,0.0356257,-0.412744,-0.108332,0.0520442,-0.190164,-0.0769488,0.0573796,-0.0659482,-0.160153,-0.0426574,-0.284972,0.293634,-0.166734,0.178156,0.140884,-0.395717,-0.121784,-0.027836,-0.157678,0.0819995,-0.0874873,-0.114391,0.138773,0.131223,0.304295,0.125378,-0.0588897,-0.0537653,0.24083,0.229498,-0.0479279,0.315786,-0.0639527,0.133163,0.0952082,-0.088019,0.27418,0.0653943,0.172416,0.011494,-0.075446,-0.203255,-0.162569,-0.0842942,0.157566,0.272782,0.148342,0.147276,-0.172726,-0.176766,0.0312758,-0.130015,-0.128402
-0.256394,0.187985,-0.152656,0.241329,0.0926101,0.0552791,-0.134375,-0.302937,0.238359,-0.224368,-0.078545,0.00287616,-0.211201,0.00483979,-0.0427184,0.0852081,-0.0259122,0.341753,0.175156,0.130609,-0.193145,0.0737868,0.129889,0.105964,0.22039,-0.396624,0.122866,-0.130831,-0.0147741,0.257713,-0.237331,-0.19485,-0.153565,-0.175024,-0.0845617,-0.0766987,-0.442221,-0.265412,-0.202735,-0.22755,-0.281462,0.130195,0.106494,0.0529705,0.0291641,0.330571,-0.226512,-0.18096,-0.225939,0.277189,0.415133,-0.427556,-0.31146,0.374276,-0.0605311,0.207564,0.139683,-0.109739,0.299998,0.279649,0.201389,0.296182,-0.262822,0.112562,0.216622,-0.188039,0.133852,0.0230622,0.145121,0.236297,0.0979985,-0.166176,0.105591,-0.300431,-0.10962,0.34708,0.255217,0.16908,0.480311,-0.123604,-0.368489,0.20797,0.249493,0.35134,0.12131,-0.0655702,0.158266,-0.165378,-0.0649255,-0.103817,-0.091383,-0.198516,0.198022,0.108721,0.0588654,0.198717,0.00447548,0.0587266,0.151778,0.0765489
0.198496,0.245932,0.182466,0.590663,0.530774,-0.389897,-0.00173687,-0.272299,-0.0296782,-0.131025,-0.140057,0.300605,-0.562641,-0.139242,-0.0978478,0.195237,0.249979,-0.147094,-0.141088,0.0572662,-0.0697324,-0.414618,0.0451773,0.0123859,-0.136952,-0.0700904,0.250318,0.191079,0.180666,0.0583874,-0.376978,0.0461544,-0.699312,0.173983,0.0955267,0.0935854,0.167072,-0.101795,0.116582,0.291421,0.204701,0.126056,0.400053,-0.0490436,-0.174219,0.448838,0.261596,0.0429197,-0.551901,-0.326154,-0.106464,-0.144055,0.0895844,0.395096,-0.248494,-0.207155,-0.0530967,-0.396468,0.289681,-0.164359,0.462974,0.13167,-0.45906,-0.322112,-0.0669346,-0.0275884,-0.00550211,-0.00746805,-0.102739,0.505109,0.0656397,0.295207,0.541028,-0.552472,0.239445,0.204306,0.0301934,-0.456681,0.322662,-0.00547394,0.321132,-0.265258,-0.335424,0.366437,-0.0687196,0.143206,-0.299311,0.0606533,-0.096279,-0.22924,-0.0533951,-0.108387,0.248719,0.114425,0.670337,-0.462584,-0.108296,-0.0541149,0.533516,0.135194
-0.255606,0.146987,0.0875801,0.174245,0.206106,-0.311875,0.182975,-0.0984452,0.332283,-0.159374,-0.183113,0.272893,-0.304552,-0.379293,-0.134616,0.293861,0.184369,0.121031,0.135055,-0.092751,-0.314573,-0.170899,0.289209,-0.0648469,-0.119034,-0.0990818,0.145423,0.153085,-0.0289218,0.262371,-0.26442,-0.097153,-0.296988,0.0390756,-0.0858309,0.113852,-0.178196,-0.0761537,-0.15718,0.15832,-0.0649857,0.201172,0.17167,-0.104951,0.0848249,0.333333,0.0707429,0.0203472,-0.46256,-0.139642,-0.0392542,-0.113784,-0.127358,0.345009,-0.181097,-0.220222,-0.11546,-0.233171,0.450943,-0.0394981,0.194819,0.290414,0.0225167,-0.322798,-0.041635,-0.0131174,0.165315,0.0839527,-0.00356718,0.09596,0.159932,0.158423,0.280063,-0.292947,-0.0253924,0.113567,0.23039,-0.0894585,0.298345,-0.0426372,0.0639109,0.0565473,-0.0204912,0.320419,0.18378,0.0823786,-0.001967,0.0307857,-0.0947001,-0.148871,-0.0619269,-0.0397579,0.189115,0.140936,0.389256,-0.037986,-0.11576,0.0440138,0.21192,-0.0729424
-0.227742,-0.0551087,0.166079,0.392908,0.0124986,-0.119152,0.357413,-0.350363,0.267591,-0.320641,0.0485993,0.319865,-0.132096,0.0206106,0.0810479,0.0391307,-0.208353,0.301979,0.044181,0.232225,-0.0484336,0.122579,-0.214312,0.025111,-0.0895493,-0.275654,0.352035,-0.102453,-0.0212924,-0.0648532,-0.345202,0.251677,-0.0843749,0.105292,-0.374871,-0.446384,0.238704,-0.188066,0.0425713,-0.124362,0.0604407,0.102252,-0.0620847,-0.071188,-0.0213661,0.166173,0.169893,-0.187628,-0.638503,0.19506,0.0866197,-0.0958519,-0.0439426,0.0900624,0.347886,-0.291771,0.181137,-0.343609,0.414301,0.116237,0.399837,0.0945345,0.0569895,0.0594818,0.314078,0.0183588,-0.0471618,0.216848,0.146062,-0.0876198,0.134697,0.227044,0.154305,-0.0832328,0.0689596,-0.0272637,0.0524815,-0.0668715,0.216497,0.277281,0.318314,0.415891,-0.303443,0.241663,0.0582422,0.257931,0.0709711,-0.0529635,-0.27733,-0.152813,-0.0290373,-0.0792733,0.243187,0.228959,0.415549,0.11022,-0.303159,0.272885,0.00821701,-0.146179




In [37]:
jobs_data.shape

(13845, 2)

In [38]:
jobs_data['category'].shape

(13845, 1)

In [39]:
###: Checking if any of the job title vectors are empty or NAs

In [40]:
valid_job_titles = ~ job_title_vecs["C1"].isna()
valid_job_titles

not(isNA(C1))
1
1
1
1
1
1
1
1
1
1




In [41]:
###: Combining categories with valid job data to generate the final data set which will be used to build H2O GBM Model

In [42]:
#final_words_data = jobs_data['category'].cbind(job_title_vecs)
final_words_data = jobs_data[valid_job_titles,:].cbind(job_title_vecs[valid_job_titles,:])

In [43]:
###: Creating training, validation & test dataset from the source dataset

In [44]:
train, valid, test = final_words_data.split_frame(ratios=[0.8, 0.1])

In [45]:
###: Setting the response column

In [46]:
response = "category"

In [47]:
###: Setting the response column as enum/categorical in all train, valid and test dataset

In [48]:
train[response] = train[response].asfactor()
valid[response] = valid[response].asfactor()
test[response] = test[response].asfactor()

In [49]:
###: Setting features columns list for training

In [50]:
features = train.col_names
features.remove(response)

In [51]:
###: Importing H2O GBM Estimator

In [52]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [53]:
###: Setting H2O GBM configuration

In [54]:
h2o_gbm = H2OGradientBoostingEstimator(nfolds = 5)

In [55]:
###: Traning the H2O GBM model based on jobs data

In [56]:
h2o_gbm.train(x = features, y = response, training_frame= train)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [57]:
###: H2O GBM Model performance

In [58]:
h2o_gbm.model_performance(xval=True)


ModelMetricsMultinomial: gbm
** Reported on cross-validation data. **

MSE: 0.182693711286
RMSE: 0.427426849046
LogLoss: 0.577215565131
Mean Per-Class Error: 0.204715957269
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7
accounting,administrative,customerservice,education,foodbeverage,labor,Error,Rate
1005.0,167.0,52.0,10.0,9.0,14.0,0.2004773,"252 / 1,257"
96.0,1568.0,202.0,45.0,31.0,40.0,0.2088799,"414 / 1,982"
38.0,202.0,1253.0,50.0,121.0,185.0,0.3223364,"596 / 1,849"
11.0,65.0,45.0,1797.0,18.0,15.0,0.0789339,"154 / 1,951"
6.0,55.0,119.0,18.0,1598.0,199.0,0.1989975,"397 / 1,995"
15.0,49.0,162.0,33.0,172.0,1540.0,0.2186707,"431 / 1,971"
1171.0,2106.0,1833.0,1953.0,1949.0,1993.0,0.2039073,"2,244 / 11,005"


Top-6 Hit Ratios: 


0,1
k,hit_ratio
1,0.7960927
2,0.9186733
3,0.9664698
4,0.9868242
5,0.9963652
6,0.9999999




In [59]:
###: Performing the prediction with the test dataset

In [60]:
pred_result = h2o_gbm.predict(test_data=test)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [61]:
###: Checking the prediction results

In [62]:
pred_result

predict,accounting,administrative,customerservice,education,foodbeverage,labor
education,0.0278986,0.124369,0.108486,0.662066,0.0291289,0.048052
education,0.00716205,0.00435998,0.00652709,0.971401,0.00413316,0.00641719
education,0.00201005,0.0112444,0.0104305,0.960422,0.0044671,0.0114257
education,0.00078977,0.00271799,0.00356573,0.989237,0.00164318,0.00204605
education,0.0127883,0.0398004,0.164989,0.647013,0.0677446,0.0676651
education,0.00138501,0.00279026,0.00863425,0.979759,0.00318674,0.00424458
education,0.00863156,0.017872,0.0480161,0.8976,0.0138661,0.0140138
education,0.00102662,0.00178066,0.00468632,0.98468,0.00400643,0.00381952
education,0.00143848,0.0041389,0.00559884,0.981511,0.00244877,0.00486394
administrative,0.100777,0.546673,0.143615,0.144304,0.0306575,0.0339735




In [63]:
###: H2O GBM Model performance on test data

In [64]:
h2o_gbm.model_performance(test_data=test)


ModelMetricsMultinomial: gbm
** Reported on test data. **

MSE: 0.175580515861
RMSE: 0.419023287969
LogLoss: 0.551114140641
Mean Per-Class Error: 0.199863730921
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7
accounting,administrative,customerservice,education,foodbeverage,labor,Error,Rate
126.0,20.0,8.0,2.0,2.0,1.0,0.2075472,33 / 159
16.0,194.0,32.0,8.0,2.0,6.0,0.2480620,64 / 258
4.0,27.0,147.0,4.0,16.0,21.0,0.3287671,72 / 219
2.0,4.0,6.0,233.0,4.0,2.0,0.0717131,18 / 251
0.0,6.0,14.0,4.0,226.0,24.0,0.1751825,48 / 274
2.0,6.0,19.0,2.0,16.0,223.0,0.1679104,45 / 268
150.0,257.0,226.0,253.0,266.0,277.0,0.1959412,"280 / 1,429"


Top-6 Hit Ratios: 


0,1
k,hit_ratio
1,0.8040588
2,0.9251225
3,0.9678097
4,0.9846047
5,0.9958013
6,1.0




In [65]:
###: Creating a local function to perform prediction by passing a custom job description

In [66]:
def predict_with_h2o_w2v(job_title,w2v, gbm):
    words = tokenize(h2o.H2OFrame(job_title).ascharacter())
    job_title_vec = w2v.transform(words, aggregate_method="AVERAGE")
    print(gbm.predict(test_data=job_title_vec))

In [67]:
###: Calling the local prediction function to predict category of given job details

In [68]:
predict_local = predict_with_h2o_w2v(["food kitchen sandwich"], h2o_w2v_model, h2o_gbm)
print(predict_local)

Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%


predict,accounting,administrative,customerservice,education,foodbeverage,labor
foodbeverage,0.000780452,0.001684,0.00915411,0.000943528,0.982074,0.00536387



None


In [69]:
predict_local = predict_with_h2o_w2v(["swimming pool digging in the backyard"], h2o_w2v_model, h2o_gbm)
print(predict_local)

Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%


predict,accounting,administrative,customerservice,education,foodbeverage,labor
labor,0.00116283,0.00235045,0.0210939,0.00188025,0.0093569,0.964156



None


In [71]:
predict_local = predict_with_h2o_w2v(["programming in Javascript and web", "personal assistance to the doctor"], h2o_w2v_model, h2o_gbm)
print(predict_local)

Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%


predict,accounting,administrative,customerservice,education,foodbeverage,labor
education,0.0592295,0.121967,0.157455,0.539883,0.0467595,0.0747068
administrative,0.0451113,0.835459,0.077937,0.00478149,0.0151103,0.0216009



None
