In [53]:
import json
import logging
from re import sub
from multiprocessing import cpu_count

import numpy as np
import pandas as pd

import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity

from ast import literal_eval

In [54]:
import nltk

# Import and download stopwords from NLTK.
nltk.download('stopwords')  # Download stopwords list.
stopwords = set(nltk.corpus.stopwords.words("english"))

[nltk_data] Downloading package stopwords to C:\Users\Shivaani
[nltk_data]     K\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [55]:
def preprocess(doc):
    # Tokenize, clean up input document string
#     doc = sub(r'<img[^<>]+(>|$)', " image_token ", doc)
#     doc = sub(r'<[^<>]+(>|$)', " ", doc)
#     doc = sub(r'\[img_assist[^]]*?\]', " ", doc)
#     doc = sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token ", doc)
    return [token for token in simple_preprocess(doc, min_len=0, max_len=float("inf")) if token not in stopwords]

In [56]:
import csv 

# Load test data
df = pd.read_csv('whitespce_based.csv')

titles = [item for item in df['company_name']]
#documents = [literal_eval(item) for item in df['tokenized text']]
desc = [item for item in df['cleaned_description']]
tokens = [preprocess(item) for item in df['cleaned_description']]
print(f'{len(tokens)} documents')

2996 documents


In [57]:
if 'glove' not in locals():  # only load if not already in memory
    glove = api.load("glove-wiki-gigaword-50")
    
similarity_index = WordEmbeddingSimilarityIndex(glove)

In [49]:
corpus = []
query_string = tokens[0]
documents = tokens[1:]

corpus = [document for document in documents]
query = query_string

In [51]:
documents


[['venture', 'capital'],
 ['live',
  'data',
  'platform',
  'ingest',
  'process',
  'respond',
  'operate',
  'live',
  'data',
  'system',
  'scale'],
 ['terawatts', 'clean', 'energy', 'software', 'company'],
 ['diq',
  'make',
  'beyond',
  'easy',
  'recruiter',
  'edit',
  'format',
  'get',
  'realtime',
  'feedback',
  'candidate',
  'submittals'],
 ['p', 'touch', 'medium', 'mobile', 'application', 'development', 'startup'],
 ['di',
  'data',
  'analytics',
  'platform',
  'architecture',
  'engineering',
  'construction',
  'industry'],
 ['abya',
  'developer',
  'provider',
  'technological',
  'component',
  'creation',
  'graphic',
  'virtualization',
  'cloudgaming',
  'solution'],
 ['accelitude',
  'llc',
  'company',
  'deal',
  'digital',
  'cloud',
  'service',
  'solution'],
 ['accelor',
  'provides',
  'hardware',
  'solution',
  'intended',
  'blockchain',
  'ai',
  'business'],
 ['accounting',
  'panel',
  'cloudbased',
  'accounting',
  'tool',
  'designed',
  'he

In [39]:
dictionary = Dictionary(corpus+[query])
tfidf = TfidfModel(dictionary=dictionary)


# Create the term similarity matrix. 
# The nonzero_limit enforces sparsity by limiting the number of non-zero terms in each column. 
# For my application, I got best results by removing the default value of 100
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

In [40]:
# Compute Soft Cosine Measure between the query and the documents.
query_tf = tfidf[dictionary.doc2bow(query)]

index = SoftCosineSimilarity(
            tfidf[[dictionary.doc2bow(document) for document in corpus]],
            similarity_matrix)

doc_similarity_scores = index[query_tf]


In [41]:
# Output the similarity scores for top 15 documents
sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
print(f'Query: \t {0} \t {titles[0]} \t {desc[0]}')
for idx in sorted_indexes[:15]:
    print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {titles[idx]} \t {desc[idx]}')

Query: 	 0 	 1 Accord Technologies 	 1 accord technology providing professional support business strive give client enterpriselevel service
2050 	 0.545 	 Double Loop Games 	 double loop game mobile studio dedicated making delightful relaxing experience biggest audience game
2334 	 0.518 	 ninecoves 	 strive help business get reliable timely insight make informationdriven decision develop datadriven culture
422 	 0.510 	 Gatherade 	 gatherade mobile application local social networking organizational tool
438 	 0.506 	 Groopit 	 groopit collaboration platform collect data cross functional remote decentralized team accelerate collaboration
1937 	 0.501 	 BrewStubs 	 brewstubs™ enables craft brewery event organizer attract engage build ongoing relationship beer lover
2444 	 0.493 	 3rdDegree App 	 building connection one conversation time
734 	 0.477 	 Surefyre 	 bringing efficiency modern consumerstyle insuretech massive market insurer currently access
1537 	 0.476 	 Gitcoin 	 gitcoin pl

In [58]:
#considering each row as a query
for i in range(0,len(tokens)):
    corpus = []
    query_string = tokens[i]
    documents = tokens[0:i]+tokens[i+1:]
    

#     corpus = [document for document in documents]
    corpus = documents
    query = query_string
    
    dictionary = Dictionary(corpus+[query])
    tfidf = TfidfModel(dictionary=dictionary)
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)
    
    query_tf = tfidf[dictionary.doc2bow(query)]

    index = SoftCosineSimilarity(
            tfidf[[dictionary.doc2bow(document) for document in corpus]],
            similarity_matrix)
    doc_similarity_scores = index[query_tf]
    
    sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
    print(f'Query: \t {0} \t {titles[i]} \t {desc[i]}')
    for idx in sorted_indexes[:15]:
        print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {titles[idx]} \t {desc[idx]}')

Query: 	 0 	 1 Accord Technologies 	 1 accord technology providing professional support business strive give client enterpriselevel service
2050 	 0.545 	 Double Loop Games 	 double loop game mobile studio dedicated making delightful relaxing experience biggest audience game
2334 	 0.518 	 ninecoves 	 strive help business get reliable timely insight make informationdriven decision develop datadriven culture
422 	 0.510 	 Gatherade 	 gatherade mobile application local social networking organizational tool
438 	 0.506 	 Groopit 	 groopit collaboration platform collect data cross functional remote decentralized team accelerate collaboration
1937 	 0.501 	 BrewStubs 	 brewstubs™ enables craft brewery event organizer attract engage build ongoing relationship beer lover
2444 	 0.493 	 3rdDegree App 	 building connection one conversation time
734 	 0.477 	 Surefyre 	 bringing efficiency modern consumerstyle insuretech massive market insurer currently access
1537 	 0.476 	 Gitcoin 	 gitcoin pl

Query: 	 0 	 3P Touch Media 	 3p touch medium mobile application development startup
2482 	 0.708 	 Launch House 	 launch house community medium company top entrepreneur
2699 	 0.672 	 Paydrop 	 payment infrastructure gigeconomy future work
1448 	 0.651 	 Hexabu 	 hexabu software company develops salesforce appexchange apps event monitoring
2607 	 0.634 	 Sillo 	 sillo mobile application help student build confidence initial anonymous conversation
2783 	 0.634 	 Ryan Paonessa Design 	 ryan paonessa design provides web brand identity design development consulting
105 	 0.627 	 Bema Creative 	 bema creative delivers branding website creation animation logo design trade show social medium campaign service
50 	 0.605 	 Archeads 	 archeads real estate property management firm dealing floorplans service
2513 	 0.604 	 Monumental 	 monumental specializes development mobile game
873 	 0.603 	 Scire Strategies 	 scire strategy consultancy specialzies project management strategic planning perfor

Query: 	 0 	 ACE IoT Solutions 	 ace iot solution offer secure managed platform collecting aggregating visualizing facility operation
1235 	 0.452 	 Magna Marketing 	 magna marketing digital transformation consulting firm specializes implementation advisory service managed service
788 	 0.444 	 Speakeasy AI 	 speakeasy ai make easier business understand respond customers’ need voice ai
2223 	 0.426 	 Iteright 	 iteright help software product team create solution customer need guided datadriven validation activity
829 	 0.415 	 SimpleForms 	 simpleforms demystifies mandatory employment document turning question making easier exchange store understand
1270 	 0.414 	 Logicluster 	 logicluster aiml currency pricing software remittance payment industry
435 	 0.413 	 Givingli 	 mobile gifting modern consumer send digital greeting designed independent artist along egifts favorite brand
880 	 0.413 	 Santa.com 	 santacom  official site santa oh yes im real
785 	 0.412 	 Spectiv 	 spectiv virtu

Query: 	 0 	 ACW Analytics 	 deliver insight help asset manager plan react extreme weather event
1957 	 0.528 	 Candu 	 candu product experience platform software team want design refine personalize application’s user interface
2527 	 0.503 	 Oceana 	 oceana innovation process  technology platform support organization aspiring become innovative culture
2750 	 0.466 	 PWRFWD 	 pwrfwd online marketplace drive athlete consumer industry
581 	 0.464 	 Vertispan 	 vertispan specializes software development ecosystem support gwt compiler
575 	 0.461 	 Vicicit 	 vicicit cloud based candidate generation system cgs
1508 	 0.452 	 Functionaire 	 functionaire provides ux  ui design ux research ux strategy service
416 	 0.448 	 Foresite.ai 	 foresite saas platform providing predictive analytics geovisualization commercial real estate investor angelpad 12
1725 	 0.442 	 Conversive 	 conversive offer enterprise solution solving problem simulating facetoface conversation scale
1560 	 0.438 	  Fluentz,

Query: 	 0 	 Agape 	 agape platform scientifically designed strengthen romantic relationship personalized daily question
449 	 0.496 	 Harpoon 	 harpoon earlystage venture capital specializes finance investment management
245 	 0.482 	 Cora 	 datadriven saas platform  marketplace 77b b2b furnishing industry
758 	 0.481 	 Stir 	 stir set create product goal transforming way people go
346 	 0.475 	 Digital Capital Management 	 digital capital management fund management firm offering portfolio management digital asset fund plan different investor
681 	 0.435 	 Textile 	 textile building new digital experience focusing user privacy transparency decentralization
2970 	 0.433 	 Cofactr 	 cofactr using data new way link engineering assembly part supply chain circuit board manufacturing industry
2003 	 0.429 	 ConnectBooks 	 connectbooks amazon fba bookkeeping software provides integration profit dashboard amazon fba seller
1494 	 0.423 	 Guesst 	 guesst technology company power new retail
131

Query: 	 0 	 Agreemint 	 search faster close end
1793 	 0.861 	 3D Hybrid Solutions 	 3d hybrid solution develops metal 3d printing laser hardening tool cnc machine
2368 	 0.838 	 Pivot Market 	 marketplace allows brand book space inside store
1146 	 0.823 	 mymedicalimages.com 	 mymedicalimagescom place view share manage medical image
1902 	 0.777 	 Axle 	 axle help freight broker grow faster automating backoffice providing fast  affordable working capital
1490 	 0.777 	 GridRank 	 gridrank first platform enables fast intuitive group decision making turn every decision game colleague
2758 	 0.771 	 ReviewNPrep 	 reviewnprep edtech company help user prepare next certification practice exam marketplace
1243 	 0.771 	 Market Vulture 	 market vulture monitor dramatic event around world spot opportunity lie beneath
2581 	 0.771 	 ResumeZest 	 resume writing service
1563 	 0.768 	 Flux TXT 	 text marketing platform accelerate business
1411 	 0.727 	 iFieldSmart 	 ifieldsmart cloud based pro

Query: 	 0 	 anna 	 anna first aipowered selfcheckout solution designed meet specific need cannabis retailer
1456 	 0.462 	 HinBit 	 hinbit offer various service including education code migration linux consulting service various rt issue
1681 	 0.447 	 EatAgain 	 new innovative food ordering app integrates po system provides customer exclusive discount
1592 	 0.447 	 EXGwear 	 exgwear robotics company specializes field artificial intelligence internet thing
462 	 0.439 	 Hydrostasis 	 hydrostasis developer realtime hydration monitoring system
677 	 0.431 	 The CarShare Guy 	 redefining mobility content carsharing
2043 	 0.429 	 Diversity and Inclusion Learning Snippets 	 diversity inclusion learning snippet company build tool practice better elearning experience
691 	 0.429 	 Tenor.ai 	 tenor build digital medical assistant help clinician provide better care efficient make better decision  
1301 	 0.421 	 Leap Security 	 leap security provides information security penetration testing 

Query: 	 0 	 Aori 	 set tool help conquer google bing facebook ad
1699 	 0.710 	 De Novo 	 job matching app combine ai human wisdom help professional find best job fit interactive streamlined  delightful way
1652 	 0.660 	 Diode Ventures 	  diode venture provides turnkey endtoend solution industrial commercial technology interest
2852 	 0.660 	 ZoeZoe 	 fashion ecommerce
263 	 0.660 	 Coosta 	 coosta mortgage collateral transform platform secondary market
242 	 0.660 	 Conundrum Software 	 conundrum software company develops social networking apps software iphone
2696 	 0.660 	 ZoeZoe 	 fashion ecommerce
2914 	 0.651 	 Mountains 	 mountain online marketplace allows artist creator get feedback work form experienced professional
2283 	 0.613 	 Logistics For Hire 	 logistics hire developer distribution management software
2546 	 0.613 	 Papercups 	 papercups specializes offering reliable efficient software allows company connect user directly realtime chat
2051 	 0.574 	 DoubleGDP 	 doubl

KeyboardInterrupt: 

In [43]:
print(documents)

[['accord', 'technology', 'providing', 'professional', 'support', 'business', 'strive', 'give', 'client', 'enterpriselevel', 'service'], ['venture', 'capital'], ['live', 'data', 'platform', 'ingest', 'process', 'respond', 'operate', 'live', 'data', 'system', 'scale'], ['terawatts', 'clean', 'energy', 'software', 'company'], ['diq', 'make', 'beyond', 'easy', 'recruiter', 'edit', 'format', 'get', 'realtime', 'feedback', 'candidate', 'submittals'], ['p', 'touch', 'medium', 'mobile', 'application', 'development', 'startup'], ['di', 'data', 'analytics', 'platform', 'architecture', 'engineering', 'construction', 'industry'], ['abya', 'developer', 'provider', 'technological', 'component', 'creation', 'graphic', 'virtualization', 'cloudgaming', 'solution'], ['accelitude', 'llc', 'company', 'deal', 'digital', 'cloud', 'service', 'solution'], ['accelor', 'provides', 'hardware', 'solution', 'intended', 'blockchain', 'ai', 'business'], ['accounting', 'panel', 'cloudbased', 'accounting', 'tool', 'd

In [48]:
corpus

[[['venture', 'capital'],
  ['live',
   'data',
   'platform',
   'ingest',
   'process',
   'respond',
   'operate',
   'live',
   'data',
   'system',
   'scale'],
  ['terawatts', 'clean', 'energy', 'software', 'company'],
  ['diq',
   'make',
   'beyond',
   'easy',
   'recruiter',
   'edit',
   'format',
   'get',
   'realtime',
   'feedback',
   'candidate',
   'submittals'],
  ['p', 'touch', 'medium', 'mobile', 'application', 'development', 'startup'],
  ['di',
   'data',
   'analytics',
   'platform',
   'architecture',
   'engineering',
   'construction',
   'industry'],
  ['abya',
   'developer',
   'provider',
   'technological',
   'component',
   'creation',
   'graphic',
   'virtualization',
   'cloudgaming',
   'solution'],
  ['accelitude',
   'llc',
   'company',
   'deal',
   'digital',
   'cloud',
   'service',
   'solution'],
  ['accelor',
   'provides',
   'hardware',
   'solution',
   'intended',
   'blockchain',
   'ai',
   'business'],
  ['accounting',
   'panel',