In [7]:
from bs4 import BeautifulSoup
import requests
import multiprocessing
import numpy as np
import pandas as pd
from gensim import models
from gensim import corpora
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Phrases
from gensim.models.fasttext import FastText
from gensim.models.wrappers.fasttext import FastText as FT_wrapper
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary
from nltk.tokenize import sent_tokenize, word_tokenize
from stop_words import get_stop_words
from random import shuffle
import time
import re
import pylab as pl
from ipywidgets import FloatProgress
from IPython import display
import matplotlib.pyplot as plt
%matplotlib notebook

In [4]:
w2v = KeyedVectors.load_word2vec_format("../w2v/model/wiki-news-300d-1M.vec")
# w2v = KeyedVectors.load("../w2v/model/fasttext_w2v_vector_64")

In [46]:
def get_urls_from_url(main_url):
    resp = requests.get(main_url)
    soup = BeautifulSoup(resp.content, 'html.parser')
    urls = []
    links = soup.find_all('a')
    for url in links:
        try:
            url = url.attrs['href']
            if len(url) > 5:
                urls.append(url)
        except:
            pass
    return urls

def get_texts_from_resp(resp):
    # parse the web response
    soup = BeautifulSoup(resp.content, 'html.parser')
    # find and filter texts
    print("These are texts under", resp.url)
    texts = soup.find_all('p')
    print("number of items grabed are", len(texts))
    texts = [text for text in texts if len(text.text) > 100]
    print("number of items after filtering", len(texts))
    # output texts
    for text in texts:
        #print(text.text)
        yield text.text

def url_is_valid(url):
    try:
        resp = requests.get(url)
        assert resp.status_code == 200
        return resp
    except:
        return False

def url_compare(url_target, url_income):
    n_same_letter = 0.0
    # delete all http or https 
    if url_target[4] == 's':
        url_target = url_target[5:]
    else:
        url_target = url_target[4:]
    if url_income[4] == 's':
        url_income = url_income[5:]
    else:
        url_income = url_income[4:]
    # check similarity
    min_len = min(len(url_target), len(url_income))
    for i in range(min_len-1):
        if url_target[i] == url_income[i]:
            n_same_letter += 1
        else:
            break
    return n_same_letter

def get_text_from_url_with_check(url, main_url):
    resp = url_is_valid(url)
    if not resp:
        url = main_url + url
        resp = url_is_valid(url)
        if not resp:
            print("url:", url, "invalid")
            return []
    # double check if the url is visited
    if resp.url != url: # meaning its redirected
        print('the url is redirected, try https\n')
        # try https
        url = url[:4] + 's' + url[4:]
        resp = url_is_valid(url)
        if resp:
            if resp.url == url:
                print('try succeeded')
        else:
            return []
    # check if url is the child or sibling of main_url
    if url_compare(main_url, resp.url) < 10: # to avoid http://www.
        print('\nurl:', resp.url, 'might be irrelevent to', main_url, 'quit visiting\n')
        return []
    text_data = []
    for text in get_texts_from_resp(resp):
        text_data.append(text)
    return text_data

In [48]:
class Searcher():
    def __init__(self, w2v=None, database=None):
        # load w2v modle
        if w2v is None:
            print("start loading w2v, this might take a while")
            self._w2v = KeyedVectors.load_word2vec_format("../w2v/model/wiki-news-300d-1M.vec")
        else:
            self._w2v = w2v
        
        # get and process database
        if database is None:
            # if no crawled database given
            # load the dataset : including only each company's name, url and summary
            self._database = pd.read_csv("../input/InvestData_2017-Nov-22_0101.csv").iloc[:, [1, 5, 6]]
            self.crawl_database()
        else:
            self._database = database
        self.process_database()
        
    def process_database(self):
        # 1: company name, 5: company website, 6: company manual desc
        raw_texts = []
        # preprocess all the text data and remove any row without any useful data, and segment each word
        drop_list = []
        for row in self._database.itertuples():
            if not type(row[1]) is str or (not type(row[2]) is str and not type(row[3]) is str):
                # check if the row has data
                drop_list.append(row[0])
            else:
                # process text data of both manually summarized or crawled data
                tmp_text = []
                for col in [2, 3]:
                    real_col = col - 1
                    text = row[col]
                    if type(text) is str:
                        text = self.word_tokenize_string(text)
                        self._database.iloc[row[0], real_col] = text
                        tmp_text.append(text)
                # merge texts of same company
                tmp_text = '    '.join(tmp_text)
                raw_texts.append(tmp_text)
                
        # drop all the rows that do not have essential data
        self._database.drop(drop_list, inplace=True)
        # create similarity col for similarity search use
        self._database = self._database.assign(similarity=np.zeros(len(self._database)))
        
        # use the raw_texts to generate tfidf model
        self._tfidf, self._dictionary = self.get_tfidf_and_dictionary(raw_texts)
        
    def crawl_database(self):
        for row in self._database.itertuples():
            if not (not type(row[1]) is str or (not type(row[2]) is str and not type(row[3]) is str)):
                # process each website and replace web address with texts crawled
                url = row[2]
                texts = self.get_text_from_url_and_its_children(url)
                if not texts:
                    # if cannot access url, replace url with Nan
                    self._database.iloc[row[0], 1] = np.nan
                else:
                    # replace the url with the crawled texts
                    texts = '   '.join(texts)
                    self._database.iloc[row[0], 1] = texts
        
    def save_database(self):
        self._database.to_csv('crawled_database.csv')
        print("database save successful")
    
    def update_similarity(self, input_text, col=1):
        # get input text vector
        input_text_vector = self._get_doc_vector(input_text)
        i = 0
        for row in database.itertuples():
            row_text_vector = self._get_doc_vector(row[col+1])
            similarity = input_text_vector.dot(row_text_vector)
            database.iloc[i, -1] = similarity
            i += 1
        self._database = self._database.sort_values(by='similarity', ascending=False)
        return database

    def get_doc_vector(self, text):
        tokens = list(self._dictionary.token2id)
        # convert any unknown word to known word
        new_text = []
        for word in text.split():
            if word in tokens:
                new_text.append(word)
            elif word in w2v: # replace the unknow word with the most similar word in tokens of dictionary
                new_text.append(self._w2v.most_similar_to_given(word_list=tokens, w1=word))

        # start to calculate vector using tfidf weighted word vector sum
        # get tfidf weight
        tokenized_text = [self._dictionary.doc2bow(new_text)]
        tfidf_text = self._tfidf[tokenized_text][0]
        # sum weighted word vectors
        sum_vector = self._w2v['happy'] * 0 # get the size of the word vector
        for word_id, weight in tfidf_text:
            word = self._dictionary[word_id]
            sum_vector += self._w2v[word] * weight
        sum_vector /= np.sqrt(sum_vector.dot(sum_vector)) # normalize the vector

        return sum_vector
    
    def word_tokenize_string(self, text):
        stop_words = get_stop_words('en') # get too frequent word
        text = text.replace('\r', ' ').replace('\n', ' ') # remove symbols
        text = re.sub(r"http\S+", "", text) # remove urls
        # remove any word that present too frequently or cannot be converted to word vector
        text = [word for sent in sent_tokenize(text.lower()) for word in word_tokenize(sent) \
                if not word in stop_words and word in self._w2v]
        return ' '.join(text)
    
    @staticmethod
    def get_tfidf_and_dictionary(texts):
        # get dictionary of texts
        texts = [text.split() for text in texts]
        dictionary = corpora.Dictionary(texts)

        # get tfidf ranking model
        tokenized_texts = [dictionary.doc2bow(text) for text in texts]
        tfidf = models.TfidfModel(tokenized_texts)

        return tfidf, dictionary
    
    def get_text_from_url_and_its_children(self, main_url):
        print("starting to crawl main url: ", main_url)
        # check validity of main_url
        resp = url_is_valid(main_url)
        if not resp:
            print("main_url is not valid")
            return False

        print("\nstarting to crawl all its children")
        # grab all urls in this web page
        urls = [main_url]
        urls.extend(get_urls_from_url(main_url))
        urls = list(set(urls)) # remove duplicated urls
        print("\n\nthese are the children links we crawled")
        print(urls, "\n")
        # grab all texts in each urls asynchronously
        # argmumentize urls
        urls = [(url, main_url) for url in urls]
        with multiprocessing.Pool(processes=24) as pool:
            text_data = pool.starmap(get_text_from_url_with_check, urls) 
            # try terminating hung jobs
        text_data = [text for text in text_data if len(text_data) > 0] # remove empty returns
        text_data = [text for text_list in text_data for text in text_list] # get list elements to str
        return text_data

In [None]:
searcher = Searcher(w2v=w2v)
# there might be main_url + url is not valid, because url and main_url has overlaps, or main_url is not the root
# must provide root url, or 
# use overlaps to do intelligent main_url + url

starting to crawl main url:  http://www.aecfafrica.org/

starting to crawl all its children


these are the children links we crawled
['/contact-us', '/contact', '/the-aecf-board', '/about-us/who-we-are', '/portfolio/renewable-energy', '/about-us/our-history', '/about-us/strategic-partners', '/portfolio/aecf-connect', 'tel:+254703033394', '\n\n\n\n\n/portfolio/renewable-energy\n\n\n', '/about-us/funding-partners', '/portfolio/competitions', '/portfolio/agribusiness', '/about-us/work-with-us', 'http://www.aecfafrica.org/', '/media-centre/videos', 'http://www.parioagency.com', '/news/sida_signs_five_year_agreement_with_the_AECF', '\n\n\n\n\n/portfolio/agribusiness\n\n\n', 'https://www.facebook.com/AECFAfrica/', '\n\n\n\n\n/about-us/who-we-are\n\n\n', '/knowledge-hub', 'https://www.linkedin.com/company/africa-enterprise-challenge-fund', 'mailto:info@aecfafrica.org', '/portfolio/overview', '/media-centre/blog', '/impact', '\n\n\n\n\n/about-us/funding-partners\n\n\n', 'https://twitter.com/A

In [30]:
url = 'http://python.omics.wiki/multiprocessing_map/multiprocessing_partial_function_multiple_arguments'
searcher.get_text_from_url_and_its_children(url)

starting to crawl main url:  http://python.omics.wiki/multiprocessing_map/multiprocessing_partial_function_multiple_arguments

starting to crawl all its children


these are the children links we crawled
['/math/matrix-vector-array-numpy', '/math/statistics', 'https://docs.python.org/3/library/multiprocessing.html', '/home/help', '/www/download-webpage', '/strings', '/argparse', '/plot/matplotlib', '/loops/if-else', '/math/types', '/debugging', '/home/download-install', '/error-handling/raise-type-checking', '/multiprocessing_map', '/packages', 'http://sites.google.com', '/sytem/environment', '/error/taberror', '/loops/list-comprehension', '/print-write-format/error', '/www/cgi-script', '/data-structures/dictionary', '/sytem', '/file-operations/files-read-write', '/packages/create-modules', '/packages/python-script', 'https://docs.python.org/3.4/library/functools.html#functools.partial', '/strings/safe-characters', 'http://python.omics.wiki/', '/math/dataframe', '/system/app/pages/rece

Process ForkPoolWorker-191:
Process ForkPoolWorker-190:
Process ForkPoolWorker-192:
Process ForkPoolWorker-194:
Process ForkPoolWorker-180:
Process ForkPoolWorker-183:
Process ForkPoolWorker-193:
Process ForkPoolWorker-196:
Process ForkPoolWorker-181:
Process ForkPoolWorker-195:
Process ForkPoolWorker-179:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-182:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
  F

KeyboardInterrupt
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
Process ForkPoolWorker-175:
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/synchronize.py", line 9

  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
KeyboardInterrupt
KeyboardInterrupt
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/queues.py", line 334, in get
    with self._rlock:
KeyboardInterrupt
  File "/home/yuze/anaconda3/envs/tensorflow/lib

KeyboardInterrupt: 

Exception ignored in: 'zmq.backend.cython.message.Frame.__dealloc__'
Traceback (most recent call last):
  File "zmq/backend/cython/checkrc.pxd", line 12, in zmq.backend.cython.checkrc._check_rc
KeyboardInterrupt
Process ForkPoolWorker-178:
Traceback (most recent call last):
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/queues.py", line 337, in get
    return ForkingPickler.loads(res)
KeyboardInterrupt


KeyboardInterrupt: 

In [11]:
def rescue_code(function):
    import inspect
    get_ipython().set_next_input("".join(inspect.getsourcelines(function)[0]))

In [16]:
input_text = "new start up aiming at low income customers, dedicated in green energy"

In [17]:
search_output = update_similarity(w2v, dictionary, tfidf, input_text, database)
list(search_output.iloc[1, :])

['Fund for Developing',
 '. invests enterprises low income countries promote business development contribute economic growth poverty alleviation . ’ s geographic focus eastern southern africa , well selected countries asia central america . focuses supporting small medium sized companies .',
 nan,
 0.87590879201889038]

In [44]:
vec1 = get_doc_vector('a startup that dedicate to green energy', w2v, dictionary, tfidf)
vec2 = get_doc_vector('business regrading green energy', w2v, dictionary, tfidf)
vec3 = get_doc_vector('companies specificly support low-income people', w2v, dictionary, tfidf)

In [46]:
vec1.dot(vec2)

0.83994347

In [11]:
vec1

array([-0.01663876, -0.00337446, -0.02395986, -0.04154515, -0.10667257,
        0.01662513, -0.01009543,  0.06161033,  0.05003018, -0.06181473,
       -0.02179295, -0.01145902, -0.01362746, -0.03405415,  0.00218448,
        0.00767476, -0.01452028, -0.0018257 ,  0.07659619, -0.04767921,
       -0.04837132, -0.02310556, -0.05546847, -0.00884663, -0.01575219,
        0.00686825, -0.02110741,  0.02846674, -0.00410359,  0.0267566 ,
        0.0219034 ,  0.01259031, -0.00153076,  0.0267643 ,  0.04200932,
       -0.05939888,  0.02810077, -0.01088257,  0.02167377,  0.05022477,
        0.00617732, -0.01705034,  0.04542316,  0.08962091, -0.01164754,
        0.0392498 , -0.02558717, -0.0163672 ,  0.06803226, -0.01166763,
        0.01418332,  0.04633227, -0.00154001,  0.00630221, -0.05789408,
        0.02684514,  0.03213207,  0.02908244, -0.05910822,  0.01098826,
       -0.04702628, -0.01257465,  0.02256784, -0.01471028], dtype=float32)

In [12]:
vec2

array([-0.01663876, -0.00337446, -0.02395986, -0.04154515, -0.10667257,
        0.01662513, -0.01009543,  0.06161033,  0.05003018, -0.06181473,
       -0.02179295, -0.01145902, -0.01362746, -0.03405415,  0.00218448,
        0.00767476, -0.01452028, -0.0018257 ,  0.07659619, -0.04767921,
       -0.04837132, -0.02310556, -0.05546847, -0.00884663, -0.01575219,
        0.00686825, -0.02110741,  0.02846674, -0.00410359,  0.0267566 ,
        0.0219034 ,  0.01259031, -0.00153076,  0.0267643 ,  0.04200932,
       -0.05939888,  0.02810077, -0.01088257,  0.02167377,  0.05022477,
        0.00617732, -0.01705034,  0.04542316,  0.08962091, -0.01164754,
        0.0392498 , -0.02558717, -0.0163672 ,  0.06803226, -0.01166763,
        0.01418332,  0.04633227, -0.00154001,  0.00630221, -0.05789408,
        0.02684514,  0.03213207,  0.02908244, -0.05910822,  0.01098826,
       -0.04702628, -0.01257465,  0.02256784, -0.01471028], dtype=float32)