In [3]:
import urllib
import sqlite3
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Create a list of words to ignore 
ignorewords=set(['the','of','to','and','a','in','is','it'])

class crawler :
    def __init__(self,dbname):
        self.con=sqlite3.connect(dbname)
        
    def __del__(self):
        self.con.close()
        
    def dbcommit(self):
        self.con.commit()
    
    #help-func to get an identifier  and add an item, if its not
    def getentryid(self,table,field,value,createnew=True):
        cur=self.con.execute(
        "select rowid from %s where %s='%s'" % (table,field,value))
        res=cur.fetchone( )
        if res==None:
            cur=self.con.execute(
            "insert into %s (%s) values ('%s')" % (table,field,value))
            return cur.lastrowid
        else:
            return res[0]
    
    #indexing page
    def addtoindex(self,url,soup):
        if self.isindexed(url): return
        print ('Indexing ', url)
        # Get the individual words
        text=self.gettextonly(soup)
        words=self.separatewords(text)
        # Get the URL id
        urlid=self.getentryid('urllist','url',url)
        # Link each word to this url
        for i in range(len(words)):
            word=words[i]
            if word in ignorewords: continue
            wordid=self.getentryid('wordlist','word',word)
            self.con.execute("insert into wordlocation(urlid,wordid,location) \
                values (%d,%d,%d)" % (urlid,wordid,i))
    
    #retrieving text from a page(without <tag>)
    def gettextonly(self,soup):
        v=soup.string
        if v==None:
            c=soup.contents
            resulttext=''
            for t in c:
                subtext=self.gettextonly(t)
                resulttext+=subtext+'\n'
            return resulttext
        else:
            if (soup.name == 'script') or (soup.name == 'style')  :
                return '\n'
            return v.strip( )
    
    #splitting text into words
    def separatewords(self,text):
        splitter=re.compile('\\W*')
        return [s.lower( ) for s in splitter.split(text) if s!='']
    
    #return true if url is indexed
    def isindexed(self,url):
        u=self.con.execute \
            ("select rowid from urllist where url='%s'" % url).fetchone( )
        if u!=None:
            # Check if it has actually been crawled
            v=self.con.execute(
            'select * from wordlocation where urlid=%d' % u[0]).fetchone( )
            if v!=None: return True
        return False
    
    # Add a link between two pages
    def addlinkref(self,urlFrom,urlTo,linkText):
        pass
    
    # Starting with a list of pages, indexing pages
    def crawl(self,pages,depth=2):
        for i in range(depth):
            newpages=set( )
            for page in pages:
                try:
                    c=urllib.request.urlopen(page)
                except:
                    print("Could not open " , page)
                    continue
                soup=BeautifulSoup(c.read( ))
                self.addtoindex(page,soup.body)
                links=soup('a')
                for link in links:
                    if ('href' in dict(link.attrs)):
                        url=urljoin(page,link['href'])
                        if url.find("'")!=-1: continue
                        url=url.split('#')[0] # remove location portion
                        if url[0:4]=='http' and not self.isindexed(url):
                            newpages.add(url)
                        linkText=self.gettextonly(link)
                        self.addlinkref(page,url,linkText)
                self.dbcommit( )
            pages=newpages
    
    # creating the database tables
    def createindextables(self):
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit( )

        

class searcher:
    def __init__(self,dbname):
        self.con=sqlite3.connect(dbname)
        
    def __del__(self):
        self.con.close( )
    
    def getmatchrows(self,q):
        # Strings to build the query
        fieldlist='w0.urlid'
        tablelist=''
        clauselist=''
        wordids=[]
        # Split the words by spaces
        words=q.split(' ')
        tablenumber=0
        for word in words:
            # Get the word ID
            wordrow=self.con.execute( "select rowid from wordlist where word='%s'" % word).fetchone( )
            if wordrow!=None:
                wordid=wordrow[0]
                wordids.append(wordid)
                if tablenumber>0:
                    tablelist+=','
                    clauselist+=' and '
                    clauselist+='w%d.urlid=w%d.urlid and ' % (tablenumber-1,tablenumber)
                fieldlist+=',w%d.location' % tablenumber
                tablelist+='wordlocation w%d' % tablenumber
                clauselist+='w%d.wordid=%d' % (tablenumber,wordid)
                tablenumber+=1
        # Create the query from the separate parts
        fullquery='select %s from %s where %s' % (fieldlist,tablelist,clauselist)
        cur=self.con.execute(fullquery)
        rows=[row for row in cur]
        return rows,wordids
    
    def getscoredlist(self,rows,wordids):
        totalscores=dict([(row[0],0) for row in rows])
        # This is where you'll later put the scoring functions
        weights=[(1.0,self.frequencyscore(rows)),
                (1.5,self.locationscore(rows)),
                (1.0,self.distancescore(rows))
                ]
        for (weight,scores) in weights:
            for url in totalscores:
                totalscores[url]+=weight*scores[url]
                
        return totalscores
    def geturlname(self,id):
        return self.con.execute(
        "select url from urllist where rowid=%d" % id).fetchone( )[0]
    
    def query(self,q):
        rows,wordids=self.getmatchrows(q)
        scores=self.getscoredlist(rows,wordids)
        rankedscores=sorted([(score,url) for (url,score) in scores.items( )],reverse=1)
        for (score,urlid) in rankedscores[0:10]:
            print('%f\t%s' , (score,self.geturlname(urlid)))
            
    def normalizescores(self,scores,smallIsBetter=0):
        vsmall=0.00001 # Avoid division by zero errors
        if smallIsBetter:
            minscore=min(scores.values( ))
            return dict([(u,float(minscore)/max(vsmall,l)) for (u,l)  in scores.items( )])
        else:
            maxscore=max(scores.values( ))
            if maxscore==0: maxscore=vsmall
            return dict([(u,float(c)/maxscore) for (u,c) in scores.items( )])
    
    def frequencyscore(self,rows):
        counts=dict([(row[0],0) for row in rows])
        for row in rows: counts[row[0]]+=1
        return self.normalizescores(counts)

    def locationscore(self,rows):
        locations=dict([(row[0],1000000) for row in rows])
        for row in rows:
            loc=sum(row[1:])
            if loc<locations[row[0]]: locations[row[0]]=loc
        return self.normalizescores(locations,smallIsBetter=1)
    
    def distancescore(self,rows):
        # If there's only one word, everyone wins!
        if len(rows[0])<=2: return dict([(row[0],1.0) for row in rows])
        # Initialize the dictionary with large values
        mindistance=dict([(row[0],1000000) for row in rows])
        for row in rows:
            dist=sum([abs(row[i]-row[i-1]) for i in range(2,len(row))])
            if dist<mindistance[row[0]]: mindistance[row[0]]=dist
        return self.normalizescores(mindistance,smallIsBetter=1)

In [10]:
eq = search.con.execute("select * from wordlist ").fetchall()

In [11]:
eq

[('react',),
 ('empty',),
 ('2',),
 ('toggle',),
 ('navigation',),
 ('open',),
 ('closed',),
 ('catalog',),
 ('browse',),
 ('search',),
 ('for',),
 ('enterprise',),
 ('log',),
 ('sign',),
 ('up',),
 ('46',),
 ('take',),
 ('world',),
 ('s',),
 ('best',),
 ('courses',),
 ('online',),
 ('join',),
 ('free',),
 ('see',),
 ('all',),
 ('credentials',),
 ('that',),
 ('count',),
 ('earn',),
 ('degrees',),
 ('certificates',),
 ('from',),
 ('top',),
 ('universities',),
 ('on',),
 ('coursera',),
 ('learn',),
 ('more',),
 ('specializations',),
 ('deeplearning',),
 ('ai',),
 ('deep',),
 ('learning',),
 ('5',),
 ('view',),
 ('specialization',),
 ('google',),
 ('support',),
 ('professional',),
 ('certificate',),
 ('6',),
 ('university',),
 ('michigan',),
 ('python',),
 ('everybody',),
 ('johns',),
 ('hopkins',),
 ('data',),
 ('science',),
 ('10',),
 ('applied',),
 ('with',),
 ('illinois',),
 ('at',),
 ('urbana',),
 ('champaign',),
 ('strategic',),
 ('leadership',),
 ('management',),
 ('7',),
 ('imperi

In [4]:
crawler = crawler('searchindex.db')


In [6]:
crawler.crawl(['https://www.coursera.org/'])



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Indexing  https://www.coursera.org/
Indexing  https://www.coursera.org/learn/game-theory-1
Indexing  https://www.coursera.org/?authMode=signup
Indexing  https://www.coursera.org/?authMode=login
Indexing  https://www.coursera.org/learn/convolutional-neural-networks
Indexing  https://www.coursera.org/learn/pap
Indexing  https://www.coursera.org/specializations/digital-marketing
Indexing  https://www.coursera.org/learn/finanzas-personales
Indexing  https://www.coursera.org/learn/sciwrite
Indexing  https://learner.coursera.help/hc
Indexing  https://www.coursera.org/specializations/tesol
Indexing  https://www.coursera.org/learn/learn-to-program
Indexing  https://www.coursera.org/specializations/gcp-data-machine-learning
Indexing  https://www.coursera.org/learn/introduction-psych
Indexing  https://www.coursera.org/specializations/managerial-economics-business-analysis
Indexing  https://www.coursera.org/learn/bcg-uva-darden-digital-transformation
Indexing  https://www.coursera.org/about/priva

In [9]:
search = searcher('searchindex.db')

In [12]:
search.query('functional programming')

%f	%s (3.5, 'https://www.coursera.org/learn/programming-languages')
%f	%s (3.0439383482861744, 'https://www.coursera.org/learn/programming-languages-part-b')
%f	%s (1.3685406963850077, 'https://www.coursera.org/featured/top_specializations_locale_en_os_web')
%f	%s (1.3587831688020724, 'https://www.coursera.org/courses')
%f	%s (0.3812795561657473, 'https://www.coursera.org/learn/python-programming-introduction')
