In [14]:
from urllib import request 
import string

class WebWordsFrequency:
    def __init__(self, *l):
        self.urlList = []
        self.words = {}
        for url in l:
            self.addUrl(url)
    
    def addUrl(self, url):
        self.getWords(url)
        self.urlList.append(url)
        
    def removeUrl(self, url):
        if url in self.urlList:
            del self.words[url]
            self.urlList.remove(url)
    
    def listUrls(self):
        for url in self.urlList:
            print (url)
    
    def getWords(self, url):
        self.words[url] = {}
        source = request.urlopen(url).read().decode('utf-8')
        text = ''
        
        while ('<!--' in source):
            # 주석은 <!-- 과 --> 의 쌍으로 나타내어지므로 index 함수를 사용해 주석을 찾아 없앤다
            left = source.index('<!--')
            right = source.index('-->')
            source = source[:left] + source[right+3:]
        
        # Html Entities 제거
        htmlEntities = ['&nbsp;','&quot;','&lt;','&gt;','&amp;']
        for entity in htmlEntities:
            while (entity in source):
                left = source.index(entity)
                right = source.index(entity)
                source = source[:left] + source[right+len(entity):]    
        
        # <script> 태그가 없어질때까지 반복 (javascript 태그)
        while ('<script' in source):
        # <script> 태그 뒤에는 문장이 올 수 있으므로 <script 으로 검사해준다 
        # <script> ~ </script> 사이에 오는 문장은 순수 텍스트가 아니므로 무시한다
            left = source.index('<script')
            right = source.index('</script>')
            source = source[:left] + source[right+9:]
    
        # <style> 태그가 없어질때까지 반복 (css 태그)
        while ('<style' in source):
        # <style> 태그 뒤에는 문장이 올 수 있으므로 <style 으로 검사해준다 
        # <style> ~ </style> 사이에 오는 문장은 순수 텍스트가 아니므로 무시한다
            left = source.index('<style')
            right = source.index('</style>')
            source = source[:left] + source[right+8:] 
    
        # 태그는 < 과 > 의 쌍으로 나타내어지므로 index 함수를 사용해 태그 부분을 찾아 없앤다
        while('<' in source):
            left = source.index('<')
            right = source.index('>')
            text = text + ' ' + source[:left]
            source = source[right+1:]
            
        # 구두 문자 추가
        puncList = list(string.punctuation)
        puncList.extend(['›','’','‘','···','【','】','…','·','»','¢','£','¥','€','®','ⓒ','©']) 
        stopword = "a,about,above,across,after,again,against,all,almost,alone,along,already,also,although,always,among,an,and,another,any,anybody,anyone,anything,anywhere,are,area,areas,around,as,ask,asked,asking,asks,at,away,b,back,backed,backing,backs,be,became,because,become,becomes,been,before,began,behind,being,beings,best,better,between,big,both,but,by,c,came,can,cannot,case,cases,certain,certainly,clear,clearly,come,could,d,did,differ,different,differently,do,does,done,down,down,downed,downing,downs,during,e,each,early,either,end,ended,ending,ends,enough,even,evenly,ever,every,everybody,everyone,everything,everywhere,f,face,faces,fact,facts,far,felt,few,find,finds,first,for,four,from,full,fully,further,furthered,furthering,furthers,g,gave,general,generally,get,gets,give,given,gives,go,going,good,goods,got,great,greater,greatest,group,grouped,grouping,groups,h,had,has,have,having,he,her,here,herself,high,high,high,higher,highest,him,himself,his,how,however,i,if,important,in,interest,interested,interesting,interests,into,is,it,its,itself,j,just,k,keep,keeps,kind,knew,know,known,knows,l,large,largely,last,later,latest,least,less,let,lets,like,likely,long,longer,longest,m,made,make,making,man,many,may,me,member,members,men,might,more,most,mostly,mr,mrs,much,must,my,myself,n,necessary,need,needed,needing,needs,never,new,new,newer,newest,next,no,nobody,non,noone,not,nothing,now,nowhere,number,numbers,o,of,off,often,old,older,oldest,on,once,one,only,open,opened,opening,opens,or,order,ordered,ordering,orders,other,others,our,out,over,p,part,parted,parting,parts,per,perhaps,place,places,point,pointed,pointing,points,possible,present,presented,presenting,presents,problem,problems,put,puts,q,quite,r,rather,really,right,right,room,rooms,s,said,same,saw,say,says,second,seconds,see,seem,seemed,seeming,seems,sees,several,shall,she,should,show,showed,showing,shows,side,sides,since,small,smaller,smallest,so,some,somebody,someone,something,somewhere,state,states,still,still,such,sure,t,take,taken,than,that,the,their,them,then,there,therefore,these,they,thing,things,think,thinks,this,those,though,thought,thoughts,three,through,thus,to,today,together,too,took,toward,turn,turned,turning,turns,two,u,under,until,up,upon,us,use,used,uses,v,very,w,want,wanted,wanting,wants,was,way,ways,we,well,wells,went,were,what,when,where,whether,which,while,who,whole,whose,why,will,with,within,without,work,worked,working,works,would,x,y,year,years,yet,you,young,younger,youngest,your,yours".split(',')
    
        for t in text.split(): # 현재 text에 들어 있는 문장은 순수 텍스트이므로 공백을 기준으로 자름
            for char in puncList: 
                t = t.replace(char, '')
            t = t.lower() # 모든 단어를 대소문자를 구분하지 않고 단어를 저장함
            if len(t) > 0 and not t in stopword:
                if not t in self.words[url].keys():
                    self.words[url][t] = 1
                else:
                    self.words[url][t] += 1

    
    def getWordsFrequency(self):
        if len(self.urlList) == 0:
            return {}
        
        totalWords = {} 
        for url, word in self.words.items():
            for tmp in word.keys():
                if not tmp in totalWords.keys():
                    totalWords[tmp] = word[tmp]
                else:
                    totalWords[tmp] += word[tmp]
        
        return totalWords
    
    
    def getMaxFreqencyWords(self):
        if len(self.urlList) == 0:
            return None
        
        totalWords = self.getWordsFrequency()
        tmp = []
        maxv = 0
        for word, cnt in totalWords.items():
            if cnt > maxv:
                maxv = cnt
            
        for word, cnt in totalWords.items():
            if cnt == maxv:
                tmp.append(word)
        return tmp
    
    
    def searchUrlByWord(self, keyword):
        temp = []
        similarity = {}
        print("\n")
        print ("[ KEYWORD : %s ]" % (keyword))
        print("--------------------------------------\n")
        for i in range(len(self.urlList)):
            
           # print(self.words[self.urlList[i]])
            temp = list(self.words[self.urlList[i]].keys())
            if keyword in temp:
                similarity[self.urlList[i]] = 10 # 기본 유사도를 10으로 줌
            
                print ("URL : " + self.urlList[i])
                print ("키워드 %s 가 포함된 단어 항목입니다\n" % (keyword))
                
                for j in range(len(temp)):
                    if temp[j].find(keyword) != -1 and temp[j] != keyword:
                        print("[word] %s : [similarity] %d" % (temp[j], self.words[self.urlList[i]][temp[j]]))
                        similarity[self.urlList[i]] += self.words[self.urlList[i]][temp[j]]
                    # 입력받은 키워드가 단어 일부일 경우, 해당 단어의 출현 빈도를 더해줌
        
                    elif temp[j] == keyword:
                        print("[word] %s : [similarity] %d" % (temp[j], self.words[self.urlList[i]][temp[j]] + 20))
                        similarity[self.urlList[i]] += (self.words[self.urlList[i]][temp[j]] + 20) 
                    # 입력받은 키워드가 단어와 같을 경우에는, 해당 단어의 출현 빈도에 보너스 점수 20을 더한 것을 더해줌
                print("--------------------------------------")
            else:
                print ("URL : " + self.urlList[i])
                print ("키워드 %s 가 존재하지 않습니다\n" % (keyword))
                print("--------------------------------------")    
            
        
        maxv = 0
        l = []
        for url, count in similarity.items():
            if count > maxv:
                maxv = count;
        # 리스트에 저장
        for url, count in similarity.items():
            if count == maxv:
                l.append(url)
                
        print("=>" , end = ' ')
        return l

In [15]:
# 인스턴스 생성
w1 = WebWordsFrequency('http://www.cnn.com', 'http://www.times.com', 'http://www.apple.com/')
w2 = WebWordsFrequency('http://www.cnn.com', 'http://www.times.com')
w3 = WebWordsFrequency()

# add 테스트
w1.addUrl('https://github.com')
w3.addUrl('http://stackoverflow.com')

# remove 테스트
w1.removeUrl('http://www.cnn.com')
w2.removeUrl('http://stackoverflow.com')

# listUrls 테스트
w1.listUrls()
print ()
w2.listUrls()
print ()
w3.listUrls()
print ()
print ()

# getWordsFrequency 테스트
print (w1.getWordsFrequency())
print ()
print ()
print (w2.getWordsFrequency())
print ()
print ()
print (w3.getWordsFrequency())
print ()

# getMaxFreqencyWords 테스트
print (w1.getMaxFreqencyWords())
print ()
print (w2.getMaxFreqencyWords())
print ()
print (w3.getMaxFreqencyWords())
print ()

# searchUrlByWord 테스트
print (w1.searchUrlByWord("news"))
print ()
print (w1.searchUrlByWord('apple'))

http://www.times.com
http://www.apple.com/
https://github.com

http://www.cnn.com
http://www.times.com

http://stackoverflow.com


{'york': 8, 'times': 7, 'breaking': 1, 'news': 4, 'world': 9, 'videos': 1, 'continue': 2, 'reading': 2, 'main': 4, 'story': 2, 'sections': 1, 'search': 6, 'skip': 3, 'content': 2, 'site': 6, 'index': 2, 'international': 2, 'canada': 4, 'español': 1, '中文': 1, 'log': 1, 'todays': 1, 'paper': 1, 'politics': 2, 'ny': 2, 'business': 6, 'opinion': 4, 'tech': 2, 'science': 2, 'health': 5, 'sports': 2, 'arts': 2, 'books': 3, 'style': 2, 'food': 2, 'travel': 2, 'magazine': 4, 'real': 3, 'estate': 2, 'video': 2, 'listen': 3, 'daily': 2, 'texas': 2, 'storm': 1, 'processing': 1, 'remembering': 1, 'whitney': 1, 'houston': 1, 'ezra': 1, 'klein': 1, 'crisis': 1, 'normal': 1, 'david': 2, 'wallacewells': 1, 'leah': 1, 'stokes': 1, 'discuss': 2, 'protrump': 1, 'forces': 1, 'pushed': 1, 'lie': 2, 'antifa': 1, 'capitol': 1, 'riot': 1, 'jan': 1, '6': 5, 'supporters': 1, 'former

In [16]:
class OrderedWebWordsFrequency(WebWordsFrequency):
    def __init__(self, *l):
        WebWordsFrequency.__init__(self, *l)
    
    def getWordsFrequency(self, reverse = False):
        totalWords = WebWordsFrequency.getWordsFrequency(self)
        
        # 기본적으로 내림차순 정렬 후 반환
        li = []
        temp = list(totalWords.values())
        temp.sort(reverse = not reverse)
        for value in temp:
            for key in totalWords.keys():
                if value == totalWords[key]:
                    li.append((key, totalWords[key]))
                    del totalWords[key]
                    break
        return li

In [17]:
w4 = OrderedWebWordsFrequency('http://www.times.com', 'http://edition.cnn.com', 'https://github.com')
print(w4.getWordsFrequency())
print()
print()
print(w4.getWordsFrequency(reverse = True))

[('github', 39), ('rarr', 28), ('code', 20), ('cnn', 19), ('world', 15), ('global', 12), ('news', 10), ('vaccine', 10), ('actions', 10), ('vaccinations', 9), ('york', 8), ('videos', 8), ('search', 8), ('united', 8), ('change', 8), ('features', 8), ('learn', 8), ('↵', 8), ('build', 8), ('pull', 8), ('times', 7), ('video', 7), ('development', 7), ('review', 7), ('software', 7), ('security', 7), ('sign', 7), ('business', 6), ('tech', 6), ('food', 6), ('media', 6), ('biden', 6), ('28', 6), ('trackers', 6), ('hospitals', 6), ('restrictions', 6), ('tracker', 6), ('added', 6), ('tv', 6), ('octocatclassifier', 6), ('requests', 6), ('merge', 6), ('site', 5), ('international', 5), ('politics', 5), ('health', 5), ('sports', 5), ('style', 5), ('travel', 5), ('ago', 5), ('million', 5), ('feb', 5), ('14day', 5), ('deaths', 5), ('—', 5), ('found', 5), ('contact', 5), ('project', 5), ('2', 5), ('codespaces', 5), ('packages', 5), ('source', 5), ('community', 5), ('jump', 5), ('developers', 5), ('git', 

In [30]:
class OrderedWebWordsFrequency(WebWordsFrequency):
    def __init__(self, *l):
        WebWordsFrequency.__init__(self, *l)
    
    def getWordsFrequency(self, reverse = False):
        totalWords = WebWordsFrequency.getWordsFrequency(self)
        
        # 기본적으로 내림차순 정렬 후 반환
        li = []
        temp = list(totalWords.values())
        temp.sort(reverse = not reverse)
        for value in temp:
            for key in totalWords.keys():
                if value == totalWords[key]:
                    li.append((key, totalWords[key]))
                    del totalWords[key]
                    break
        return li
    
    def __iter__(self):
        self.i = -1
        self.tmpFrequency = self.getWordsFrequency()
        return self
    
    def __next__(self):
        self.i += 1
        if len(self.tmpFrequency) <= self.i:
            raise StopIteration
            
        return self.tmpFrequency[self.i]

In [31]:
w4 = OrderedWebWordsFrequency('http://www.times.com')
for i in w4:
    print(i)

('global', 12)
('vaccine', 10)
('vaccinations', 9)
('york', 8)
('times', 7)
('united', 6)
('trackers', 6)
('hospitals', 6)
('restrictions', 6)
('development', 6)
('world', 5)
('feb', 5)
('28', 5)
('14day', 5)
('change', 5)
('deaths', 5)
('site', 4)
('canada', 4)
('opinion', 4)
('magazine', 4)
('trump', 4)
('biden', 4)
('coronavirus', 4)
('heres', 4)
('uk', 4)
('—', 4)
('news', 3)
('business', 3)
('real', 3)
('listen', 3)
('time', 3)
('50925', 3)
('–26', 3)
('1129', 3)
('–21', 3)
('own', 3)
('workers', 3)
('amazon', 3)
('charges', 3)
('jacob', 3)
('lawrence', 3)
('via', 3)
('golden', 3)
('globes', 3)
('continue', 2)
('reading', 2)
('main', 2)
('story', 2)
('skip', 2)
('index', 2)
('international', 2)
('politics', 2)
('ny', 2)
('tech', 2)
('science', 2)
('health', 2)
('sports', 2)
('arts', 2)
('books', 2)
('style', 2)
('food', 2)
('travel', 2)
('estate', 2)
('video', 2)
('texas', 2)
('david', 2)
('discuss', 2)
('lie', 2)
('6', 2)
('president', 2)
('quickly', 2)
('live', 2)
('updates', 2)