In [1]:
import gzip
import csv 
import numpy as np
import tarfile

In [2]:
INPUT_HOSTNAMES = "uk-2007-05.hostnames.txt.gz"
HOSTGRAPH = "uk-2007-05.hostgraph_weighted.graph-txt.gz"
WEBSPAM_1 = "WEBSPAM-UK2007-SET1-labels.txt"
WEBSPAM_2 = "WEBSPAM-UK2007-SET2-labels.txt"
ALPHA = 0.85

In [3]:
def readTxt():
    """
    It reads the file and create the id2name dictionary
    :return: id2name dictionary in which the key is the id and the value is the url of the site
    """
    with gzip.open(INPUT_HOSTNAMES, "rt", encoding="utf-8") as input_file:
        id2name = {}
        reader = csv.reader(input_file, delimiter=' ', quotechar='"')
        for record in reader:
            i = int(record[0])
            id2name[i] = record[1]
    return id2name

In [4]:
def pageRank(it):
    """
    It executes the page rank algorithm 
    :it: number of iterations (int)
    :return: an array with the page rank score of each node. 
    """
    pageRankLen = len(id2name)
    pageRankVec = [1/pageRankLen]*pageRankLen
    pageRankAux = [0]*pageRankLen
    for i in range(it):
        with gzip.open(HOSTGRAPH, "rt", encoding="utf-8") as input_file:
            reader = csv.reader(input_file, delimiter=' ', quotechar='"')
            row1 = next(reader)  # gets the first line
            #index is the position in the pageRankVec where we store the score of the source node 
            index = 0
            for src in reader:
                for entry in src:
                    splitted = entry.split(":")
                    dest = splitted[0]
                    #weight = splitted[1]
                    outdegree = len(src)
                    pageRankAux[int(dest)] += pageRankVec[index]/outdegree
                    
                index += 1
            #once we have computed the simplified page rank, we compute the value of the page rank for each node.
            for j in range(len(pageRankVec)):
                pageRankVec[j] = ALPHA*pageRankAux[j] + (1-ALPHA)/pageRankLen
                pageRankAux[j] = 0
    return pageRankVec

In [5]:
def spamHosts():
    """
    It creates a dictionary that contains the sites that are spammers.
    :return: dictionary in which the key is the node id and the value is True for everyone. 
    """
    #we read the file that contains the set 1 of sites.
    fd1 = open(WEBSPAM_1,"r")
    is_spam = {}
    for line in fd1:
        splitted = line.split(" ")
        #we check if is a spammer
        if splitted[1] == "spam":
            key = splitted[0]
            is_spam[key] = True
    fd1.close()
    #it reads the file that contains the set 2 of sites.
    fd2 = open(WEBSPAM_2,"r")
    for line in fd2:
        splitted = line.split(" ")
        #we check if is a spammer
        if splitted[1] == "spam":
            key = splitted[0]
            is_spam[key] = True
    fd2.close()
    return is_spam

In [6]:
def noSpamPageRank(it):
    """
    It computes the page rank excluding the nodes that are spammers
    :it: number of iterations (int)
    :return: a list with the page rank score of each node. 
    """
    is_spam = spamHosts()
    pageRankLen = len(id2name)
    pageRankVec = [1/pageRankLen]*pageRankLen
    for i in range(it):
        pageRankAux = [0]*pageRankLen
        with gzip.open(HOSTGRAPH, "rt", encoding="utf-8") as input_file:
            reader = csv.reader(input_file, delimiter=' ', quotechar='"')
            row1 = next(reader)  # gets the first line
            #index is the position in the pageRankVec where we store the score of the source node 
            index = 0
            for src in reader:
                #we check that the source is not in the is_spam dictionary
                if str(index) not in is_spam.keys():
                    outlinks = []
                    for entry in src:
                        splitted = entry.split(":")
                        dest = splitted[0]
                        #weight = splitted[1]
                         #we check that the target is not in the is_spam dictionary
                        if dest not in is_spam:
                            outlinks.append(dest)
                        
                    for dest in outlinks:
                        pageRankAux[int(dest)] += pageRankVec[index]/(len(outlinks))
                index += 1
            #once we have computed the simplified page rank, we compute the value of the page rank for each node.
            for j in range(len(pageRankVec)):
                pageRankVec[j] = ALPHA*pageRankAux[j] + (1-ALPHA)/pageRankLen
    return pageRankVec

In [7]:
id2name = readTxt()
iterations = 20


In [8]:
#Top 20 places PageRank

#Execute PageRank
pageRankVec = pageRank(iterations)
hosts_by_pagerank = sorted(enumerate(pageRankVec), key=lambda x: x[1], reverse=True)
topHosts = hosts_by_pagerank[:20]
print("##########top 20 Sites Page Rank##########")
for host in topHosts:
    print(str(host[1]) + " " + id2name[host[0]])


##########top 20 Sites Page Rank##########
0.0015168643782398274 www.opsi.gov.uk
0.0014182182459715198 www.adobe.co.uk
0.0009654563807161581 www.ico.gov.uk
0.00089555331065544 www.dti.gov.uk
0.0008937889977065464 www.defra.gov.uk
0.000780473245157769 news.bbc.co.uk
0.0007209475362666682 www.direct.gov.uk
0.000697529301060852 www.dfes.gov.uk
0.0006817074015284916 www.fsa.gov.uk
0.0006581577785798681 www.nationalrail.co.uk
0.0006554311467163489 www.communities.gov.uk
0.0006482874881544456 www.bbc.co.uk
0.0006028805944996983 www.google.co.uk
0.0005906484974062024 www.dh.gov.uk
0.0005818417136696459 www.hmso.gov.uk
0.0005757813160594587 www.hse.gov.uk
0.000540229700172051 www.fco.gov.uk
0.0005155226994647975 www.nationaltrust.org.uk
0.000483592734339035 www.homeoffice.gov.uk
0.00045848917724008694 mysite.wanadoo-members.co.uk


In [9]:
#Top 20 Execute PageRank NO SPAM
noSpamPageRankVec = noSpamPageRank(iterations)
hosts_by_pagerank = sorted(enumerate(noSpamPageRankVec), key=lambda x: x[1], reverse=True)
topHosts = hosts_by_pagerank[:20]

print("##########top 20 Sites Page Rank w/o Spam##########")
for host in topHosts:
    print(str(host[1]) + " " + id2name[host[0]])


##########top 20 Sites Page Rank w/o Spam##########
0.0015163269912291798 www.opsi.gov.uk
0.0014177550573844725 www.adobe.co.uk
0.0009642175795585692 www.ico.gov.uk
0.0008953203325559037 www.dti.gov.uk
0.0008925273921990404 www.defra.gov.uk
0.0007798118062589196 news.bbc.co.uk
0.0007204877150489438 www.direct.gov.uk
0.0006978022269123422 www.dfes.gov.uk
0.0006801444750391456 www.fsa.gov.uk
0.0006574169744247544 www.nationalrail.co.uk
0.0006552851670602633 www.communities.gov.uk
0.0006481317727919905 www.bbc.co.uk
0.0006023073035861512 www.google.co.uk
0.0005910116655537075 www.dh.gov.uk
0.0005818574145054327 www.hmso.gov.uk
0.00057537938842279 www.hse.gov.uk
0.000539037997378092 www.fco.gov.uk
0.0005154578899466747 www.nationaltrust.org.uk
0.0004833514113988187 www.homeoffice.gov.uk
0.00045755162909262494 mysite.wanadoo-members.co.uk


In [10]:
#Top 20 places PageRank co.uk

#Execute PageRank
pageRankVec = pageRank(iterations)
hosts_by_pagerank = sorted(enumerate(pageRankVec), key=lambda x: x[1], reverse=True)
topHosts = hosts_by_pagerank[:100]
print("##########top 20 Sites Page Rank with co.uk domain ##########")
count = 20
for host in topHosts:
    if "co.uk" in id2name[host[0]]:
        print(str(host[1])+" "+id2name[host[0]])
        if count==0:
            break
        count -=1


##########top 20 Sites Page Rank with co.uk domain ##########
0.0014182182459715198 www.adobe.co.uk
0.000780473245157769 news.bbc.co.uk
0.0006581577785798681 www.nationalrail.co.uk
0.0006482874881544456 www.bbc.co.uk
0.0006028805944996983 www.google.co.uk
0.00045848917724008694 mysite.wanadoo-members.co.uk
0.0004268162067476992 www.actinic.co.uk
0.0003640398196569995 www.networkrail.co.uk
0.00032710620860882734 www.caa.co.uk
0.00032325517863542096 www.erolonline.co.uk
0.00031455563140291595 www.punterlink.co.uk
0.00030441339496852974 www.streetmap.co.uk
0.00030310453842517744 www.tso.co.uk
0.0002926680894983065 www.kelkoo.co.uk
0.00028086861863899906 www.guardian.co.uk
0.0002781833278394606 www.rac.co.uk
0.0002638469923639058 www.event-management-uk.co.uk
0.00024662598481841865 www.telegraph.co.uk
0.00023703672636871383 www.investorsinpeople.co.uk
0.00021834116672779356 www.business-directory-uk.co.uk
0.0002080517504550283 www.infotex.co.uk


In [11]:
#Top 20 places PageRank co.uk NO SPAM

#Execute PageRank
noSpampageRankVec = noSpamPageRank(iterations)
hosts_by_pagerank = sorted(enumerate(noSpampageRankVec), key=lambda x: x[1], reverse=True)
topHosts = hosts_by_pagerank[:100]
count = 20
print("##########top 20 Sites Page Rank with co.uk domain w/o Spam ##########")
for host in topHosts:
    if "co.uk" in id2name[host[0]]:
        print(str(host[1])+" "+id2name[host[0]])
        if count==0:
            break
        count -=1


##########top 20 Sites Page Rank with co.uk domain w/o Spam ##########
0.0014177550573844725 www.adobe.co.uk
0.0007798118062589196 news.bbc.co.uk
0.0006574169744247544 www.nationalrail.co.uk
0.0006481317727919905 www.bbc.co.uk
0.0006023073035861512 www.google.co.uk
0.00045755162909262494 mysite.wanadoo-members.co.uk
0.0004275822265788012 www.actinic.co.uk
0.000363578117342681 www.networkrail.co.uk
0.00032670701629315484 www.caa.co.uk
0.0003173640738929826 www.erolonline.co.uk
0.00031651948957987466 www.punterlink.co.uk
0.00030424951060941416 www.streetmap.co.uk
0.0003028006825034482 www.tso.co.uk
0.0002928870966451433 www.kelkoo.co.uk
0.00028050984576233996 www.guardian.co.uk
0.0002779493305391587 www.rac.co.uk
0.0002659336091578067 www.event-management-uk.co.uk
0.0002461311067730707 www.telegraph.co.uk
0.00023691730911530067 www.investorsinpeople.co.uk
0.00021903587788696993 www.business-directory-uk.co.uk
0.00020761400425286887 www.infotex.co.uk


In [12]:
#Top 20 places PageRank gov.uk

#Execute PageRank
pageRankVec = pageRank(iterations)
hosts_by_pagerank = sorted(enumerate(pageRankVec), key=lambda x: x[1], reverse=True)
topHosts = hosts_by_pagerank[:100]
count = 20
print("##########top 20 Sites Page Rank with gov.uk domain ##########")
for host in topHosts:
    if "gov.uk" in id2name[host[0]]:
        print(str(host[1])+" "+id2name[host[0]])
        if count==0:
            break
        count -=1

##########top 20 Sites Page Rank with gov.uk domain ##########
0.0015168643782398274 www.opsi.gov.uk
0.0009654563807161581 www.ico.gov.uk
0.00089555331065544 www.dti.gov.uk
0.0008937889977065464 www.defra.gov.uk
0.0007209475362666682 www.direct.gov.uk
0.000697529301060852 www.dfes.gov.uk
0.0006817074015284916 www.fsa.gov.uk
0.0006554311467163489 www.communities.gov.uk
0.0005906484974062024 www.dh.gov.uk
0.0005818417136696459 www.hmso.gov.uk
0.0005757813160594587 www.hse.gov.uk
0.000540229700172051 www.fco.gov.uk
0.000483592734339035 www.homeoffice.gov.uk
0.0004572991098763979 www.dft.gov.uk
0.00044551620266407174 www.dataprotection.gov.uk
0.00043753647957066734 www.dwp.gov.uk
0.0004196267367553457 www.legislation.hmso.gov.uk
0.0003958947431418644 www.informationcommissioner.gov.uk
0.00037184244017089766 www.statistics.gov.uk
0.0003704741179414513 www.hm-treasury.gov.uk
0.0003391975486385459 www.tfl.gov.uk


In [13]:
#Top 20 places PageRank gov.uk NO SPAM

#Execute PageRank
noSpampageRankVec = noSpamPageRank(iterations)
hosts_by_pagerank = sorted(enumerate(noSpampageRankVec), key=lambda x: x[1], reverse=True)
topHosts = hosts_by_pagerank[:100]
count = 20
print("##########top 20 Sites Page Rank with gov.uk domain w/o spam##########")
for host in topHosts:
    if "gov.uk" in id2name[host[0]]:
        print(str(host[1]) +" "+ id2name[host[0]])
        
        if count==0:
            break
        count -=1

##########top 20 Sites Page Rank with gov.uk domain w/o spam##########
0.0015163269912291798 www.opsi.gov.uk
0.0009642175795585692 www.ico.gov.uk
0.0008953203325559037 www.dti.gov.uk
0.0008925273921990404 www.defra.gov.uk
0.0007204877150489438 www.direct.gov.uk
0.0006978022269123422 www.dfes.gov.uk
0.0006801444750391456 www.fsa.gov.uk
0.0006552851670602633 www.communities.gov.uk
0.0005910116655537075 www.dh.gov.uk
0.0005818574145054327 www.hmso.gov.uk
0.00057537938842279 www.hse.gov.uk
0.000539037997378092 www.fco.gov.uk
0.0004833514113988187 www.homeoffice.gov.uk
0.00045716426001675783 www.dft.gov.uk
0.0004445341940639843 www.dataprotection.gov.uk
0.0004373521638385658 www.dwp.gov.uk
0.000419431970034124 www.legislation.hmso.gov.uk
0.0003956975759919554 www.informationcommissioner.gov.uk
0.00037178722668428164 www.statistics.gov.uk
0.00037000563458066744 www.hm-treasury.gov.uk
0.0003391405216973119 www.tfl.gov.uk


In [14]:
#It computes the gain of score 
gain = []

pageRankVec = pageRank(iterations)
noSpamPageRankVec = noSpamPageRank(iterations)

for i in range(len(pageRankVec)):
    gain.append(pageRankVec[i]/noSpamPageRankVec[i]) 

gain_hosts = sorted(enumerate(gain), key=lambda x: x[1], reverse=True)
topGainHosts = gain_hosts[:20]
for host in topGainHosts:
    print(str(host[1]) + " " + id2name[host[0]])

33.14277368425596 www.escortnet.co.uk
29.058929855290366 www.missionfish.org.uk
17.898491711374145 www.statistics.006.free-counter.co.uk
13.6422600947175 www.uk-shoponline.co.uk
10.800429380479825 www.shop.co.uk
10.417142543549902 www.geordie-girls.co.uk
10.353438590154301 www.into.demon.co.uk
10.069001846752684 www.computerarts.co.uk
9.320732629825939 www.aili.co.uk
8.869703491243296 connect4fun.co.uk
8.452918216407495 www.kompass.co.uk
8.003510769421466 www.mercurywd.co.uk
7.8800664219899685 www.theshopping-centre.co.uk
7.824324020347563 www.markwarner.co.uk
7.7418348004899045 www.suppliersnearby.co.uk
7.67197459115261 www.quality-site-finder.co.uk
7.531583765512691 www.hertfordshiremobilediscos.co.uk
6.78796705669567 www.eastwoodtoday.co.uk
6.666666666666665 www.jlc.me.uk
5.9640753129903645 www.ideas21.co.uk


In [15]:
def myPageRank(it):
    """
    It executes the page rank algorithm 
    :it: number of iterations (int)
    :return: an array with the page rank score of each node. 
    """
    
    pageRankLen = len(id2name)
    pageRankVec = [1/pageRankLen]*pageRankLen
    pageRankAux = [0]*pageRankLen
    for i in range(it):
        with gzip.open(HOSTGRAPH, "rt", encoding="utf-8") as input_file:
            reader = csv.reader(input_file, delimiter=' ', quotechar='"')
            row1 = next(reader)  # gets the first line
            #index is the position in the pageRankVec where we store the score of the source node 
            index = 0
            for src in reader:
                for entry in src:
                    splitted = entry.split(":")
                    dest = splitted[0]
                    #weight = splitted[1]
                    pageRankAux[int(dest)] += pageRankVec[index]/np.sqrt((len(src)))
                    
                index += 1
                
            for j in range(len(pageRankVec)):
                pageRankVec[j] = ALPHA*pageRankAux[j] + (1-ALPHA)/pageRankLen
                pageRankAux[j] = 0
            
            suma = sum(pageRankVec)
            for k in range(len(pageRankVec)):
                pageRankVec[k] = pageRankVec[k]/suma
                

    return pageRankVec

In [16]:
#Top 20 places myPageRank

#Execute PageRank
pageRankVec = myPageRank(iterations)
hosts_by_pagerank = sorted(enumerate(pageRankVec), key=lambda x: x[1], reverse=True)
topHosts = hosts_by_pagerank[:20]
print("##########top 20 Sites Page Rank##########")
for host in topHosts:
    print(str(host[1]) + " " + id2name[host[0]])

##########top 20 Sites Page Rank##########
0.0044283353851542135 www.dataprotection.gov.uk
0.003529314947193229 www.libdems.org.uk
0.0035153398624564874 www.prai.co.uk
0.003508030404579182 islington-libdems.org.uk
0.0035023627094074306 warwick-leamington-libdems.org.uk
0.0035023021019102616 libdems4london.org.uk
0.0035021764694591674 montlibdems.org.uk
0.0035021670586702232 chichesterlibdems.org.uk
0.0035021519500236936 surreyheathlibdems.org.uk
0.003502150155631655 bobrussell.org.uk
0.00350205251233454 stevegoddard.org.uk
0.0035020273066072526 emilygasson.org.uk
0.0035020273066072526 jameskeeley.org.uk
0.0035002919381198015 bracknell-libdems.org.uk
0.0035002871135787527 darren4streatham.org.uk
0.0035002871135787527 friendsofstoneymiddletonschool.org.uk
0.0035002871135787527 garylawson.org.uk
0.0035002871135787527 jamesquinlanforparliament.org.uk
0.0035002871135787527 liberty-network.org.uk
0.0035002871135787527 lizleffman.org.uk
