# Python tutorials

## 1

### a 

In [1]:
# the first five powers of two
L = [2 ** exponent for exponent in range(5)]

In [2]:
print(type(L))

<class 'list'>


In [3]:
# square every element of L, using map instead of a comprehension
M = list(map(lambda number: number ** 2, L))

In [4]:
print(M)

[1, 4, 16, 64, 256]


### b

In [18]:
def evenOut(L):
    """Return a new list with only the even elements of L, in their original order."""
    return list(filter(lambda value: value % 2 == 0, L))

In [20]:
N = evenOut(L)
N

[2, 4, 8, 16]

### c

In [84]:
file = open("SD201-TP1-list.txt", "r")

In [85]:
# parse every whitespace-separated token of the file as an integer
L = [int(i) for i in file.read().split()]
file.close()  # the content has been read completely; release the handle

In [86]:
L

[1, 3, 9, 18, 27]

In [87]:
L = evenOut(L)

In [88]:
print(L)

[18]


## 2 Implementing PageRank in Python

### 1 PageRank Algorithm

In [23]:
def module(u, v):
    """Return the L1 distance between u and v.

    The result is the sum of |u[k] - v[k]| over all positions. When the two
    sequences do not have the same length, 0 is returned.
    """
    if len(u) != len(v):
        return 0
    total = 0
    for left, right in zip(u, v):
        total += abs(left - right)
    return total

In [117]:
def pageRank(file, beta, eps, maxIter=100):
    """Compute PageRank scores by power iteration over an edge-list file.

    Args:
        file: Open text file (or any object with ``read()``) holding one edge
            per line, written as ``"src dest"``. Page ids are expected to be
            1-based integers; empty lines are skipped.
        beta: Probability of following an out-link; ``1 - beta`` is spread
            evenly over all pages as the random-jump term.
        eps: Iteration stops once the L1 distance between two successive
            rank vectors is below this value.
        maxIter: Upper bound on the number of iterations (100 by default).

    Returns:
        A list where position ``k`` holds the rank of page ``k + 1``. An input
        without any edge yields an empty list.

    Raises:
        ValueError: If a non-empty line does not consist of exactly two
            integers.
    """
    edges = []
    outDegree = {}
    n = 0
    for line in file.read().split('\n'):
        if line == "":
            continue
        src, dest = [int(token) for token in line.split()]
        edges.append((src, dest))
        outDegree[src] = outDegree.get(src, 0) + 1
        # Pages that only appear as a destination (dead ends) need a slot in
        # the rank vector too, so size it by the largest id seen rather than
        # by the number of pages that have out-links.
        n = max(n, src, dest)

    if n == 0:
        return []

    ranks = [1 / n] * n
    bias = [(1 - beta) / n] * n

    # The transition matrix is never materialised: its entry for (dest, src)
    # is 1 / outDegree[src] exactly when src links to dest, so a single pass
    # over the edge list performs the matrix-vector product.
    t = 0
    tooSmall = False
    while t < maxIter and not tooSmall:
        t += 1
        ranksNext = [0] * n
        for src, dest in edges:
            ranksNext[dest - 1] += beta * (1 / outDegree[src]) * ranks[src - 1]
        ranksNext = [x + y for x, y in zip(ranksNext, bias)]

        # L1 distance between the previous and the new rank vector.
        change = 0
        for old, new in zip(ranks, ranksNext):
            change += abs(old - new)
        tooSmall = change < eps
        ranks = ranksNext
    return ranks

In [118]:
# solve the first question
# the with-block closes the graph file even if pageRank raises
with open("SD201-TP1-graph.txt", "r") as file:
    print(pageRank(file, 1, 0.1)) #usage: pageRank(graph, beta, epsilon)

[0.2708333333333333, 0.13541666666666666, 0.15625, 0.14583333333333331, 0.13541666666666666, 0.15625]


##### Since the initial graph has neither dead ends nor spider traps, the PageRank algorithm gives a sensible result. Page 1 is clearly the most probable page to be on, which is logical since it is the only page of the example graph with two referring pages (two pages can take us to page 1); every other page has only one referrer, so their probabilities are close to one another.

##### We can notice that for beta = 1 (Random Surfer v1), the vector of probabilities of being at each page tends to [2/7, 1/7, 1/7, 1/7, 1/7, 1/7]. This result is expected given the number of pages that link to each page.
##### Also, for beta != 1 (beta = 0.8), where we adopt Random Surfer v2 (with the ability to jump), the probabilities shift slightly: the probability of being at page 1 decreases and the probability of being at the other pages increases (the jump makes the other pages more likely to be landed on). We can also notice that being on pages 2 and 5 is less probable than being on pages 3, 4 and 6 (and, of course, 1): this is because once we land on 2 or 5, we are obliged to pass through 3->4 or 6, respectively. Hence 4 is the "most probable page to land on" among the "pages that are referred to by only one page".



### 2 Extract the graph

In [144]:
import re
def makeGraphText(pages, textName, pagesDir="./pages"):
    """Write the link graph between local HTML pages as an edge list.

    Every page is numbered by its 1-based position in ``pages``. For each
    ``a href="..."`` target found in a page that is itself listed in
    ``pages``, one line ``"<from> <to>"`` is written to ``textName``.
    Repeated links inside a page are written once; targets that are not in
    ``pages`` (for example external URLs) are skipped.

    Args:
        pages: File names of the HTML pages, relative to ``pagesDir``.
        textName: Path of the edge-list file to create or overwrite.
        pagesDir: Directory containing the HTML files.
    """
    import os

    hrefPattern = re.compile(r'a href="(\S+)"')

    # Name -> 1-based id, built once so each lookup is O(1). setdefault keeps
    # the first position when a name is listed twice, as list.index would.
    pageId = {}
    for position, pageName in enumerate(pages, start=1):
        pageId.setdefault(pageName, position)

    with open(textName, 'w') as file:
        for pageName in pages:
            pagePath = os.path.join(pagesDir, pageName)
            with open(pagePath, "r", encoding="utf-8") as pageFrom:
                targets = set(hrefPattern.findall(pageFrom.read()))
            # Sorted so the output does not depend on set iteration order.
            targetIds = sorted(pageId[target] for target in targets if target in pageId)
            for toId in targetIds:
                file.write(f"{pageId[pageName]} {toId}\n")

In [163]:
import os
pages = [page for page in os.listdir('./pages') if page.endswith(".html")]
makeGraphText(pages, "./SD201-TP1-graph2.txt")

# the with-block closes the graph file even if pageRank raises
with open("SD201-TP1-graph2.txt", "r") as file:
    ranks = pageRank(file, 0.8, 0.1)

# sanity check on the rank vector; named `total` so the builtin sum() stays usable
total = sum(ranks)
# express the ranks as percentages; building a new list avoids mutating
# `ranks` while iterating over it and the value-based index() lookup
ranks = [rank * 100 for rank in ranks]
print(total)
print(ranks)


0.9999999999999999
[3.468762160126722, 0.7553825984974729, 1.1159466427404956, 1.4737744082024864, 2.8401018451271565, 2.160238209599977, 1.5220042118910495, 3.020196334863824, 2.158517753456203, 2.1593925862995245, 1.6247026495554966, 2.11106238408277, 2.3182890002136065, 1.1464916919309467, 3.7095453922430965, 1.2628982190975955, 0.9933820960785902, 1.9762258090694436, 1.322831867027545, 1.4139460232102241, 2.2317391006596616, 1.6384761558304732, 2.154696380518429, 2.1570155434260285, 1.7678972478243244, 1.1158122935073618, 2.295612721013476, 3.1283557718662744, 0.7229082275295009, 0.4940535360078805, 1.3459147543523582, 2.602507682009239, 1.493147830384201, 1.378184354529037, 1.5775825633947953, 1.0216293333272346, 0.9015437994524365, 2.6699232611395245, 1.6655162060149387, 3.3557322870136783, 1.5608302974101413, 1.0796661632778533, 4.27892818444078, 0.710866846074481, 2.4867019697837365, 1.7260127576164992, 0.4381419357643133, 1.477697396531313, 1.380821986356225, 2.034105624280337

##### 

In [166]:
# position of the highest rank; on ties the first such page is kept.
# Uses the builtin max() instead of shadowing it, and defines maxi even when
# the first page is the best one.
maxi = max(range(len(ranks)), key=ranks.__getitem__)
maxRank = ranks[maxi]
pages[maxi]

'Programming_language.html'

### 3 Remove Dead-Ends

In [None]:
def noDeadEnds(file):
    """Parse an edge-list file into edges and per-page out-link counts.

    Each non-empty line of ``file`` must hold exactly two integers,
    ``src dest``. The function currently only builds these structures locally
    and returns nothing.
    TODO: the dead-end removal itself is not implemented yet.
    """
    edges = []
    outDegree = {}
    for raw in file.read().split('\n'):
        if not raw:
            continue
        edge = [int(token) for token in raw.split()]
        edges.append(edge)
        # unpacking enforces that the line held exactly two numbers
        src, _dest = edge
        outDegree[src] = outDegree.get(src, 0) + 1

# TESTS *__ignore__*

In [135]:
import re
import os
pages = [page for page in os.listdir('./pages') if page.endswith(".html")]
#makeGraphText(pages)
# read the page inside a with-block so the handle is closed afterwards;
# the pattern is a raw string because it contains the \S escape
with open("./pages/Assembly_language.html", "r", encoding="utf-8") as page:
    links = re.findall(r'a href="(\S+)"', page.read())
print((set(links)))

{'C_(programming_language).html', 'Porting.html', 'Imperative_programming.html', 'Programming_language.html', 'COBOL.html', 'Binary_file.html', 'Subroutine.html', 'Low-level_programming_language.html', 'Python_(programming_language).html', 'Logic_programming.html', 'Virtual_machine.html', 'Comparison_of_programming_languages.html', 'Software_portability.html', 'Dynamic_programming_language.html', 'Lisp_(programming_language).html', 'List_of_programming_languages.html', 'Unix.html', 'Assembly_language.html', 'Java_(programming_language).html', 'Instruction_set.html', 'Computer.html', 'Fortran.html', 'Source_code.html', 'JavaScript.html', 'Object-oriented_programming.html', 'Computer_science.html', 'Compiler.html', 'C++.html', 'High-level_programming_language.html', 'Operating_system.html', 'Object_(computer_science).html', 'Scripting_language.html'}


In [128]:
# scratch check of list.index and of printing a list
a = list(range(1, 4))
a.index(3)
print(str(a))

[1, 2, 3]


In [129]:
# scratch check of dict membership and of element-wise addition with zip;
# the mapping is not called `dict` so the builtin type stays usable
greetings = {1: "hi"}
1 in greetings
a = list(range(12))
b = [0.2] * 12
print([x + y for x, y in zip(b, a)])

[0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2, 9.2, 10.2, 11.2]


In [130]:
# scratch check: doubling one entry leaves the other entries untouched
a = [0.2 for _ in range(10)]
a[0] *= 2
a[0]

0.4

In [131]:
# scratch check of line-by-line edge parsing; the with-block closes the file
with open("SD201-TP1-graph.txt", "r") as f:
    for line in f:
        u,v=[int(x) for x in line.split()]


In [132]:
# unused
def multiply(A, v):
    """Scratch helper, presumably meant as a matrix-vector product of A and v.

    NOTE(review): as written the loop cannot complete for a non-empty v; see
    the comments in the body.
    """
    n = len(v)
    res = []
    for i in range(n):
        # res is still empty here, so assigning to res[i] raises IndexError on
        # the first iteration; res.append(...) would be needed instead.
        # The right-hand side is the list of element-wise products
        # A[i][j] * v[j], not their sum.
        res[i] = [(A[i][j] * v[j]) for j in range(n)]