In [1]:
from collections import Counter
from pathlib import Path
import operator
import heapq
import networkx as nx

Finds the number of times the "searched" word appears in the article

In [29]:
def read_text_file(filepath,word):
    count = 0
    with open(filepath) as f:
        a = f.readlines()
        for line in a:
            words = line.rstrip().lower().split()
            for i in words:
                if i == word:
                    count = count+1
        return count
    

In [31]:
def run_through_files(word):    
    pathlist = Path('plaintext_articles')
    Count_doc = dict()
    num_doc = dict()
    i = 0
    for path in pathlist.iterdir():
        path_in_str = str(path)
        count = read_text_file(path_in_str,word)
        Count_doc[path_in_str] = count
        num_doc[path_in_str] = i
        i+= 1
    max_100_words = heapq.nlargest(100, Count_doc, key=Count_doc.get)
    max_word = heapq.nlargest(10, Count_doc, key=Count_doc.get)
    return max_word, max_100_words, Count_doc

In [48]:
def run_through_files(list_word):    
    pathlist = Path('plaintext_articles')
    Count_doc = dict()
    num_doc = dict()
    i = 0
    for path in pathlist.iterdir():
        path_in_str = str(path)
        countlist = []
        for word in list_word:
            count = read_text_file(path_in_str,word)
            countlist.append(count)
        Count_doc[path_in_str] = countlist
        num_doc[path_in_str] = i
        i+= 1
    max_100_words = heapq.nlargest(100, Count_doc, key=Count_doc.get)
    max_word = heapq.nlargest(10, Count_doc, key=Count_doc.get)
    return max_word, max_100_words, Count_doc

Finds the page ranking for each article

In [36]:
import numpy as np

def make_p_matrix(matrix):
    row_sums = matrix.sum(axis=1)
    return matrix / row_sums[:]

def fix_dangling_node(H):
    # Finds which rows have all elements zeros
    # d is a column vector that identifies dangling nodes
    d = ~(H.any(axis=1))
    # w is a uniform row vector
    w = np.full(H.shape[1], 1/H.shape[1])
    S = H + d*w
    return S

def form_google_matrix(S, v=None, alpha=0.85):
    if v is None:
         v = np.full(S.shape[1], 1/S.shape[1])
    one_vector = np.ones(S.shape[0])
    return alpha*S + (1-alpha)*one_vector*v


def power_method2(H, v=None, alpha=0.85):
    if v is None:
         v = np.full(H.shape[1], 1/H.shape[1])
    d = ~(H.any(axis=1))
    w = np.full(H.shape[1], 1/H.shape[1])
    # start with pi = v
    pi = np.zeros(H.shape[0])
    pi_new = v
    cnt = 0
    print(H.shape)
    while not np.allclose(pi_new, pi):
        pi = pi_new
        pi_new = alpha * pi * H + alpha * (pi*d) * w + (1-alpha) * v
        cnt += 1
    #print("Power method2 went through {} iteration".format(cnt))
    return pi_new


def power_method1(H, v=None, alpha=0.85):
    S = fix_dangling_node(H)
    G = form_google_matrix(S, v=None, alpha=alpha)

    pi_new = np.full(G.shape[1], 1/G.shape[1])
    pi = np.zeros(G.shape[0])
    cnt = 0
    while not np.allclose(pi_new, pi):
        pi = pi_new
        pi_new = pi * G
        cnt += 1
    #print("Power method1 went through {} iteration".format(cnt))
    return pi_new



In [37]:
import matplotlib.pyplot as plt

def find_page_rank(g):
    def make_p_matrix(matrix):
        row_sums = matrix.sum(axis=1)
        return matrix / row_sums[:]
    
    def power_method2(H, v=None, alpha=0.85):
        if v is None:
             v = np.full(H.shape[1], 1/H.shape[1])
        d = ~(H.any(axis=1))
        w = np.full(H.shape[1], 1/H.shape[1])
        # start with pi = v
        pi = np.zeros(H.shape[0])
        pi_new = v
        cnt = 0
        while not np.allclose(pi_new, pi):
            pi = pi_new
            pi_new = alpha * pi * H + alpha * (pi*d) * w + (1-alpha) * v
            cnt += 1
        #print("Power method2 went through {} iteration".format(cnt))
        return pi_new
    x = []
    n = len(g)
    m = make_p_matrix(np.matrix(nx.convert_matrix.to_numpy_matrix(g), dtype=np.float64))
    c = power_method2(m)
    for i in range(n):
        x.append(c.item(i))  # PageRank Probability
    return x

In [38]:
def make_graph_from_dataset(filepath):
    with open(filepath) as f:
        a = f.readlines()
        edges = []
        for line in a:
            if line.startswith('#') or line.startswith('"#'):
                continue
            else:
                a,b = line.strip().split()
                edges.append((a,b))
    G = nx.Graph()
    G.add_edges_from(edges)
    return G

In [42]:
def compare(c_rank,max_100_words, Count_doc):
    page_rank = dict()
    for word in max_100_words:
        num = Count_doc.get(word)
        rank = c_rank[num]
        page_rank[word] = rank
    page_rank_10 = heapq.nlargest(10, page_rank, key=page_rank.get)
    return page_rank_10

Finds the page rank of the 100 articles with the highest frequency of the word and outputs the ten articles with the highest page rank.

In [49]:
g = make_graph_from_dataset('links.tsv')
c_rank = find_page_rank(g)
max_word, max_100_words, Count_doc = run_through_files(['obsure','town'])


In [46]:
page_rank_10 = compare(c_rank,max_100_words,Count_doc)
print(max_word)
print(page_rank_10)

['plaintext_articles/Templon.txt', 'plaintext_articles/Automatic_number_plate_recognition.txt', 'plaintext_articles/Voynich_manuscript.txt', 'plaintext_articles/Baghdad.txt', 'plaintext_articles/Sociocultural_evolution.txt', 'plaintext_articles/Homestar_Runner.txt', 'plaintext_articles/Michelangelo.txt', 'plaintext_articles/Floppy_disk.txt', 'plaintext_articles/Picts.txt', 'plaintext_articles/Mary_Seacole.txt']
['plaintext_articles/Homestar_Runner.txt', 'plaintext_articles/Genealogy.txt', 'plaintext_articles/Floppy_disk.txt', 'plaintext_articles/Baghdad.txt', 'plaintext_articles/Sociocultural_evolution.txt', 'plaintext_articles/Picts.txt', 'plaintext_articles/Mary_Seacole.txt', 'plaintext_articles/King_Arthur.txt', 'plaintext_articles/Italo_disco.txt', 'plaintext_articles/Michelangelo.txt']
