In [3]:
import argparse
import csv
import math
import os
import pathlib
import re
import statistics
import subprocess

import tqdm

In [4]:
# Tab-separated snapshot file; the first two columns of each row are read
# below as (integer page id, title).
snapshot_file = '../../data/engineroom/2018-03-01/enwiki.wikigraph.snapshot.2018-03-01.csv'
# Space-delimited graph file; each row is parsed below into a pair of integer ids.
graph_file = '../../data/engineroom/2018-03-01/enwiki.wikigraph.pagerank.2018-03-01.csv'
# Tab-separated table: three integer columns followed by a file name in the
# fourth column.
file = './count_individual_loops.dat'

In [5]:
# File names look like 'enwiki.looprank.<title>.4.2018-03-01.txt'; group 1
# captures <title>. The dots are escaped so that they only match a literal
# '.' (an unescaped '.' matches any character and would accept unrelated names).
PATTERN_NAME = r'enwiki\.looprank\.(.*)\.4\.2018-03-01\.txt'
regex_name = re.compile(PATTERN_NAME)

In [6]:
# Parse the loop-count table into a list of single-entry dicts of the form
# {title: (count_a, count_b, count_c)}, where the title is extracted from the
# file name stored in the fourth column.
lines = []
# Explicit UTF-8 because titles contain non-ASCII characters (and the other
# data files in this notebook are read as UTF-8); newline='' is what the csv
# module expects for files it parses.
with open(file, 'r', encoding='utf-8', newline='') as infile:
    reader = csv.reader(infile, delimiter='\t')

    for line in reader:
        # Blank or truncated rows have no file-name column to match against.
        if len(line) < 4:
            continue

        match_name = regex_name.match(line[3])
        if not match_name:
            # Rows whose file name does not follow the expected pattern are skipped.
            continue

        name = match_name.group(1)
        lines.append({name: (int(line[0]), int(line[1]), int(line[2]))})

In [7]:
# Peek at the first ten parsed records.
lines[:10]

[{'100_Acres:_The_Virginia_B._Fairbanks_Art_and_Nature_Park': (7, 14, 251)},
 {'100_Crore_Club': (9, 143, 3439)},
 {'100th_Army_Band': (1, 3, 59)},
 {'10th_Parachute_Division_(France)': (15, 166, 3348)},
 {'116th_Street_(Manhattan)': (10, 83, 2521)},
 {'11th_IIFA_Awards': (8, 99, 3305)},
 {'11α-Hydroxyprogesterone': (5, 7, 138)},
 {"130–136_Piccott's_End": (2, 1, 10)},
 {'14th_Bangladesh_National_Film_Awards': (2, 9, 80)},
 {'1526_in_poetry': (3, 9, 56)}]

In [8]:
# Processing non-UTF-8 Posix filenames using Python pathlib?
# https://stackoverflow.com/a/45724695/2377454
def safe_path(path: pathlib.Path) -> pathlib.Path:
    """
    Round-trip a path through UTF-8 bytes and the filesystem decoder.

    :param path: a ``pathlib.Path`` or anything ``pathlib.Path`` can wrap
    :return: a new ``pathlib.Path`` built from the decoded POSIX form
    """
    # pathlib.Path(...) leaves an existing Path equivalent, so a single
    # conversion serves both Path and non-Path arguments.
    posix_text = pathlib.Path(path).as_posix()
    utf8_bytes = posix_text.encode('utf-8')
    return pathlib.Path(os.fsdecode(utf8_bytes))


In [9]:
# How to get line count cheaply in Python?
# https://stackoverflow.com/a/45334571/2377454
def count_file_lines(file_path: pathlib.Path) -> int:
    """
    Return the line count reported by the external ``wc -l`` utility.

    :param file_path: path to the file to measure
    :return: int, the leading number printed by ``wc -l``
    """
    target = safe_path(file_path).as_posix()
    raw_output = subprocess.check_output(['wc', '-l', target])
    # After stripping, the count is the first space-separated field.
    fields = raw_output.decode('utf-8').strip().split(' ')
    return int(fields[0])

In [10]:
# Read the snapshot file and build both lookup directions:
# page id -> title and title -> page id.
snaplen = count_file_lines(snapshot_file)
snap_id2title = {}
snap_title2id = {}
with tqdm.tqdm(total=snaplen) as pbar, \
        safe_path(snapshot_file).open('r', encoding='utf-8') as snapfp:
    reader = csv.reader(snapfp, delimiter='\t')
    for row in reader:
        title = row[1]
        page_id = int(row[0])
        snap_id2title[page_id] = title
        snap_title2id[title] = page_id
        pbar.update(1)

100%|██████████| 13749291/13749291 [00:30<00:00, 458128.13it/s]


In [14]:
# Preview the first ten title -> id pairs. zip() against range(10) stops after
# ten keys, so the full key set is never copied into a temporary list.
{key: snap_title2id[key] for _, key in zip(range(10), snap_title2id)}

{'AccessibleComputing': 0,
 'Anarchism': 1,
 'AfghanistanHistory': 2,
 'AfghanistanGeography': 3,
 'AfghanistanPeople': 4,
 'AfghanistanCommunications': 5,
 'AfghanistanTransportations': 6,
 'AfghanistanMilitary': 7,
 'AfghanistanTransnationalIssues': 8,
 'AssistiveTechnology': 9}

In [None]:
# Load the graph file as a list of (source, target) integer pairs, one per row.
graph = []
graphlen = count_file_lines(graph_file)
with tqdm.tqdm(total=graphlen) as pbar:
    with safe_path(graph_file).open('r', encoding='utf-8') as graphfp:
        reader = csv.reader(graphfp, delimiter=' ')
        for row in reader:
            s = int(row[0])
            # The target must come from the second column; reading column 0
            # again would turn every edge into a self-loop (s, s).
            # NOTE(review): assumes rows are "<source> <target>" - confirm
            # against the file format.
            t = int(row[1])
            graph.append((s, t))

            pbar.update(1)

 91%|█████████ | 148000765/163380008 [08:42<01:04, 237536.76it/s] 

In [None]:
def _log_or_sentinel(count):
    """Return the natural log of ``count``, or -1 when ``count`` is not positive."""
    return math.log(count) if count > 0 else -1


# Convert every count triple to its element-wise natural logarithm.
# Each entry of `lines` is a single-item dict {title: (c2, c3, c4)}, so the
# triple has to be taken from the dict's value; indexing the dict with 0/1/2
# raises KeyError.
loglines = []
for line in lines:
    for data2, data3, data4 in line.values():
        loglines.append((_log_or_sentinel(data2),
                         _log_or_sentinel(data3),
                         _log_or_sentinel(data4)))

In [None]:
# Show only the first ten entries, like the other preview cells, instead of
# dumping the whole list into the notebook output.
loglines[:10]

In [None]:
# For every triple whose three log values are all strictly positive, record
#   ratios:  each log value divided by 2*2, 3*3 and 4*4 respectively
#   ratios2: the second and third log values relative to the first
ratios = []
ratios2 = []
for log2, log3, log4 in loglines:
    if not (log2 > 0 and log3 > 0 and log4 > 0):
        continue
    ratios.append((log2/(2.0*2.0), log3/(3.0*3.0), log4/(4.0*4.0)))
    ratios2.append((log3/(log2), log4/(log2)))

In [None]:
# First ten triples of log values divided by 4, 9 and 16.
ratios[:10]

In [None]:
# First ten pairs of log values expressed relative to the first log value.
ratios2[:10]

In [None]:
# Pool every component of every ratio triple into one flat list.
flat_ratios = []
for triple in ratios:
    flat_ratios.extend(triple)

In [None]:
# Show only the first ten values, like the other preview cells, instead of
# dumping the whole list into the notebook output.
flat_ratios[:10]

In [None]:
# Population variance over every value in flat_ratios.
# (`statistics` is imported with the other modules in the top import cell.)
statistics.pvariance(flat_ratios)

In [None]:
# Arithmetic mean over every value in flat_ratios.
statistics.mean(flat_ratios)