## List all files in the wiki folder

In [43]:
import os

# Collect the name of every entry in the "wiki" folder. os.listdir() is
# documented to return entries in arbitrary order, so positions such as
# file_names[0] or file_names[2] may differ between machines.
file_names = os.listdir("wiki")
# Not the last expression of the cell, so this value is computed but not echoed.
len(file_names)

# Peek at a single entry to see what the names look like.
file_names[2]

'Valentin_Yanin.html'

## Read the first file

In [44]:
# The page declares charset UTF-8, so decode it explicitly rather than relying
# on the platform's default encoding (which is not UTF-8 everywhere).
with open(os.path.join("wiki", file_names[0]), encoding="utf-8") as f:
    print(f.read())

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Bay of Concepción - Wikipedia</title>
<script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );</script>
<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Bay_of_Concepción","wgTitle":"Bay of Concepción","wgCurRevisionId":647460156,"wgRevisionId":647460156,"wgArticleId":16044270,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Coordinates on Wikidata","All stub articles","Landforms of Bío Bío Region","Bays of Chile","Bío Bío Region geography stubs"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNa

## Adding the MapReduce function to this project

In [45]:
import math
import functools
from multiprocessing import Pool

def make_chunks(data, num_chunks):
    """Split ``data`` into at most ``num_chunks`` contiguous slices.

    Args:
        data: A sliceable sequence (anything supporting ``len`` and slicing).
        num_chunks: Desired number of chunks; expected to be a positive integer.

    Returns:
        A list of slices of ``data``. Every slice has ``ceil(len(data) /
        num_chunks)`` items except possibly the last, which holds the
        remainder. An empty ``data`` yields an empty list.
    """
    if len(data) == 0:
        # ceil(0 / n) is 0, which would be an invalid (zero) step for range().
        return []
    chunk_size = math.ceil(len(data) / num_chunks)
    return [data[start:start + chunk_size] for start in range(0, len(data), chunk_size)]

def map_reduce(data, num_processes, mapper, reducer):
    """Run ``mapper`` over chunks of ``data`` in parallel and fold the results.

    ``data`` is split with ``make_chunks`` into ``num_processes`` chunks, each
    chunk is handed to ``mapper`` in a pool of ``num_processes`` worker
    processes, and the per-chunk results are combined pairwise with
    ``reducer`` via ``functools.reduce``.

    Args:
        data: The sequence to process. It is expected to be non-empty, since
            the reduction is performed without an initial value.
        num_processes: Number of worker processes (also the number of chunks).
        mapper: Callable taking one chunk and returning a partial result.
        reducer: Callable taking two partial results and returning their
            combination.

    Returns:
        The single value obtained by reducing all partial results.
    """
    chunks = make_chunks(data, num_processes)
    pool = Pool(num_processes)
    try:
        chunk_results = pool.map(mapper, chunks)
    finally:
        # Always release the worker processes, even when a mapper raises;
        # otherwise the pool would be leaked for the lifetime of the kernel.
        pool.close()
        pool.join()
    return functools.reduce(reducer, chunk_results)

## Counting the total number of lines on all files

In [46]:
def map_line_count(file_names):
    """Count the lines of every file in one chunk of the "wiki" folder.

    Args:
        file_names: Iterable of file names relative to the "wiki" folder.

    Returns:
        The total number of lines across all the given files (0 for an
        empty chunk).
    """
    total_lines = 0
    for fn in file_names:
        # Decode explicitly as UTF-8 so the count does not depend on the
        # platform's default encoding.
        with open(os.path.join("wiki", fn), encoding="utf-8") as f:
            # Stream the file line by line instead of building a full list
            # of lines only to take its length.
            total_lines += sum(1 for _ in f)
    return total_lines

def reduce_line_count(count1, count2):
    """Combine the line totals of two chunks into a single total."""
    combined = count1 + count2
    return combined

# Total number of lines across the listed files, computed with 8 worker processes.
map_reduce(
    data=file_names,
    num_processes=8,
    mapper=map_line_count,
    reducer=reduce_line_count,
)

499797

## Grep string function

We define a `mapreduce_grep()` function that takes two arguments as input:

A path to a folder. In the case of this guided project we will only use it on the wiki folder but having this argument makes the function easier to reuse.

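The number of worker processes to use. The string that we want to find is not passed as an argument; it is read from the `target` variable (see the note below).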

The mapper function receives a chunk of file names and, for each file, records the (0-based) index of every line that contains the target string. If a file contains no occurrences, we chose not to include an entry for that file in the result dictionary.

The reducer function uses the dict.update() method to merge the result dictionaries.

Note that the target variable will be defined outside and will be the string we are looking for.


In [49]:
# The module-level variable `target` is defined outside this function and
# holds the string to search for.
def map_grep(file_names):
    """Find the lines containing ``target`` in each file of one chunk.

    Args:
        file_names: Iterable of file paths to search.

    Returns:
        A dict mapping each path that has at least one match to the list of
        0-based indexes of its matching lines, in ascending order. Files
        without any match get no entry.
    """
    results = {}
    for fn in file_names:
        # Decode explicitly as UTF-8 so matching does not depend on the
        # platform's default encoding, and stream the file rather than
        # copying every line into a list first.
        with open(fn, encoding="utf-8") as f:
            matches = [line_index for line_index, line in enumerate(f) if target in line]
        if matches:
            results[fn] = matches
    return results

def reduce_grep(lines1, lines2):
    """Merge two per-file match dictionaries into a new dictionary.

    Args:
        lines1: Dict mapping file path to a list of matching line indexes.
        lines2: Another dict of the same form.

    Returns:
        A new dict holding the entries of both inputs. When a key appears in
        both, the value from ``lines2`` wins (``dict.update`` semantics).
        Neither input dict is modified.
    """
    # Copy first so the caller's dictionary is not mutated as a side effect.
    merged = dict(lines1)
    merged.update(lines2)
    return merged

def mapreduce_grep(path, num_processes):
    """Search every regular file directly inside ``path`` for ``target``.

    The search string is read by the mapper from the module-level ``target``
    variable, which must be set before calling this function.

    Args:
        path: Folder whose files are searched (not recursive).
        num_processes: Number of worker processes to use.

    Returns:
        A dict mapping each file path with at least one match to the list of
        0-based indexes of its matching lines (case-sensitive match).
    """
    candidates = (os.path.join(path, fn) for fn in os.listdir(path))
    # The mapper open()s every path it receives, so drop sub-directories and
    # other non-file entries instead of letting them crash a worker.
    file_names = [candidate for candidate in candidates if os.path.isfile(candidate)]
    return map_reduce(file_names, num_processes, map_grep, reduce_grep)

## Finding the occurrences of "data"

In [51]:
# map_grep reads the search string from this module-level name, so it must be
# set before the search is launched.
target = "data"
data_occurrences = mapreduce_grep(path="wiki", num_processes=8)

{'wiki/Bay_of_ConcepciC3B3n.html': [6, 45, 58, 60, 62, 105, 188, 205],
 'wiki/Bye_My_Boy.html': [276, 359, 376],
 'wiki/Valentin_Yanin.html': [101, 144, 227, 244],
 'wiki/Kings_XI_Punjab_in_2014.html': [221,
  229,
  237,
  245,
  253,
  269,
  277,
  293,
  301,
  317,
  325,
  341,
  374,
  376,
  381,
  383,
  388,
  390,
  395,
  397,
  402,
  564,
  647,
  664],
 'wiki/William_Harvey_Lillard.html': [45, 65, 81, 129, 212, 229],
 'wiki/Radial_Road_3.html': [52, 103, 301, 505, 588, 605],
 'wiki/George_Weldrick.html': [194, 277, 294],
 'wiki/Zgornji_Otok.html': [6, 53, 55, 65, 69, 211, 260, 262, 311, 394, 411],
 'wiki/Blue_Heelers_(season_8).html': [49,
  79,
  82,
  105,
  107,
  125,
  127,
  133,
  135,
  141,
  143,
  660,
  695,
  730,
  739,
  886,
  969,
  986],
 'wiki/Taggen_Nunatak.html': [6, 44, 46, 48, 93, 176, 193],
 'wiki/Henri_BraqueniC3A9.html': [43, 46, 92, 175, 192],
 'wiki/Vrila.html': [6, 57, 59, 69, 73, 99, 100, 102, 151, 234, 251],
 'wiki/William_Henry_Porter.html

## Allow for case insensitive matches

We can allow case insensitive matches by converting both the target and the file contents to lowercase before we match.

In [1]:
def map_grep_insensitive(file_names):
    """Find the lines containing ``target``, ignoring case, in one chunk.

    Both the module-level ``target`` string and each line are lowercased
    before comparison.

    Args:
        file_names: Iterable of file paths to search.

    Returns:
        A dict mapping each path that has at least one match to the list of
        0-based indexes of its matching lines, in ascending order. Files
        without any match get no entry.
    """
    # Lowercase the search string once instead of once per line.
    needle = target.lower()
    results = {}
    for fn in file_names:
        # Decode explicitly as UTF-8 so matching does not depend on the
        # platform's default encoding, and stream the file rather than
        # holding a lowercased copy of every line in memory.
        with open(fn, encoding="utf-8") as f:
            matches = [line_index for line_index, line in enumerate(f) if needle in line.lower()]
        if matches:
            results[fn] = matches
    return results

def mapreduce_grep_insensitive(path, num_processes):
    """Case-insensitively search every regular file directly inside ``path``.

    The search string is read by the mapper from the module-level ``target``
    variable, which must be set before calling this function.

    Args:
        path: Folder whose files are searched (not recursive).
        num_processes: Number of worker processes to use.

    Returns:
        A dict mapping each file path with at least one match to the list of
        0-based indexes of its matching lines.
    """
    candidates = (os.path.join(path, fn) for fn in os.listdir(path))
    # The mapper open()s every path it receives, so drop sub-directories and
    # other non-file entries instead of letting them crash a worker.
    file_names = [candidate for candidate in candidates if os.path.isfile(candidate)]
    return map_reduce(file_names, num_processes, map_grep_insensitive, reduce_grep)

# NOTE: this cell depends on names defined in earlier cells (the `os` import,
# map_reduce and reduce_grep); run the notebook from the top in the same
# kernel before executing it.
target = "data"
new_data_occurrences = mapreduce_grep_insensitive(path="wiki", num_processes=8)

NameError: name 'os' is not defined

## Checking that we find more matches

We already stored the results into variables data_occurrences and new_data_occurrences. To check that we find more matches with the second version of the algorithm, we can loop over the file names and print the length difference between the results.

In [None]:
# Report, per file, how many additional matching lines the case-insensitive
# search found compared with the case-sensitive one.
for fn, new_hits in new_data_occurrences.items():
    old_hits = data_occurrences.get(fn)
    if old_hits is None:
        # The file had no case-sensitive match at all: every hit is new.
        gained = len(new_hits)
    elif len(new_hits) > len(old_hits):
        gained = len(new_hits) - len(old_hits)
    else:
        continue
    print("Found {} new matches on file {}".format(gained, fn))