<a href="https://colab.research.google.com/github/18708064/big-data/blob/master/postblock2_q4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Question 3.1

In [None]:
# Map function
def map(filename, content):
    # For each character in the content of the file
    for char in content:
        # Emit the character as the key and the value as 1
        EmitIntermediate(char, 1)

# Reduce function
def reduce(char, values):
    # Sum the counts for each character key
    result = sum(values)
    Emit(char, result)


Question 3.5

In [None]:
# Combiner function
def combiner(char, values):
    # Partially sum values before sending to reducer
    partial_sum = sum(values)
    Emit(char, partial_sum)


Question 4

In [None]:
def mapper(hostname, document):
    terms = document.split()
    term_counts = Counter(terms)
    for term, count in term_counts.items():
        if count > 1:
            yield (hostname, (term, count))


def reducer(hostname, term_vectors):
    term_frequency = Counter()

    # Sum up term frequencies
    for term, count in term_vectors:
        term_frequency[term] += count

    # Emit terms that occur at least twice
    filtered_term_vector = {term: count for term, count in term_frequency.items() if count >= 2}

    yield (hostname, filtered_term_vector)


Question 4.2


In [10]:
from itertools import groupby
from operator import itemgetter
import re
import nltk
from collections import Counter  # Import Counter for counting term frequencies

# Download the stopwords from nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:

# Mapper function
def mapper(hostname, document):
    terms = document.split()  # Split the document into words (tokens)
    term_counts = Counter(terms)  # Count the occurrences of each term

    # Emit each term and its count
    for term, count in term_counts.items():
        if count > 1:  # Only emit terms that occur more than once
            yield (hostname, (term, count))

# Reducer function
def reducer(hostname, term_vectors):
    term_frequency = Counter()  # Initialize counter to store term frequencies

    # Aggregate the term frequencies
    for term, count in term_vectors:
        term_frequency[term] += count

    # Filter and emit terms that occur at least twice
    filtered_term_vector = {term: count for term, count in term_frequency.items() if count >= 2}

    yield (hostname, filtered_term_vector)

# Cleaner function to process lines and filter stopwords
def cleaner(line):
    # Lowercase all words, keep only alphabetical characters, and allow apostrophes
    words = re.findall(r'[a-z\']+', line.lower())

    for word in words:
        # Omit apostrophe 's' (assuming users won't type them in a search)
        word = word.replace("'", '')

        # Check if the word is empty or a stopword
        if word != '' and word not in stopwords.words('english'):
            yield word

# Function to sort intermediate results by key
def intermediate_sort(data):
    """
    Collect by key.
    """
    data = sorted(data)
    return [(k, list(tuple(zip(*g))[1])) for k, g in groupby(data, itemgetter(0))]

# Function to run the MapReduce process
def run(sources_dict):
    """
    Simulate the MapReduce process with map and reduce steps.
    :param sources_dict: Dictionary of document IDs and file paths.
    """
    map_result = []
    reduce_result = []

    # Apply the map function to each document
    for k, v in sources_dict.items():
        # Read the document and apply the map function
        with open(v, 'r') as f:
            map_result += list(mapper(k, f.read()))

    # Sort and group the map results by key
    intermediate_result = intermediate_sort(map_result)

    # Apply the reduce function to each key
    for elem in intermediate_result:
        reduce_result.append(list(reducer(elem[0], elem[1])))

    return map_result, intermediate_result, reduce_result


Shell Commands (for file setup):

In [12]:
!mkdir -p input/
!echo 'the cat sat on the mat' > input/d1.txt
!echo 'the dog sat on the log' > input/d2.txt


In [13]:
# Example to run and display results

map_result, intermediate_result, reduce_result = run({'D1': 'input/d1.txt', 'D2': 'input/d2.txt'})

# Print the results
print("Map Result:")
print(map_result)

print("\nIntermediate Result:")
print(intermediate_result)

print("\nReduce Result:")
print(reduce_result)


Map Result:
[('D1', ('the', 2)), ('D2', ('the', 2))]

Intermediate Result:
[('D1', [('the', 2)]), ('D2', [('the', 2)])]

Reduce Result:
[[('D1', {'the': 2})], [('D2', {'the': 2})]]
