## Quiz 4: Hadoop for Fun and Profit 
Conor Meade\
CS 119

In [1]:
import functools
import requests
import re
import string
import sys
from collections import defaultdict

## 1. Functional Programming [25 points]

### (1.1) add(), sub(), and ra_sub()

In [2]:
def add(*num_list):
    """Return the sum of all positional arguments, folded left to right.

    Args:
        *num_list: Values to add together; they only need to support ``+``
            with one another.

    Returns:
        The accumulated total, the lone argument when exactly one is given,
        or 0 when called with no arguments.
    """
    if not num_list:
        # reduce() without an initial value raises TypeError on empty input;
        # the empty sum is defined as 0 instead.
        return 0
    # Returned directly: binding the result to a local called `sum` would
    # shadow the builtin of the same name.
    return functools.reduce(lambda x, y: x + y, num_list)

print(add(1, 55, 45))
print(add(0, 1, 1, 2, 3, 5, 8, 13))
print(add(1, 2, 3))

def sub(*num_list):
    """Subtract left to right: ``sub(a, b, c)`` evaluates ``(a - b) - c``.

    Args:
        *num_list: One or more values; the first is the starting value and
            every later one is subtracted from the running result.

    Returns:
        The final running result (the argument itself when only one is given).
    """
    return functools.reduce(lambda running, nxt: running - nxt, num_list)

print(sub(5,1,2))

def ra_sub(*num_list):
    """Right-associative subtraction: ``ra_sub(a, b, c)`` is ``a - (b - c)``.

    Args:
        *num_list: One or more values, grouped from the right.

    Returns:
        The result of folding the arguments from the last one back to the
        first (the argument itself when only one is given).
    """
    def fold_from_right(inner, outer):
        # `inner` is the already-evaluated right-hand group.
        return outer - inner

    # Walking the arguments back to front turns the left fold performed by
    # reduce() into a right fold.
    return functools.reduce(fold_from_right, num_list[::-1])

print(ra_sub(5, 1, 2))
print(ra_sub(5))
print(ra_sub(5, 1, 2, 4, 6))

101
33
6
2
6
5
8


### (1.2) zip()

In [3]:
def zip(*num_sequences):
    """Combine sequences element-wise into a list of lists.

    NOTE: this definition shadows the builtin ``zip`` for every later use of
    that name in this module.

    Args:
        *num_sequences: Indexable sequences that support ``len()``.

    Returns:
        A list whose i-th entry is the list of the i-th elements of every
        input. The result is truncated to the shortest input, and is empty
        when no sequences are given.
    """
    # Bound the index by the shortest input so ragged inputs truncate rather
    # than raise IndexError; default=0 covers a call with no sequences at all.
    shortest = min(map(len, num_sequences), default=0)
    return [[seq[i] for seq in num_sequences] for i in range(shortest)]

print(zip([1], [2]))
print(zip([1, 2, 3], [4, 5, 6]))
print(zip([1, 2, 3], [4, 5, 6], [7, 8, 9]))

[[1, 2]]
[[1, 4], [2, 5], [3, 6]]
[[1, 4, 7], [2, 5, 8], [3, 6, 9]]


### (1.3) zipwith

In [4]:
def zipwith(func, *num_sequences):
    """Apply ``func`` across the sequences position by position.

    Args:
        func: Callable that accepts one argument per sequence.
        *num_sequences: Iterables consumed in lockstep.

    Returns:
        A list of ``func`` results, one per position, stopping at the
        shortest input.
    """
    # map() already passes one element from each iterable to func.
    return list(map(func, *num_sequences))

print(zipwith(add, [1, 2, 3], [4, 5, 6]))  # [5, 7, 9]
print(zipwith(add, [1, 2, 3], [4, 5, 6], [1, 1, 1]))
print(zipwith(sub, [1, 2, 3], [4, 5, 6], [1, 1, 1]))

[5, 7, 9]
[6, 8, 10]
[-4, -4, -4]


### (1.4) flatten()

In [5]:
def flatten(*tree):
    """Flatten arbitrarily nested lists into a single flat list.

    Args:
        *tree: Values and/or nested lists. Only ``list`` instances are
            descended into; any other value is kept as a leaf.

    Returns:
        A new list of the leaves in left-to-right order.
    """
    def absorb(flat, node):
        # A nested list contributes its own flattened leaves; anything else
        # contributes itself.
        return flat + (flatten(*node) if isinstance(node, list) else [node])

    return functools.reduce(absorb, tree, [])

print(flatten([1, [2, [3, 4], [5, 6], 7], 8, [9, 10]]))
print(flatten([[2, 3, 4], 6, [4, 4], [[1, 2], 3, [4, 7, 99]]]))


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[2, 3, 4, 6, 4, 4, 1, 2, 3, 4, 7, 99]


### (1.5) group_by()

In [6]:
def group_by(func, num_sequences):
    """Bucket items by the value ``func`` returns for each of them.

    Args:
        func: Callable producing a hashable key for an item.
        num_sequences: Iterable of items to group.

    Returns:
        A dict mapping each key to the list of items that produced it.
        Keys and items both keep their first-seen order.
    """
    buckets = {}
    for item in num_sequences:
        # setdefault creates the bucket the first time a key shows up.
        buckets.setdefault(func(item), []).append(item)
    return buckets


group_by(len, ["hi", "dog", "me", "bad", "good"])

{2: ['hi', 'me'], 3: ['dog', 'bad'], 4: ['good']}

## 2. Confirming Hadoop Installation [15 points]


### (2.1) Acquire the cluster

![Acquire Cluster](Part2/create_cluster.jpeg)

After changing settings to allow for any ip to access, not just internal, and inputting the settings that are provided in Professor J's directions, I was able to create and run my cluster.

### (2.2)  Load the data into the master, move the data into HDFS

![Load Data, move data into HDFS](Part2/MoveFilesHDFS.png)

First I SSH'd into my cluster, confirmed my Hadoop version (`hadoop version`), and cloned the repo (`git clone https://github.com/singhj/big-data-repo.git`). Both ran fine and output what was expected. Then I used mkdir to create my directories in the Hadoop file system. There were no errors, and ls listed the new directories, so that worked fine. `hadoop fs -put ~/big-data-repo/five-books/* /user/singhj/five-books` put the five-books data into my Hadoop file system and `hadoop fs -ls /user/singhj/five-books` allowed me to confirm that all five books of data made it into my cluster. Output can be seen in the attached picture.

### (2.3)  Without writing any code of your own, verify that you have a good installation of hadoop by running wordcount on five-books. The command is similar to...
![Books Count 1](Part2/books_count_1.png)
![Books Count 2](Part2/books_count_2.png)

Running `hadoop jar /usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar wordcount /user/singhj/five-books /books-count` was used to confirm I had a good installation of hadoop. This process showed a mapping and reduce function applied to the five-books data in order to confirm Hadoop working. The output can be seen in the attached pictures. We can see a mapreduce job percentage completion breakdown in the middle of the first picture along with a successful MR job completed message. I then fetch the /books-count directory using `hadoop fs -get /books-count`. I return the results using `ls -la books-count/`. This looks to have worked, it returns:

```
total 320
drwxr-xr-x  2 cmeade6479 cmeade6479   4096 Oct 12 17:23 .
drwxr-xr-x 12 cmeade6479 cmeade6479   4096 Oct 12 17:23 ..
-rw-r--r--  1 cmeade6479 cmeade6479      0 Oct 12 17:23 _SUCCESS
-rw-r--r--  1 cmeade6479 cmeade6479 105799 Oct 12 17:23 part-r-00000
-rw-r--r--  1 cmeade6479 cmeade6479 103061 Oct 12 17:23 part-r-00001
-rw-r--r--  1 cmeade6479 cmeade6479 104969 Oct 12 17:23 part-r-00002
```

### (2.4)  Run wordcount using the provided mapper_noll.py and the default reducer aggregate

![mapper 1](Part2/mapred_1.png)
![mapper 2](Part2/mapred_2.png)
![mapper 3](Part2/mapred_3.png)

Similar to the last question, I applied the commands from the directions and was able to complete the MR tasks without issues. I fetched the results using `hadoop fs -get /books-stream-count`, which ran without error. I then ran `ls -la books-stream-count/` to confirm that word count worked, and it looks like it did. The listing includes the `_SUCCESS` marker:

```
total 116
drwxr-xr-x  2 cmeade6479 cmeade6479  4096 Oct 12 17:27 .
drwxr-xr-x 13 cmeade6479 cmeade6479  4096 Oct 12 17:27 ..
-rw-r--r--  1 cmeade6479 cmeade6479     0 Oct 12 17:27 _SUCCESS
-rw-r--r--  1 cmeade6479 cmeade6479 34743 Oct 12 17:27 part-00000
-rw-r--r--  1 cmeade6479 cmeade6479 34964 Oct 12 17:27 part-00001
-rw-r--r--  1 cmeade6479 cmeade6479 33989 Oct 12 17:27 part-00002
```


### (2.5)  Run wordcount using the provided mapper_noll.py and the provided reducer reducer_noll.py

![mapper 4](Part2/mapred_4.png)
![mapper 5](Part2/mapred_5.png)
![mapper 6](Part2/mapred_6.png)

Similar to the previous two questions, I applied the commands from the directions and was able to complete the MR tasks without big issues. There was a small issue where the `-files` option was not working, so I used `-file` as in the last question and was able to run the command without issue. I fetched the results using `hadoop fs -get /books-my-own-counts`, which ran without error. I then ran `ls -la books-my-own-counts/` to confirm that word count worked, and it looks like it did. The listing includes the `_SUCCESS` marker:

```
total 244
drwxr-xr-x  2 cmeade6479 cmeade6479  4096 Oct 12 17:31 .
drwxr-xr-x 14 cmeade6479 cmeade6479  4096 Oct 12 17:31 ..
-rw-r--r--  1 cmeade6479 cmeade6479     0 Oct 12 17:31 _SUCCESS
-rw-r--r--  1 cmeade6479 cmeade6479 79255 Oct 12 17:31 part-00000
-rw-r--r--  1 cmeade6479 cmeade6479 79515 Oct 12 17:31 part-00001
-rw-r--r--  1 cmeade6479 cmeade6479 77539 Oct 12 17:31 part-00002
```

## 3. Analyzing Server Logs [55 points]


### (3.1)  What is the percentage of each request type (GET, PUT, POST, etc.)

For this question, I wrote my own mapper and reduce functions, `request-type-mapper.py` and `request-type-reducer-count.py`. The mapper will return each request type along with a 1 for each row with that request type. This will be like: \

GET 1 \
POST 1 \
GET 1 \
GET 1 \
POST 1 \
HEAD 1 \
...

For the reducer function, it will take these line by line counts of 1 and return a count value for each request.  
I clone the repo for this project using `git clone https://github.com/ConorMeade/Quiz4`. 

To use the log file, mapper, and reducer, I have to load all the files in the Hadoop file system:
```console
$hadoop fs -mkdir /user/cmeade/Quiz4
```

```console
$hadoop fs -put ~/Quiz4/* /user/cmeade/Quiz4
```

Next, run the map reduce command

```console
$mapred streaming -file ~/Quiz4/request-type-mapper.py ~/Quiz4/request-type-reducer-count.py \
-mapper request-type-mapper.py   \
-reducer request-type-reducer-count.py \
-input /user/cmeade/access.log.txt \
-output /request-type-counts
```

`/request-type-counts` will have the part output files
```console
$hdfs dfs -ls /request-type-counts
```
Found 4 items \
-rw-r--r--   1 cmeade6479 hadoop          0 2024-10-13 00:52 /request-type-counts/_SUCCESS \
-rw-r--r--   1 cmeade6479 hadoop         21 2024-10-13 00:52 /request-type-counts/part-00000 \
-rw-r--r--   1 cmeade6479 hadoop          0 2024-10-13 00:52 /request-type-counts/part-00001 \
-rw-r--r--   1 cmeade6479 hadoop          9 2024-10-13 00:52 /request-type-counts/part-00002


The counts are split across different parts (00000 and 00002). So, combine all the part output files and write the result into a text file, req_counts.txt.
```console
$hdfs dfs -text /request-type-counts/part* > req_counts.txt
```

```text
<!-- req_counts.txt -->
GET     33414
POST    44584
HEAD    253
```

### Post Process Step

The request-type-reducer-count reducer file will give the text output of the counts. Use get_percentages_req() post-processing to determine the percentage of each request type.

In [32]:
def get_percentages_req(path='req_counts.txt'):
    """Print each request type with its count and its share of all requests.

    Each usable input line holds a request type and an integer count,
    separated by any run of whitespace (tabs or spaces). Blank lines, lines
    that do not have exactly two fields, and lines whose count is not an
    integer are skipped.

    Args:
        path: File holding the combined reducer output. Defaults to
            ``req_counts.txt`` in the current directory.

    Prints:
        One ``<type>\\t<count>\\t<percentage>%`` line per request type, in
        first-seen order, with the percentage rounded to two decimals.
    """
    total_reqs = 0
    request_counts = {}
    with open(path, 'r') as c:
        for line in c:
            # split() with no argument handles tabs as well as a varying
            # number of spaces, and yields [] for a blank line.
            fields = line.split()
            if len(fields) != 2:
                continue
            request_type, raw_count = fields
            try:
                count = int(raw_count)
            except ValueError:
                # count was not a number, so discard this line
                continue
            # Accumulate so a type listed in more than one part file is not
            # overwritten while still being added to the total.
            request_counts[request_type] = request_counts.get(request_type, 0) + count
            total_reqs += count

    # calculate percentage
    for request_type, count in request_counts.items():
        # Guard the division in case every parsed count is zero.
        percentage = (count / total_reqs) * 100 if total_reqs else 0.0
        print(f"{request_type}\t{count}\t{percentage:.2f}%")

get_percentages_req()


GET	33414	42.70%
POST	44584	56.98%
HEAD	253	0.32%


### Mapper and Reducer functions

Here are the mapper and reducer functions used in map reduce processing. mapper() used for debugging locally.

In [31]:
'''request-type-mapper.py'''
import sys
import re


def main(argv):
    """Streaming mapper: emit ``<METHOD>\\t1`` for every log line on stdin.

    Args:
        argv: Command-line arguments; accepted but not used.

    Returns:
        None. Results are written to stdout, one line per matching input
        line; lines without a match produce no output.
    """
    # The request method is the first run of word characters that directly
    # follows a double quote and is itself followed by whitespace.
    method_re = re.compile(r'\"(\w+)\s')
    try:
        for entry in sys.stdin:
            found = method_re.search(entry)
            if found is not None:
                print(f"{found.group(1)}\t1")
    except EOFError:
        return None


if __name__ == "__main__":
    main(sys.argv)


def mapper(debug=False):
    """Local debugging aid: run the request-method extraction over a log file.

    Reads ``access.log.txt`` from the current directory and writes one
    ``<METHOD> 1`` line (space-separated) to ``mapper_output.txt`` for every
    log line that contains a request method.

    Args:
        debug: Accepted but not used by the function body.
    """
    method_re = re.compile(r'\"(\w+)\s')
    # The output file is opened (and truncated) before the log is opened.
    with open('mapper_output.txt', 'w') as sink, open('access.log.txt', 'r') as log:
        for entry in log:
            found = method_re.search(entry)
            if found:
                # group(1) is the request method, e.g. GET or POST
                sink.write(f"{found.group(1)} 1\n")

mapper(debug=True)

In [33]:
#!/usr/bin/env python
'''request-type-reducer-count.py'''
import sys

# Tally the per-line counts emitted by the mapper into one total per distinct
# request type, then print each total as "<method>\t<count>".
request_count = {}
for line in sys.stdin:
    try:
        # The unpacking sits inside the try so that a blank line or a line
        # without a tab is skipped instead of crashing the reducer.
        method, count = line.strip().split('\t', 1)
        count = int(count)
    except ValueError:
        # malformed line or non-numeric count, so silently
        # ignore/discard this line
        continue
    request_count[method] = request_count.get(method, 0) + count

for method, count in request_count.items():
    print(f"{method}\t{count}")


GET	42.70%
POST	56.98%
HEAD	0.32%


### (3.2) What percent of the responses fall into each of the following five types?
For this question, I wrote my own mapper and reduce functions, `request-code-mapper.py` and `request-code-reducer-count.py`. The mapper will return each response code along with a 1 for each row with that response code. This will be like: \
 200	1 \
 200	1 \
 200	1 \
 200	1 \
 200	1 \
 301	1 \
 200	1 \
 301	1 \
 301	1 \
 404	1 \
 301	1 \
 200	1 



To use the log file, mapper, and reducer, I have to load all the files in the Hadoop file system:
```console
$hadoop fs -mkdir /user/cmeade/Quiz4
```

```console
$hadoop fs -put ~/Quiz4/* /user/cmeade/Quiz4
```

Next, run the map reduce command

```console
$mapred streaming -file ~/Quiz4/request-code-mapper.py ~/Quiz4/request-code-reducer-count.py \
-mapper request-code-mapper.py   \
-reducer request-code-reducer-count.py \
-input /user/cmeade/access.log.txt \
-output /request-code-counts
```

`/request-code-counts` will have the part output files
```console
$hdfs dfs -ls /request-code-counts
```
Found 4 items \
-rw-r--r--   1 cmeade6479 hadoop          0 2024-10-13 02:39 /request-code-counts/_SUCCESS \
-rw-r--r--   1 cmeade6479 hadoop         15 2024-10-13 02:39 /request-code-counts/part-00000 \
-rw-r--r--   1 cmeade6479 hadoop         29 2024-10-13 02:39 /request-code-counts/part-00001 \
-rw-r--r--   1 cmeade6479 hadoop         27 2024-10-13 02:39 /request-code-counts/part-00002


The counts are split across different parts (00000, 00001, and 00002). So, combine all the part output files and write the result into a text file, code_counts.txt.
```console
$hdfs dfs -text /request-code-counts/part* > code_counts.txt
```
```text
<!-- code_counts.txt -->
303     1857
405     1
301     957
304     115
400     1
403     63
200     70559
206     125
404     4573
```


### Post Process Step

The request-code-reducer-count reducer file will give the text output of the counts. Use get_percentages_code() post-processing to determine the percentage of responses in each response class.

In [45]:
def get_percentages_code(path='code_counts.txt'):
    """Print the share of responses falling into each HTTP status class.

    Each usable input line holds a numeric response code and an integer
    count, separated by any run of whitespace (tabs or spaces). Blank lines,
    lines that do not have exactly two fields, and lines with non-numeric
    fields are skipped. Codes outside 100-599 are reported on stdout and
    left out of the totals.

    Args:
        path: File holding the combined reducer output. Defaults to
            ``code_counts.txt`` in the current directory.

    Prints:
        One ``<class label>\\t<percentage>%`` line for each of the five
        classes, with the percentage rounded to two decimals.
    """
    # Hundreds digit of the status code -> class label.
    class_labels = {
        1: "Informational responses (100–199)",
        2: "Successful responses (200–299)",
        3: "Redirection messages (300–399)",
        4: "Client error responses (400–499)",
        5: "Server error responses (500–599)",
    }
    response_code_counts = {label: 0 for label in class_labels.values()}

    with open(path, 'r') as c:
        for line in c:
            # split() with no argument handles tabs as well as a varying
            # number of spaces, and yields [] for a blank line.
            fields = line.split()
            if len(fields) != 2:
                continue
            try:
                response_code = int(fields[0])
                count = int(fields[1])
            except ValueError:
                # count or response_code was not a number, so silently
                # ignore/discard this line
                continue
            label = class_labels.get(response_code // 100)
            if label is None:
                print(f"Unknown Code reached {response_code}")
                continue
            response_code_counts[label] += count

    # Computed once after the loop, so it is defined even when the file is
    # empty or every line was skipped.
    total_reqs = sum(response_code_counts.values())

    for response, count in response_code_counts.items():
        # Guard the division for the no-data case.
        percentage = (count / total_reqs) * 100 if total_reqs else 0.0
        print(f"{response}\t{percentage:.2f}%")

get_percentages_code()



Informational responses (100–199)	0.00%
Successful responses (200–299)	90.33%
Redirection messages (300–399)	3.74%
Client error responses (400–499)	5.93%
Server error responses (500–599)	0.00%


### Mapper and Reducer functions

Here are the mapper and reducer functions used in map reduce processing. The reducer function is pretty much the same as the one in 3.1, except for changed variable names to account for response code values instead of request types. mapper_code() is used for debugging locally.

In [None]:
#!/usr/bin/env python
'''request-code-mapper.py'''
import sys
import re


def main(argv):
    """Streaming mapper: emit ``<STATUS>\\t1`` for every log line on stdin.

    The status code is taken as the first three-digit number that follows a
    double quote and a single space. Only the digits are emitted, so the
    reducer keys are bare codes such as ``200``.

    Args:
        argv: Command-line arguments; accepted but not used.

    Returns:
        None. Results are written to stdout, one line per matching input
        line; lines without a match produce no output.
    """
    # The capture group keeps the quote and space out of the emitted key;
    # \b stops the match from taking the first three digits of a longer number.
    pattern = re.compile(r'\" (\d{3})\b')
    for line in sys.stdin:
        match = pattern.search(line)
        if match:
            response_code = match.group(1)  # e.g. 200, 301, 404
            print(f"{response_code}\t1")
    return None


if __name__ == "__main__":
    main(sys.argv)

def mapper_code(debug=False):
    """Local debugging aid: run the status-code extraction over a log file.

    Reads ``access.log.txt`` from the current directory and writes one
    ``<STATUS>\\t1`` line to ``mapper_log_code_output.txt`` for every log
    line that contains a status code. It uses the same "quote, space, three
    digits" rule as the streaming mapper and writes only the digits.

    Args:
        debug: Accepted but not used by the function body.
    """
    output_filename = 'mapper_log_code_output.txt'
    # Compiled once, outside the loop. The capture group keeps the leading
    # space out of the key, and anchoring on the closing quote avoids matching
    # any other " ddd" sequence in the line.
    pattern = re.compile(r'\" (\d{3})\b')
    with open(output_filename, 'w') as output_file:
        with open('access.log.txt', 'r') as f:
            for line in f:
                match = pattern.search(line)
                if match:
                    response_code = match.group(1)  # e.g. 200, 400, 401, 500
                    output_file.write(f"{response_code}\t1\n")

mapper_code(debug=True)


#!/usr/bin/env python

'''request-code-reducer-count.py'''
import sys

# Tally the per-line counts emitted by the mapper into one total per distinct
# response code, then print each total as "<code>\t<count>".
request_count = {}


for line in sys.stdin:
    try:
        # The unpacking sits inside the try so that a blank line or a line
        # without a tab is skipped instead of crashing the reducer.
        code, count = line.strip().split('\t', 1)
        count = int(count)
    except ValueError:
        # malformed line or non-numeric count, so silently
        # ignore/discard this line
        continue
    request_count[code] = request_count.get(code, 0) + count

for code, count in request_count.items():
    print(f"{code}\t{count}")


### (3.3) What 5 IP addresses generate the most client errors

## 4. Presidential Speeches [15 points]

In [None]:
# Download the stop-word list (one word per line) when this cell/module runs.
STOPWORDS_URL = "https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt"
# A timeout keeps an unresponsive server from hanging the run indefinitely.
_stopwords_response = requests.get(STOPWORDS_URL, timeout=30)
# Fail loudly on an HTTP error rather than parsing an error page as stop words.
_stopwords_response.raise_for_status()
stopwords_list = _stopwords_response.content  # raw bytes of the response body
# De-duplicated words; the order is arbitrary because it passes through a set.
stopwords = list(set(stopwords_list.decode().splitlines()))


def remove_stopwords(words):
    """Tokenize ``words`` and drop every token found in ``stopwords``.

    The text is lower-cased, every character other than an ASCII letter or
    digit is replaced by a space, and the result is split on whitespace.

    Args:
        words: Text to tokenize, as a single string.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # `stopwords` is a module-level list; hashing it once per call makes each
    # membership test O(1) instead of a scan of the whole list.
    excluded = set(stopwords)
    tokens = re.sub(r"[^a-zA-Z0-9]", " ", words.lower()).split()
    return [token for token in tokens if token not in excluded]


def clean_text(text):
    """Normalize ``text`` and return it with stop words removed.

    Steps, in order: lower-case the text; delete square-bracketed spans that
    open and close on the same line; replace punctuation characters with
    spaces; replace digits and newlines with spaces; pass the result through
    ``remove_stopwords`` and join the surviving tokens with single spaces.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text as one space-separated string.
    """
    text = text.lower()
    # Raw strings keep regex escapes such as \[ and \d from being treated as
    # (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'[\d\n]', ' ', text)
    return ' '.join(remove_stopwords(text))


def valence(text):
    """Placeholder with no implementation yet; returns None for any input."""

def calc_valence(text):
    """Placeholder with no implementation yet; returns None for any input."""