In [54]:
import docker
import uuid
import paramiko
import os
from timeit import default_timer as timer
from dataclasses import dataclass
import re
import requests
import statistics
import pandas as pd
import numpy as np

@dataclass
class HadoopRunResult:
    """Outcome of one Hadoop job submitted through ``run_in_master``.

    Attributes:
        name: Human-readable label of the job (e.g. ``"Covid01"``).
        output_path: HDFS path the job was told to write its output to.
        stdout: Lines read from the remote command's standard output.
        stderr: Lines read from the remote command's standard error.
        elapsed: Value of the ``elapsedTime`` field returned by the YARN
            ResourceManager for the job. The rest of the notebook treats
            it as a number of milliseconds (it is averaged and labelled
            ``(ms)`` when tabulated), hence ``int`` rather than ``str``.
    """
    name: str
    output_path: str
    stdout: list[str]
    stderr: list[str]
    elapsed: int
        
@dataclass
class MultiRunResult:
    """Summary of several repetitions of the same Hadoop job.

    Attributes:
        name: Label taken from the first repetition.
        results: Every individual run, in execution order.
        average: Arithmetic mean of the runs' elapsed times.
        output_path: Output path of the first repetition.
    """
    name: str
    results: list[HadoopRunResult]
    average: float
    output_path: str

    @staticmethod
    def fromResults(results: list[HadoopRunResult]):
        """Build a summary from a non-empty list of individual runs.

        Each run's ``elapsed`` value is converted with ``int`` before
        averaging. The name and output path come from the first run.

        Raises:
            statistics.StatisticsError: if ``results`` is empty.
        """
        # Compute the mean first so an empty list fails inside statistics.mean.
        mean_elapsed = statistics.mean([int(run.elapsed) for run in results])
        first_run = results[0]
        return MultiRunResult(
            first_run.name,
            results,
            mean_elapsed,
            first_run.output_path,
        )
    
def run_n(f, n):
    """Call the zero-argument callable ``f`` ``n`` times in sequence.

    Returns:
        A list with the return value of each call, in call order.
    """
    return [f() for _ in range(n)]
        
def get_elapsed_time(res, rm_url="http://resourcemanager:8088", timeout=30):
    """Look up a finished job's elapsed time in the YARN ResourceManager.

    The job id is extracted from the captured standard error of the
    ``yarn jar`` command (``res[1]``): the first line containing
    ``job_<digits/underscores>`` wins. The matching application record
    is then fetched from the ResourceManager REST API.

    Args:
        res: ``(stdout_lines, stderr_lines)`` as returned by ``run_in_master``.
        rm_url: Base URL of the ResourceManager web service.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``elapsedTime`` field of the application record.

    Raises:
        ValueError: if no job id can be found in ``res[1]``.
        requests.HTTPError: if the ResourceManager answers with an error status.
    """
    def get_id_from_res(res):
        for line in res[1]:
            m = re.search('job_([0-9_]*)', line)
            # A bare "job_" matches with an empty group; skip those.
            if m is not None and m.group(1) != '':
                return m.group(1)
        return None

    job_id = get_id_from_res(res)
    if job_id is None:
        # Without this guard the request would target "application_None".
        raise ValueError("could not find a job id (job_<id>) in the command's stderr output")

    response = requests.get(
        f'{rm_url}/ws/v1/cluster/apps/application_{job_id}',
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()['app']['elapsedTime']

def run_in_master(command):
    """Run a shell command on the ``namenode`` host over SSH.

    The command is executed from ``/app/`` after sourcing ``env_var.sh``.

    Args:
        command: Shell command line to append after the environment setup.

    Returns:
        A ``(stdout_lines, stderr_lines)`` tuple with everything the
        remote command wrote to each stream.
    """
    ssh = paramiko.SSHClient()
    try:
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        # NOTE(review): host, user and password are hard-coded; consider
        # moving them to configuration or environment variables.
        ssh.connect("namenode", username="root", password="pass")
        _ssh_stdin, ssh_stdout, ssh_stderr = ssh.exec_command(f"cd /app/ && . env_var.sh && {command}")
        # Both streams are fully read before the connection is closed below.
        return (ssh_stdout.readlines(), ssh_stderr.readlines())
    finally:
        # Always release the connection; previously one leaked per call.
        ssh.close()

def get_data_from_output_path(path):
    """Return the location of the ``merged.txt`` file inside ``path``."""
    return "{}/merged.txt".format(path)

def print_hdfs_output(path, max_lines=1000):
    """Print the beginning of a job's merged output stored in HDFS.

    Args:
        path: Job output directory; its ``merged.txt`` file is read.
        max_lines: Maximum number of lines to print.
    """
    raw = run_in_master(f"hdfs dfs -cat {get_data_from_output_path(path)}")[0]
    # Lines from readlines() keep their trailing newline, so concatenate
    # them as-is; joining with "\n" would double-space the output.
    print("".join(raw[:max_lines]), end="")

def merge_results(path):
    """Concatenate a job's ``part-r-*`` files into ``<path>/merged.txt`` in HDFS.

    The concatenation is done on the master by piping ``hdfs dfs -cat``
    into ``hdfs dfs -put``. The command's output is not inspected.
    """
    part_files = f"{path}/part-r-*"
    merged_file = f"{path}/merged.txt"
    run_in_master(f"hdfs dfs -cat {part_files} | hdfs dfs -put - {merged_file}")
    
    
def convert_results(all_results: list[MultiRunResult])-> pd.DataFrame:
    """Tabulate the timings of several multi-run results.

    One row is produced per entry of ``all_results`` (indexed by its
    name). The columns are the elapsed time of each repetition followed
    by the average. The number of run columns is taken from the first
    entry.
    """
    run_count = len(all_results[0].results)
    columns = [f"run_{run_no} (ms)" for run_no in range(run_count)]
    columns.append('average (ms)')

    row_names = [multirun.name for multirun in all_results]
    rows = [
        [run.elapsed for run in multirun.results] + [multirun.average]
        for multirun in all_results
    ]

    return pd.DataFrame(np.array(rows), columns=columns, index=row_names)

# Wpływ replikacji danych

In [55]:
def run_covid_01():
    """Run the ``covid_01`` MapReduce jar on the COVID dataset.

    The job writes to a fresh UUID-suffixed output path, its part files
    are merged, and the elapsed time is fetched afterwards.

    Returns:
        A ``HadoopRunResult`` labelled ``"Covid01"``.
    """
    jar_path = "/data/master_volume/map_reduce_jars/covid_01.jar"
    input_path = "/datasets/covid-dataset.jsonl"
    output_path = f"/out_covid_1{uuid.uuid4()}"

    job_output = run_in_master(f"yarn jar {jar_path} {input_path} {output_path}")
    merge_results(output_path)

    stdout_lines, stderr_lines = job_output
    elapsed = get_elapsed_time(job_output)
    return HadoopRunResult("Covid01", output_path, stdout_lines, stderr_lines, elapsed)

In [56]:
def run_covid_02(covid_01_output_path):
    """Run the ``covid_02`` MapReduce jar on the merged output of ``covid_01``.

    Args:
        covid_01_output_path: Output path of a finished ``covid_01`` run.

    Returns:
        A ``HadoopRunResult`` labelled ``"Covid02"``.
    """
    jar_path = "/data/master_volume/map_reduce_jars/covid_02.jar"
    input_path = get_data_from_output_path(covid_01_output_path)
    output_path = f"/out_covid_2{uuid.uuid4()}"

    job_output = run_in_master(f"yarn jar {jar_path} {input_path} {output_path}")
    merge_results(output_path)

    stdout_lines, stderr_lines = job_output
    elapsed = get_elapsed_time(job_output)
    return HadoopRunResult("Covid02", output_path, stdout_lines, stderr_lines, elapsed)

In [57]:
def run_covid_03(covid_01_output_path):
    """Run the ``covid_03`` MapReduce jar on the merged output of ``covid_01``.

    Args:
        covid_01_output_path: Output path of a finished ``covid_01`` run.

    Returns:
        A ``HadoopRunResult`` labelled ``"Covid03"``.
    """
    jar_path = "/data/master_volume/map_reduce_jars/covid_03.jar"
    input_path = get_data_from_output_path(covid_01_output_path)
    output_path = f"/out_covid_3{uuid.uuid4()}"

    job_output = run_in_master(f"yarn jar {jar_path} {input_path} {output_path}")
    merge_results(output_path)

    stdout_lines, stderr_lines = job_output
    elapsed = get_elapsed_time(job_output)
    return HadoopRunResult("Covid03", output_path, stdout_lines, stderr_lines, elapsed)

In [58]:
def run_steam_01():
    """Run the ``steam_01_combine`` MapReduce jar on the two Steam datasets.

    Both the store data and the SteamSpy scrape are passed as inputs.
    The job writes to a fresh UUID-suffixed output path whose part files
    are merged afterwards.

    Returns:
        A ``HadoopRunResult`` labelled ``"Steam01"``.
    """
    jar_path = "/data/master_volume/map_reduce_jars/steam_01_combine.jar"
    store_data_path = "/datasets/steam-dataset/steam_dataset/appinfo/store_data/steam_store_data.jsonl"
    steamspy_path = "/datasets/steam-dataset/steam_dataset/steamspy/basic/steam_spy_scrap.jsonl"
    output_path = f"/out_steam_1{uuid.uuid4()}"

    job_output = run_in_master(f"yarn jar {jar_path} {store_data_path} {steamspy_path} {output_path}")
    merge_results(output_path)

    stdout_lines, stderr_lines = job_output
    elapsed = get_elapsed_time(job_output)
    return HadoopRunResult("Steam01", output_path, stdout_lines, stderr_lines, elapsed)

In [59]:
def run_steam_02(steam_01_output_path):
    """Run the ``steam_02_choose`` MapReduce jar on the merged output of ``steam_01``.

    Args:
        steam_01_output_path: Output path of a finished ``steam_01`` run.

    Returns:
        A ``HadoopRunResult`` labelled ``"Steam02"``.
    """
    jar_path = "/data/master_volume/map_reduce_jars/steam_02_choose.jar"
    input_path = get_data_from_output_path(steam_01_output_path)
    output_path = f"/out_steam_2{uuid.uuid4()}"

    job_output = run_in_master(f"yarn jar {jar_path} {input_path} {output_path}")
    merge_results(output_path)

    stdout_lines, stderr_lines = job_output
    elapsed = get_elapsed_time(job_output)
    return HadoopRunResult("Steam02", output_path, stdout_lines, stderr_lines, elapsed)

In [60]:
def run_steam_03(steam_02_output_path):
    """Run the ``steam_03_takeN`` MapReduce jar on the merged output of ``steam_02``.

    Args:
        steam_02_output_path: Output path of a finished ``steam_02`` run.

    Returns:
        A ``HadoopRunResult`` labelled ``"Steam03"``.
    """
    jar_path = "/data/master_volume/map_reduce_jars/steam_03_takeN.jar"
    input_path = get_data_from_output_path(steam_02_output_path)
    output_path = f"/out_steam_3{uuid.uuid4()}"

    job_output = run_in_master(f"yarn jar {jar_path} {input_path} {output_path}")
    merge_results(output_path)

    stdout_lines, stderr_lines = job_output
    elapsed = get_elapsed_time(job_output)
    return HadoopRunResult("Steam03", output_path, stdout_lines, stderr_lines, elapsed)

In [61]:
def run_steam_04(steam_03_output_path):
    """Run the ``steam_04_fetch`` MapReduce jar on the merged output of ``steam_03``.

    Args:
        steam_03_output_path: Output path of a finished ``steam_03`` run.

    Returns:
        A ``HadoopRunResult`` labelled ``"Steam04"``.
    """
    jar_path = "/data/master_volume/map_reduce_jars/steam_04_fetch.jar"
    input_path = get_data_from_output_path(steam_03_output_path)
    output_path = f"/out_steam_4{uuid.uuid4()}"

    job_output = run_in_master(f"yarn jar {jar_path} {input_path} {output_path}")
    merge_results(output_path)

    stdout_lines, stderr_lines = job_output
    elapsed = get_elapsed_time(job_output)
    return HadoopRunResult("Steam04", output_path, stdout_lines, stderr_lines, elapsed)

In [62]:
def run_steam_05(steam_04_output_path, covid_02_output_path):
    """Run the ``steam_05_merge_time`` MapReduce jar on two earlier outputs.

    Args:
        steam_04_output_path: Output path of a finished ``steam_04`` run
            (first input to the job).
        covid_02_output_path: Output path of a finished ``covid_02`` run
            (second input to the job).

    Returns:
        A ``HadoopRunResult`` labelled ``"Steam05"``.
    """
    jar_path = "/data/master_volume/map_reduce_jars/steam_05_merge_time.jar"
    steam_input_path = get_data_from_output_path(steam_04_output_path)
    covid_input_path = get_data_from_output_path(covid_02_output_path)
    output_path = f"/out_steam_5{uuid.uuid4()}"

    job_output = run_in_master(f"yarn jar {jar_path} {steam_input_path} {covid_input_path} {output_path}")
    merge_results(output_path)

    stdout_lines, stderr_lines = job_output
    elapsed = get_elapsed_time(job_output)
    return HadoopRunResult("Steam05", output_path, stdout_lines, stderr_lines, elapsed)

In [None]:
def run_pipeline(repeat_count: int):
    """Execute every COVID and Steam job ``repeat_count`` times, in order.

    Each stage is repeated before the next one starts. Downstream stages
    read the output path recorded for the upstream stage, which is the
    output of that stage's first repetition.

    Returns:
        A list of ``MultiRunResult`` objects in the order covid_01,
        covid_02, covid_03, steam_01, steam_02, steam_03, steam_04, steam_05.
    """
    def repeat(job):
        # Run one stage repeat_count times and summarise its timings.
        return MultiRunResult.fromResults(run_n(job, repeat_count))

    covid_01 = repeat(run_covid_01)
    covid_02 = repeat(lambda: run_covid_02(covid_01.output_path))
    covid_03 = repeat(lambda: run_covid_03(covid_01.output_path))

    steam_01 = repeat(run_steam_01)
    steam_02 = repeat(lambda: run_steam_02(steam_01.output_path))
    steam_03 = repeat(lambda: run_steam_03(steam_02.output_path))
    steam_04 = repeat(lambda: run_steam_04(steam_03.output_path))
    steam_05 = repeat(lambda: run_steam_05(steam_04.output_path, covid_02.output_path))

    return [covid_01, covid_02, covid_03, steam_01, steam_02, steam_03, steam_04, steam_05]

## 1 replika

In [None]:
# Ask HDFS to use a replication factor of 1 for everything under "/" before
# the single-replica benchmark. The command's output is discarded.
# NOTE(review): no `-w` flag is passed, so the command presumably returns
# before the replica count has actually changed — confirm before timing.
# NOTE(review): files written later by the jobs presumably use the cluster's
# default replication rather than this value — verify against the HDFS config.
_ = run_in_master("hdfs dfs -setrep -R 1 /")

In [None]:
# Run every pipeline stage 3 times and display the per-run and average
# elapsed times as a table (the DataFrame is the cell's output).
pipeline_result = run_pipeline(3)
convert_results(pipeline_result)

## 3 repliki

In [None]:
# Ask HDFS to use a replication factor of 3 for everything under "/" before
# the three-replica benchmark. The command's output is discarded.
# NOTE(review): no `-w` flag is passed, so the command presumably returns
# before the additional replicas exist — confirm before timing.
_ = run_in_master("hdfs dfs -setrep -R 3 /")

In [None]:
# Repeat the same 3-run benchmark under the new replication setting and
# display the timing table.
# NOTE: this rebinds `pipeline_result`, so the results of the earlier
# 1-replica run are no longer reachable through that name.
pipeline_result = run_pipeline(3)
convert_results(pipeline_result)