# Sprawozdanie 11

**Grupa A3:**

inż. Michał Liss

inż. Marceli Sokólski

inż. Piotr Krzystanek

## Definicje funkcji

In [None]:
import paramiko
import os
import re
import requests
import json
import time

In [None]:
def run_in_master(command):
    """Run a shell command on the ``namenode`` host over SSH.

    The command is executed after changing into ``/app/`` and sourcing
    ``/env_var_path.sh``.

    Args:
        command: Shell command line appended to the environment setup.

    Returns:
        A ``(stdout_lines, stderr_lines)`` tuple holding the result of
        ``readlines()`` on the remote stdout and stderr streams.
    """
    ssh = paramiko.SSHClient()
    # NOTE(review): unknown host keys are accepted automatically and the
    # credentials are hard-coded; acceptable only for a throw-away cluster.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        ssh.connect("namenode", username="root", password="pass")
        _ssh_stdin, ssh_stdout, ssh_stderr = ssh.exec_command(
            f"cd /app/ && . /env_var_path.sh && {command}"
        )
        # Both streams are fully read before the connection is closed.
        return (ssh_stdout.readlines(), ssh_stderr.readlines())
    finally:
        # Always release the connection; the original never closed it.
        ssh.close()

def merge_results(path):
    """Concatenate all ``part-*`` files under *path* into ``merged.txt``.

    The HDFS pipeline is executed through ``run_in_master``; its output is
    discarded and nothing is returned.
    """
    merge_command = f"hdfs dfs -cat {path}/part-* | hdfs dfs -put - {path}/merged.txt"
    run_in_master(merge_command)
    
def get_data_from_output_path(path):
    """Return the location of the ``merged.txt`` file inside *path*."""
    merged_file = f"{path}/merged.txt"
    return merged_file

def print_hdfs_output(path, max_lines: int = 11):
    """Print the first lines of the merged result file stored under *path*.

    Args:
        path: HDFS directory that contains ``merged.txt``.
        max_lines: Maximum number of lines to print (defaults to 11, the
            value that was previously hard-coded).
    """
    raw = run_in_master(f"hdfs dfs -cat {get_data_from_output_path(path)}")[0]
    # readlines() keeps each line's terminator; strip it before joining so
    # that rows are not separated by empty lines.
    print("\n".join(line.rstrip("\r\n") for line in raw[:max_lines]))
    
def get_time(res, max_attempts: int = 6):
    """Return the total duration of a submitted Spark application.

    The application id is taken from the ``tracking URL`` line in the
    command's stderr. The application is then looked up through the REST
    endpoint on ``namenode:18080`` until every attempt is reported as
    completed.

    Args:
        res: ``(stdout_lines, stderr_lines)`` tuple as returned by
            ``run_in_master``; only the stderr lines are inspected.
        max_attempts: Maximum number of polls of the REST endpoint; there is
            a 5 second pause between consecutive polls.

    Returns:
        The sum of the ``duration`` fields of all attempts (callers print it
        as milliseconds), or ``-1`` when the application id cannot be found,
        the endpoint answers with an error status, or the application is
        still not reported as completed after ``max_attempts`` polls.
    """
    def get_id(res):
        """Return the application id found in the stderr lines, or ``''``."""
        for line in res[1]:
            m = re.search(r'tracking URL: http://resourcemanager:8088/proxy/(.*)/', line)
            if m is not None and m.group(1) != '':
                return m.group(1)
        return ''

    def get_time_from_data(data):
        """Sum the attempt durations; ``-1`` if any attempt is unfinished."""
        total = 0
        for attempt_info in data['attempts']:
            if attempt_info['completed'] is False:
                # Presumably the history server has not caught up yet.
                return -1
            total = total + attempt_info['duration']
        return total

    app_id = get_id(res)
    # get_id signals "not found" with an empty string (the original compared
    # against -1, so a missing id was never detected).
    if not app_id:
        print('WARNING: application id not found')
        return -1

    for attempt in range(1, max_attempts + 1):
        response = requests.get(f'http://namenode:18080/api/v1/applications/{app_id}')
        if not response.ok:
            print('WARNING: application error')
            return -1

        data = json.loads(response.text)
        t = get_time_from_data(data)

        if t >= 0:
            return t
        # No point in waiting after the final poll.
        if attempt < max_attempts:
            time.sleep(5)
    print('WARNING: maximum attempts exceeded')
    return -1

# Number of times each benchmark job is submitted.
runs = 10

# Covid

In [None]:
# One independent list per covid implementation; every benchmark run appends
# the value returned by get_time to the matching list.
measurements_covid_sql, measurements_covid_df, measurements_covid_scala = (
    [] for _ in range(3)
)

## SQL

In [None]:
# Benchmark the covid SQL script: submit it `runs` times and record each timing.
for r in range(runs):
    # Remove whatever a previous run left under the result path.
    run_in_master("hdfs dfs -rm -r /spark-result/covid/sql")
    submit_result = run_in_master(
        "spark-submit --master yarn --deploy-mode cluster /data/master_volume/spark_scripts/covid_sql.py"
    )
    elapsed = get_time(submit_result)
    measurements_covid_sql.append(elapsed)
    print(f"run {r} took {elapsed}ms")

# Merge the part files left by the last run and show the first lines.
merge_results("/spark-result/covid/sql")
print_hdfs_output("/spark-result/covid/sql")

## DataFrame

In [None]:
# Benchmark the covid DataFrame script: submit it `runs` times and record each timing.
for r in range(runs):
    # Remove whatever a previous run left under the result path.
    run_in_master("hdfs dfs -rm -r /spark-result/covid/df")
    submit_result = run_in_master(
        "spark-submit --master yarn --deploy-mode cluster /data/master_volume/spark_scripts/covid_df.py"
    )
    elapsed = get_time(submit_result)
    measurements_covid_df.append(elapsed)
    print(f"run {r} took {elapsed}ms")

# Merge the part files left by the last run and show the first lines.
merge_results("/spark-result/covid/df")
print_hdfs_output("/spark-result/covid/df")

## Scala DataFrame

In [None]:
# Benchmark the Scala covid job (class covid01.Main from spark.jar):
# submit it `runs` times and record each timing.
for r in range(runs):
    # Remove whatever a previous run left under the result path.
    run_in_master("hdfs dfs -rm -r /spark-result/covid01")
    # Adjacent literals concatenate into the same single-line command,
    # including its trailing space.
    submit_result = run_in_master(
        "spark-submit "
        "--master yarn "
        "--deploy-mode cluster "
        "--class covid01.Main "
        "/data/master_volume/spark_scripts/spark.jar "
    )
    elapsed = get_time(submit_result)
    measurements_covid_scala.append(elapsed)
    print(f"run {r} took {elapsed}ms")

# Merge the part files left by the last run and show the first lines.
merge_results("/spark-result/covid01")
print_hdfs_output("/spark-result/covid01")

# Steam

In [None]:
# One independent list per steam implementation; every benchmark run appends
# the value returned by get_time to the matching list.
measurements_steam_sql, measurements_steam_df, measurements_steam_scala = (
    [] for _ in range(3)
)

## SQL

In [None]:
# Benchmark the steam SQL script: submit it `runs` times and record each timing.
for r in range(runs):
    # Remove whatever a previous run left under the result path.
    run_in_master("hdfs dfs -rm -r /spark-result/steam/sql")
    submit_result = run_in_master(
        "spark-submit --master yarn --deploy-mode cluster /data/master_volume/spark_scripts/steam_sql.py"
    )
    elapsed = get_time(submit_result)
    measurements_steam_sql.append(elapsed)
    print(f"run {r} took {elapsed}ms")

# Merge the part files left by the last run and show the first lines.
merge_results("/spark-result/steam/sql")
print_hdfs_output("/spark-result/steam/sql")

## DataFrame

In [None]:
# Benchmark the steam DataFrame script: submit it `runs` times and record each timing.
for r in range(runs):
    # Remove whatever a previous run left under the result path.
    run_in_master("hdfs dfs -rm -r /spark-result/steam/df")
    submit_result = run_in_master(
        "spark-submit --master yarn --deploy-mode cluster /data/master_volume/spark_scripts/steam_df.py"
    )
    elapsed = get_time(submit_result)
    measurements_steam_df.append(elapsed)
    print(f"run {r} took {elapsed}ms")

# Merge the part files left by the last run and show the first lines.
merge_results("/spark-result/steam/df")
print_hdfs_output("/spark-result/steam/df")

## Scala DataFrame

In [None]:
# Benchmark the Scala steam job (class steam01.Main from spark.jar):
# submit it `runs` times and record each timing.
for r in range(runs):
    # Remove whatever a previous run left under the result path.
    run_in_master("hdfs dfs -rm -r /spark-result/steam01")
    # Adjacent literals concatenate into the same single-line command,
    # including its trailing space.
    submit_result = run_in_master(
        "spark-submit "
        "--master yarn "
        "--deploy-mode cluster "
        "--class steam01.Main "
        "/data/master_volume/spark_scripts/spark.jar "
    )
    elapsed = get_time(submit_result)
    measurements_steam_scala.append(elapsed)
    print(f"run {r} took {elapsed}ms")

# Merge the part files left by the last run and show the first lines.
merge_results("/spark-result/steam01")
print_hdfs_output("/spark-result/steam01")

# Wyniki

In [None]:
def _mean_of_successful(measurements):
    """Average the successful timings of one benchmark.

    ``get_time`` reports a failed measurement as -1; such entries are left
    out so they do not distort the mean. Returns ``"n/a"`` when there is no
    successful measurement (this also avoids dividing by zero on an empty
    list).
    """
    valid = [m for m in measurements if m >= 0]
    return sum(valid) / len(valid) if valid else "n/a"


# Per-run rows keep the raw values, so failed runs stay visible as -1.
m_s_d = " | ".join(str(x) for x in measurements_steam_df)
m_s_s = " | ".join(str(x) for x in measurements_steam_sql)
m_c_d = " | ".join(str(x) for x in measurements_covid_df)
m_c_s = " | ".join(str(x) for x in measurements_covid_sql)

# Summary table: mean duration per implementation and data set.
print("|     |  covid  |  steam  |")
print("|-----|---------|---------|")
print(f"| df  | {_mean_of_successful(measurements_covid_df)} | {_mean_of_successful(measurements_steam_df)} |")
print(f"| sql | {_mean_of_successful(measurements_covid_sql)} | {_mean_of_successful(measurements_steam_sql)} |")

# Detail table: one column per run.
print()
r = " | ".join(f"run {x}" for x in range(runs))
print(f"|         | {r} |")
print(f"|steam df |{m_s_d}|")
print(f"|steam sql|{m_s_s}|")
print(f"|covid df |{m_c_d}|")
print(f"|covid sql|{m_c_s}|")

In [None]:
# Notebook cell: display the raw timings recorded for the Scala steam job.
measurements_steam_scala

In [None]:
def _mean_of_successful(measurements):
    """Average the successful timings of one benchmark.

    ``get_time`` reports a failed measurement as -1; such entries are left
    out so they do not distort the mean. Returns ``"n/a"`` when there is no
    successful measurement (this also avoids dividing by zero on an empty
    list).
    """
    valid = [m for m in measurements if m >= 0]
    return sum(valid) / len(valid) if valid else "n/a"


# Per-run rows keep the raw values, so failed runs stay visible as -1.
m_s_d = " | ".join(str(x) for x in measurements_steam_df)
m_s_s = " | ".join(str(x) for x in measurements_steam_sql)
m_s_scala = " | ".join(str(x) for x in measurements_steam_scala)
m_c_d = " | ".join(str(x) for x in measurements_covid_df)
m_c_s = " | ".join(str(x) for x in measurements_covid_sql)
m_c_scala = " | ".join(str(x) for x in measurements_covid_scala)

# Summary table: mean duration per implementation and data set.
print("|       |  covid  |  steam  |")
print("|-------|---------|---------|")
print(f"| df    | {_mean_of_successful(measurements_covid_df)} | {_mean_of_successful(measurements_steam_df)} |")
print(f"| sql   | {_mean_of_successful(measurements_covid_sql)} | {_mean_of_successful(measurements_steam_sql)} |")
print(f"| scala | {_mean_of_successful(measurements_covid_scala)} | {_mean_of_successful(measurements_steam_scala)} |")

# Detail table: one column per run.
print()
r = " | ".join(f"run {x}" for x in range(runs))
print(f"|           | {r} |")
print(f"|steam df   |{m_s_d}|")
print(f"|steam sql  |{m_s_s}|")
print(f"|steam sca  |{m_s_scala}|")
print(f"|covid df   |{m_c_d}|")
print(f"|covid sql  |{m_c_s}|")
print(f"|covid scala|{m_c_scala}|")