In [1]:
import paramiko
import os
import re
import requests
import json
import time

In [2]:
def run_in_master(command):
    """Run a shell command on the master node over SSH.

    The command is executed on host ``namenode`` after changing into ``/app/``
    and sourcing ``/env_var_path.sh`` in the same remote shell.

    Args:
        command: Shell command line to execute remotely.

    Returns:
        A ``(stdout_lines, stderr_lines)`` tuple, each element being the list
        produced by ``readlines()`` on the corresponding remote stream.
    """
    ssh = paramiko.SSHClient()
    # NOTE(review): unknown host keys are accepted automatically and the
    # credentials below are hard-coded; consider reading them from the
    # environment if this ever runs outside a throwaway cluster.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        ssh.connect("namenode", username="root", password="pass")
        _ssh_stdin, ssh_stdout, ssh_stderr = ssh.exec_command(
            f"cd /app/ && . /env_var_path.sh && {command}"
        )
        # Both streams must be drained before the connection is closed.
        return (ssh_stdout.readlines(), ssh_stderr.readlines())
    finally:
        # Always release the connection, even when connect/exec fails.
        ssh.close()

def merge_results(path):
    """Concatenate every ``part-*`` file under ``path`` into ``path/merged.txt``.

    The concatenation is done on the master node by piping ``hdfs dfs -cat``
    into ``hdfs dfs -put``.

    Args:
        path: HDFS directory holding the ``part-*`` files.
    """
    # -f overwrites an existing merged.txt so the call can be repeated;
    # without it a second run fails and leaves the stale file in place.
    run_in_master(f"hdfs dfs -cat {path}/part-* | hdfs dfs -put -f - {path}/merged.txt")
    
def get_data_from_output_path(path):
    """Return the location of the merged result file inside ``path``."""
    merged_name = "merged.txt"
    return f"{path}/{merged_name}"

def print_hdfs_output(path, max_lines: int = 1000):
    """Print the beginning of the merged result file stored under ``path``.

    Args:
        path: HDFS output directory; the file printed is the one returned by
            ``get_data_from_output_path(path)``.
        max_lines: Maximum number of lines to print (defaults to 1000).
    """
    raw = run_in_master(f"hdfs dfs -cat {get_data_from_output_path(path)}")[0]
    # Strip each line's own line ending before joining; joining the raw lines
    # with "\n" printed an empty line between every pair of rows.
    print("\n".join(line.rstrip("\n") for line in raw[:max_lines]))
    
def get_time(res, max_attempts: int = 4):
    """Return the total duration of a submitted Spark application.

    The application id is parsed from the ``tracking URL`` line in the
    spark-submit stderr output, then the history server REST endpoint on
    ``namenode:18080`` is polled until every attempt is reported as completed.

    Args:
        res: ``(stdout_lines, stderr_lines)`` tuple as returned by
            ``run_in_master`` for the spark-submit command.
        max_attempts: Maximum number of times the history server is queried.

    Returns:
        The sum of the ``duration`` fields of all attempts, or -1 when the
        application id cannot be found, the history server answers with an
        error status, or the attempts are still incomplete after
        ``max_attempts`` polls.
    """
    def find_application_id(stderr_lines):
        # Returns the id captured from the first tracking URL line, or None.
        for line in stderr_lines:
            match = re.search('tracking URL: http://resourcemanager:8088/proxy/(.*)/', line)
            if match is not None and match.group(1) != '':
                return match.group(1)
        return None

    def total_duration(app_info):
        # Sums the attempt durations; -1 signals "not all attempts completed".
        total = 0
        for app_attempt in app_info['attempts']:
            if app_attempt['completed'] is False:
                print('spark history server has not updated (yet)')
                return -1
            total += app_attempt['duration']
        return total

    app_id = find_application_id(res[1])
    if app_id is None:
        # Without an id the request below would hit the application *list*
        # endpoint and fail when its payload is indexed with 'attempts'.
        print('WARNING: could not find the application id in the spark-submit output')
        return -1

    for _ in range(max_attempts):
        response = requests.get(f'http://namenode:18080/api/v1/applications/{app_id}')
        if not response.ok:
            print('WARNING: application error')
            return -1

        duration = total_duration(json.loads(response.text))
        if duration >= 0:
            return duration
        # Give the history server some time to catch up before polling again.
        time.sleep(3)
    print('WARNING: maximum attempts exceeded')
    return -1

# Covid

In [3]:
# Drop the previous output directory before the job runs.
run_in_master("hdfs dfs -rm -r /spark-result/covid/df")
# Submit the PySpark script to YARN in cluster mode; keep its (stdout, stderr).
spark_result = run_in_master(
    "spark-submit --master yarn --deploy-mode cluster "
    "/data/master_volume/spark_scripts/covid_df.py"
)
# Merge the part files, then report the run time and show the result.
merge_results("/spark-result/covid/df")
print(f"Took {get_time(spark_result)}ms")
print_hdfs_output("/spark-result/covid/df")

Took 14348ms
date,location,total_cases,new_cases,total_deaths,new_deaths,new_cases_per_million,average_new_cases_per_million

2020-01-03,Afghanistan,0,0,0,0,0,160.00584439903545

2020-01-04,Afghanistan,0,0,0,0,0,160.00584439903545

2020-01-05,Afghanistan,0,0,0,0,0,160.00584439903545

2020-01-06,Afghanistan,0,0,0,0,0,160.00584439903545

2020-01-07,Afghanistan,0,0,0,0,0,160.00584439903545

2020-01-08,Afghanistan,0,0,0,0,0,160.00584439903545

2020-01-09,Afghanistan,0,0,0,0,0,160.00584439903545

2020-01-10,Afghanistan,0,0,0,0,0,160.00584439903545

2020-01-11,Afghanistan,0,0,0,0,0,160.00584439903545

2020-01-12,Afghanistan,0,0,0,0,0,160.00584439903545

2020-01-13,Afghanistan,0,0,0,0,0,160.00584439903545

2020-01-14,Afghanistan,0,0,0,0,0,160.00584439903545

2020-01-15,Afghanistan,0,0,0,0,0,160.00584439903545

2020-01-16,Afghanistan,0,0,0,0,0,160.00584439903545

2020-01-17,Afghanistan,0,0,0,0,0,160.00584439903545

2020-01-18,Afghanistan,0,0,0,0,0,160.00584439903545

2020-01-19,Afghanistan,0,0

In [4]:
# Submit the covid01.Main class from the jar to YARN in cluster mode.
spark_result = run_in_master(
    "spark-submit "
    "--master yarn "
    "--deploy-mode cluster "
    "--class covid01.Main "
    "/data/master_volume/spark_scripts/spark.jar "
)
# Merge the part files, then report the run time and show the result.
merge_results("/spark-result/covid01")
print(f"Took {get_time(spark_result)}ms")
print_hdfs_output("/spark-result/covid01")

spark history server has not updated (yet)
Took 16142ms
{"date":"2020-01-03","location":"Afghanistan","total_cases":0,"new_cases":0,"total_deaths":0,"new_deaths":0,"new_cases_per_million":0.0,"average_new_cases_per_million":160.29720723141685}

{"date":"2020-01-04","location":"Afghanistan","total_cases":0,"new_cases":0,"total_deaths":0,"new_deaths":0,"new_cases_per_million":0.0,"average_new_cases_per_million":160.29720723141685}

{"date":"2020-01-05","location":"Afghanistan","total_cases":0,"new_cases":0,"total_deaths":0,"new_deaths":0,"new_cases_per_million":0.0,"average_new_cases_per_million":160.29720723141685}

{"date":"2020-01-06","location":"Afghanistan","total_cases":0,"new_cases":0,"total_deaths":0,"new_deaths":0,"new_cases_per_million":0.0,"average_new_cases_per_million":160.29720723141685}

{"date":"2020-01-07","location":"Afghanistan","total_cases":0,"new_cases":0,"total_deaths":0,"new_deaths":0,"new_cases_per_million":0.0,"average_new_cases_per_million":160.29720723141685}


# Steam

In [5]:
# Drop the previous output directory before the job runs.
run_in_master("hdfs dfs -rm -r /spark-result/steam/df")
# Submit the PySpark script to YARN in cluster mode; keep its (stdout, stderr).
spark_result = run_in_master(
    "spark-submit --master yarn --deploy-mode cluster "
    "/data/master_volume/spark_scripts/steam_df.py"
)
# Merge the part files, then report the run time and show the result.
merge_results("/spark-result/steam/df")
print(f"Took {get_time(spark_result)}ms")
print_hdfs_output("/spark-result/steam/df")

Took 14608ms
steam_appid,coming_soon,date,appid,name,positive,negative,owners,ccu

907680,false,"Aug 17, 2018",907680,Wwbit,710,566,"100,000 .. 200,000",0

823550,false,"Sep 18, 2018",823550,Booty Calls,255,294,"100,000 .. 200,000",134

639780,false,"Dec 7, 2017",639780,Deep Space Waifu: FLAT JUSTICE,1449,67,"50,000 .. 100,000",4

steam_appid,coming_soon,date,appid,name,positive,negative,owners,ccu

896890,false,"Dec 23, 2019",896890,VR Paradise - Steam Edition,138,50,"20,000 .. 50,000",11

726360,false,"May 22, 2020",726360,BOOBS SAGA: Prepare To Hentai Edition,367,103,"20,000 .. 50,000",0

723090,false,"Oct 24, 2017",723090,Meltys Quest,446,10,"20,000 .. 50,000",36

712790,false,"Oct 2, 2017",712790,Crimson Memories,51,21,"20,000 .. 50,000",0

825300,false,"Nov 22, 2018",825300,To Trust an Incubus,43,5,"0 .. 20,000",1

937730,false,"Dec 24, 2018",937730,Lady's Hentai Mosaic,21,4,"0 .. 20,000",0

588920,false,"Apr 7, 2017",588920,BADASS,15,14,"0 .. 20,000",0

555310,false,"Feb 23, 201

In [6]:
# Submit the steam01.Main class from the jar to YARN in cluster mode.
spark_result = run_in_master(
    "spark-submit "
    "--master yarn "
    "--deploy-mode cluster "
    "--class steam01.Main "
    "/data/master_volume/spark_scripts/spark.jar "
)
# Merge the part files, then report the run time and show the result.
merge_results("/spark-result/steam01")
print(f"Took {get_time(spark_result)}ms")
print_hdfs_output("/spark-result/steam01")

spark history server has not updated (yet)
Took 17047ms
{"key":"555310","value":{"game_id":555310,"name":"Satellite","positive":44,"negative":9,"owners":"0 .. 20,000","ccu":0,"release_date":"Feb 23, 2018"}}

{"key":"639780","value":{"game_id":639780,"name":"Deep Space Waifu: FLAT JUSTICE","positive":1449,"negative":67,"owners":"50,000 .. 100,000","ccu":4,"release_date":"Dec 7, 2017"}}

{"key":"677730","value":{"game_id":677730,"name":"Karmasutra","positive":13,"negative":6,"owners":"0 .. 20,000","ccu":0,"release_date":"Sep 29, 2017"}}

{"key":"712790","value":{"game_id":712790,"name":"Crimson Memories","positive":51,"negative":21,"owners":"20,000 .. 50,000","ccu":0,"release_date":"Oct 2, 2017"}}

{"key":"823550","value":{"game_id":823550,"name":"Booty Calls","positive":255,"negative":294,"owners":"100,000 .. 200,000","ccu":134,"release_date":"Sep 18, 2018"}}

{"key":"868980","value":{"game_id":868980,"name":"DEEP SPACE WAIFU: NEKOMIMI","positive":362,"negative":6,"owners":"0 .. 20,000"