In [1]:
import os
import re
import shlex
import statistics
import uuid
from dataclasses import dataclass
from timeit import default_timer as timer

import docker
import numpy as np
import pandas as pd
import paramiko
import requests

In [2]:
def run_in_master(command):
    """Run a shell command on the ``namenode`` host over SSH.

    The remote command line changes into ``/app/`` and sources
    ``/env_var_path.sh`` before executing ``command``.

    Args:
        command: Shell command text appended to the remote command line.

    Returns:
        A ``(stdout_lines, stderr_lines)`` tuple holding the lines read from
        the remote command's standard output and standard error.
    """
    ssh = paramiko.SSHClient()
    try:
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        # NOTE(review): credentials are hard-coded here; consider loading them
        # from the environment or a secrets store.
        ssh.connect("namenode", username="root", password="pass")
        _, ssh_stdout, ssh_stderr = ssh.exec_command(
            f"cd /app/ && . /env_var_path.sh && {command}"
        )
        # Both streams are fully read before the connection is closed below.
        return (ssh_stdout.readlines(), ssh_stderr.readlines())
    finally:
        # Always release the SSH connection, even when connect/exec fails;
        # the original never closed the client and leaked one per call.
        ssh.close()

def run_in_hive(command):
    """Run a shell command on the ``hive-server`` host over SSH.

    The command is executed through ``bash -c`` after sourcing
    ``/env_var_path.sh``. The whole script is shell-quoted, so ``command``
    may itself contain single quotes (e.g. ``hive -e 'show tables'``).

    Args:
        command: Shell command text to run after the environment is sourced.

    Returns:
        A ``(stdout_lines, stderr_lines)`` tuple holding the lines read from
        the remote command's standard output and standard error.
    """
    # For commands without single quotes this yields exactly the same remote
    # command line as wrapping the script in literal '...' by hand.
    script = shlex.quote(f". /env_var_path.sh && {command}")
    ssh = paramiko.SSHClient()
    try:
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        # NOTE(review): credentials are hard-coded here; consider loading them
        # from the environment or a secrets store.
        ssh.connect("hive-server", username="root", password="pass")
        _, ssh_stdout, ssh_stderr = ssh.exec_command(f"bash -c {script}")
        # Both streams are fully read before the connection is closed below.
        return (ssh_stdout.readlines(), ssh_stderr.readlines())
    finally:
        # Always release the SSH connection, even when connect/exec fails;
        # the original never closed the client and leaked one per call.
        ssh.close()

In [3]:
def setup_test_table():
    """Run the ``employee_table.hql`` script with ``hive -f`` via ``run_in_hive``.

    Returns:
        The value returned by ``run_in_hive`` for that invocation.
    """
    hql_command = "hive -f /data/master_volume/hive_scripts/employee_table.hql"
    return run_in_hive(hql_command)
    
def copy_test_file():
    """Upload ``employee.csv`` into the ``employee`` directory on HDFS.

    Two commands are issued through ``run_in_master``: ``hdfs dfs -mkdir -p``
    for the destination directory, then ``hdfs dfs -put`` for the CSV file.

    Returns:
        The value returned by ``run_in_master`` for the ``-put`` command; the
        result of the ``-mkdir`` command is discarded.
    """
    source_csv = "/data/master_volume/hive_scripts/employee.csv"
    target_dir = "/user/hive/warehouse/testdb.db/employee"

    # Make sure the destination directory exists before uploading.
    run_in_master(f"hdfs dfs -mkdir -p {target_dir}")

    upload_result = run_in_master(f"hdfs dfs -put {source_csv} {target_dir}")
    return upload_result

In [4]:
# Run the employee table Hive script, then upload employee.csv to HDFS.
# Only the value of the last expression (copy_test_file()) is displayed;
# the result of setup_test_table() is discarded.
setup_test_table()
copy_test_file()

([], [])

In [5]:
# Execute test_group.hql on the hive-server host; the cell displays the
# (stdout_lines, stderr_lines) tuple returned by run_in_hive.
run_in_hive("hive -f /data/master_volume/hive_scripts/test_group.hql")

([],
 ['SLF4J: Class path contains multiple SLF4J bindings.\n',
  'SLF4J: Found binding in [jar:file:/opt/hive/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: Found binding in [jar:file:/opt/hadoop-3.3.1/share/hadoop/common/lib/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.\n',
  'SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]\n',
  'Hive Session ID = b27f80a3-9b51-4c51-a01e-22347f36cc34\n',
  '\n',
  'Logging initialized using configuration in file:/opt/hive/conf/hive-log4j2.properties Async: true\n',
  'Hive Session ID = 72914f09-6a13-4243-8adc-a3fef52caa63\n',
  'OK\n',
  'Time taken: 0.429 seconds\n',
  'Query ID = root_20230526162420_93f2b8a0-e555-41b8-b81a-364bd085b3d8\n',
  'Total jobs = 1\n',
  'Launching Job 1 out of 1\n',
  'Number of reduce tasks not specified. Estimated from input data size: 1\n',

In [19]:
# Execute covid_table.hql on the hive-server host; the cell displays the
# (stdout_lines, stderr_lines) tuple returned by run_in_hive.
run_in_hive("hive -f /data/master_volume/hive_scripts/covid_table.hql")

([],
 ['SLF4J: Class path contains multiple SLF4J bindings.\n',
  'SLF4J: Found binding in [jar:file:/opt/hive/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: Found binding in [jar:file:/opt/hadoop-3.3.1/share/hadoop/common/lib/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.\n',
  'SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]\n',
  'Hive Session ID = 5f29bc9f-db5f-4ace-b4bb-b548ef97912a\n',
  '\n',
  'Logging initialized using configuration in file:/opt/hive/conf/hive-log4j2.properties Async: true\n',
  'Hive Session ID = fef67d91-ca49-4f6e-9367-907d00fc8e28\n',
  'OK\n',
  'Time taken: 0.449 seconds\n',
  'OK\n',
  'Time taken: 0.015 seconds\n',
  'OK\n',
  'Time taken: 0.031 seconds\n',
  'OK\n',
  'Time taken: 0.099 seconds\n'])

In [22]:
# Replace the covid dataset file under the Hive warehouse directory:
# remove the existing copy first, then copy it again from /datasets on HDFS.
# Only the result of the final (copy) command is displayed by the cell.
run_in_master(
    "hdfs dfs -rm /user/hive/warehouse/testdb.db/covid/covid-dataset.csv"
)
run_in_master(
    "hdfs dfs -cp /datasets/covid-dataset.csv "
    "/user/hive/warehouse/testdb.db/covid/covid-dataset.csv"
)

([], [])

In [24]:
# Execute covid_data.hql on the hive-server host; the cell displays the
# (stdout_lines, stderr_lines) tuple returned by run_in_hive.
run_in_hive("hive -f /data/master_volume/hive_scripts/covid_data.hql")

([],
 ['SLF4J: Class path contains multiple SLF4J bindings.\n',
  'SLF4J: Found binding in [jar:file:/opt/hive/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: Found binding in [jar:file:/opt/hadoop-3.3.1/share/hadoop/common/lib/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.\n',
  'SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]\n',
  'Hive Session ID = db6b5149-055d-46a7-9d8f-80e6d009fc75\n',
  '\n',
  'Logging initialized using configuration in file:/opt/hive/conf/hive-log4j2.properties Async: true\n',
  'Hive Session ID = 495f0c47-b20b-418a-b44a-0e5f83846f8b\n',
  'OK\n',
  'Time taken: 0.421 seconds\n',
  'Query ID = root_20230526171225_ad090ca2-6f31-4b40-8c49-f301dc979dc4\n',
  'Total jobs = 3\n',
  'Launching Job 1 out of 3\n',
  "Number of reduce tasks is set to 0 since there's no reduce operator\n",
  '