# How to run commands on Hadoop using Jupyter

## Import utility functions
`run_in_master` runs the provided command on the namenode. Use it to start a MapReduce, Pig, or Spark application.

`run_in_hive` runs the provided command on the hive-server. Use it to start a Hive application.

In [1]:
from utils import run_in_master, run_in_hive, print_hdfs_output, hdfs_upload

In [2]:
# Smoke test: send the same trivial shell command through both helpers and
# print what each one returns (the recorded output shows a pair of lists).
for runner in (run_in_master, run_in_hive):
    print(runner("echo 'test'"))

(['test\n'], [])
(['test\n'], [])


# MapReduce

In [3]:
# Submit the bundled "pi" example through `yarn jar` with arguments 2 and 5
# (reported in the job output as 2 maps and 5 samples per map).
examples_jar = "/opt/hadoop-3.3.1/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.1.jar"
run_in_master(f"yarn jar {examples_jar} pi 2 5")

(['Number of Maps  = 2\n',
  'Samples per Map = 5\n',
  'Wrote input for Map #0\n',
  'Wrote input for Map #1\n',
  'Starting Job\n',
  'Job Finished in 11.888 seconds\n',
  'Estimated value of Pi is 3.60000000000000000000\n'],
 ['2023-06-19 22:19:40,819 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at resourcemanager/172.19.0.9:8032\n',
  '2023-06-19 22:19:40,882 INFO client.AHSProxy: Connecting to Application History server at historyserver/172.19.0.12:10200\n',
  '2023-06-19 22:19:40,950 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/root/.staging/job_1687211407465_0006\n',
  '2023-06-19 22:19:41,011 INFO input.FileInputFormat: Total input files to process : 2\n',
  '2023-06-19 22:19:41,059 INFO mapreduce.JobSubmitter: number of splits:2\n',
  '2023-06-19 22:19:41,132 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1687211407465_0006\n',
  '2023-06-19 22:19:41,132 INFO mapreduce.JobSubmitter

# Pig

In [4]:
# Upload the Pig script and its input data. The recorded output shows each
# call issuing `hdfs dfs -put ... /examples`, and reporting "File exists"
# when the file is already present.
for example_file in ("examples/pig.pig", "examples/data.jsonl"):
    hdfs_upload(example_file)

hdfs dfs -put /data/master_volume/examples/pig.pig /examples
exit code []
["put: `/examples/pig.pig': File exists\n"]
hdfs dfs -put /data/master_volume/examples/data.jsonl /examples
exit code []
["put: `/examples/data.jsonl': File exists\n"]


In [5]:
# Run the example Pig script with the MapReduce exec type (`-x mapreduce`).
# The script is read from the master volume path, not from HDFS.
pig_script = "/data/master_volume/examples/pig.pig"
run_in_master(f"pig -x mapreduce {pig_script}")

(['(Michał,24)\n'],
 ['2023-06-19 22:19:58,960 INFO pig.ExecTypeProvider: Trying ExecType : LOCAL\n',
  '2023-06-19 22:19:58,961 INFO pig.ExecTypeProvider: Trying ExecType : MAPREDUCE\n',
  '2023-06-19 22:19:58,961 INFO pig.ExecTypeProvider: Picked MAPREDUCE as the ExecType\n',
  '2023-06-19 22:19:59,000 [main] INFO  org.apache.pig.Main - Apache Pig version 0.17.0 (r1797386) compiled Jun 02 2017, 15:41:58\n',
  '2023-06-19 22:19:59,000 [main] INFO  org.apache.pig.Main - Logging error messages to: /app/pig_1687213198994.log\n',
  '2023-06-19 22:19:59,210 [main] INFO  org.apache.pig.impl.util.Utils - Default bootup file /root/.pigbootup not found\n',
  '2023-06-19 22:19:59,248 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - mapred.job.tracker is deprecated. Instead, use mapreduce.jobtracker.address\n',
  '2023-06-19 22:19:59,248 [main] INFO  org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to hadoop file system at: hdfs://namenode:9000\n',
  '20

# Hive

In [6]:
# Upload the employee CSV used by the Hive example (the recorded output shows
# an `hdfs dfs -put` into /examples).
employee_csv = "examples/employee.csv"
hdfs_upload(employee_csv)

hdfs dfs -put /data/master_volume/examples/employee.csv /examples
exit code []
["put: `/examples/employee.csv': File exists\n"]


In [7]:
# Execute the employee_table HQL script on the Hive server.
# NOTE(review): the script body is not shown in this notebook; presumably it
# creates the database/table that the later cells use -- confirm.
table_hql = "/data/master_volume/examples/employee_table.hql"
run_in_hive(f"hive -f {table_hql}")

([],
 ['SLF4J: Class path contains multiple SLF4J bindings.\n',
  'SLF4J: Found binding in [jar:file:/opt/hive/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: Found binding in [jar:file:/opt/hadoop-3.3.1/share/hadoop/common/lib/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.\n',
  'SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]\n',
  'Hive Session ID = 9edf19df-48de-4b88-9cce-0087788a6255\n',
  '\n',
  'Logging initialized using configuration in file:/opt/hive/conf/hive-log4j2.properties Async: true\n',
  'Hive Session ID = a537331a-ef44-46f5-a09e-1e13291a4624\n',
  'OK\n',
  'Time taken: 0.406 seconds\n',
  'OK\n',
  'Time taken: 0.015 seconds\n',
  'OK\n',
  'Time taken: 0.066 seconds\n'])

In [8]:
# Execute the employee_load HQL script; the recorded Hive log reports
# "Loading data to table testdb.employee".
load_hql = "/data/master_volume/examples/employee_load.hql"
run_in_hive(f"hive -f {load_hql}")

([],
 ['SLF4J: Class path contains multiple SLF4J bindings.\n',
  'SLF4J: Found binding in [jar:file:/opt/hive/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: Found binding in [jar:file:/opt/hadoop-3.3.1/share/hadoop/common/lib/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.\n',
  'SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]\n',
  'Hive Session ID = 114f4db2-199b-43dc-8fd2-b0c26dbba9d2\n',
  '\n',
  'Logging initialized using configuration in file:/opt/hive/conf/hive-log4j2.properties Async: true\n',
  'Hive Session ID = c67d3b63-2821-46bc-a683-69c3f31fa3c1\n',
  'OK\n',
  'Time taken: 0.403 seconds\n',
  'Loading data to table testdb.employee\n',
  'OK\n',
  'Time taken: 0.468 seconds\n'])

In [9]:
# Execute the test_group HQL script; the recorded Hive log shows it launching
# one job.
# NOTE(review): presumably a grouping query whose result lands under
# /user/hive/warehouse/results -- verify against the script.
group_hql = "/data/master_volume/examples/test_group.hql"
run_in_hive(f"hive -f {group_hql}")

([],
 ['SLF4J: Class path contains multiple SLF4J bindings.\n',
  'SLF4J: Found binding in [jar:file:/opt/hive/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: Found binding in [jar:file:/opt/hadoop-3.3.1/share/hadoop/common/lib/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.\n',
  'SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]\n',
  'Hive Session ID = 14346eba-3f1f-400b-ab2e-4cc175bee879\n',
  '\n',
  'Logging initialized using configuration in file:/opt/hive/conf/hive-log4j2.properties Async: true\n',
  'Hive Session ID = 3a5dd44e-3f06-4691-b275-0c936e1eda99\n',
  'OK\n',
  'Time taken: 0.396 seconds\n',
  'Query ID = root_20230619222025_f4ccd1bd-b446-44b3-8c2e-c330202b6f88\n',
  'Total jobs = 1\n',
  'Launching Job 1 out of 1\n',
  'Number of reduce tasks not specified. Estimated from input data size: 1\n',

In [10]:
# Show the contents of the results file under the Hive warehouse directory.
hive_results_file = "/user/hive/warehouse/results/000000_0"
print_hdfs_output(hive_results_file)

8824933

14555152

13918264

22224443

83793498

77255506

92432520

23576545

37365571

26259585



# Spark

In [11]:
# Submit the example PySpark script to YARN in cluster deploy mode.
spark_app = "/data/master_volume/examples/spark.py"
run_in_master(f"spark-submit --master yarn --deploy-mode cluster {spark_app}")

([],
 ['23/06/19 22:21:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n',
  '23/06/19 22:21:43 INFO DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at resourcemanager/172.19.0.9:8032\n',
  '23/06/19 22:21:43 INFO AHSProxy: Connecting to Application History server at historyserver/172.19.0.12:10200\n',
  '23/06/19 22:21:43 INFO Configuration: resource-types.xml not found\n',
  "23/06/19 22:21:43 INFO ResourceUtils: Unable to find 'resource-types.xml'.\n",
  '23/06/19 22:21:43 INFO Client: Verifying our application has not requested more than the maximum memory capability of the cluster (4096 MB per container)\n',
  '23/06/19 22:21:43 INFO Client: Will allocate AM container, with 1408 MB memory including 384 MB overhead\n',
  '23/06/19 22:21:43 INFO Client: Setting up container launch context for our AM\n',
  '23/06/19 22:21:43 INFO Client: Setting up the launch environment for our AM conta

In [None]:
# Show the contents of an HDFS results file after the Spark job.
# NOTE(review): this is the same path the Hive section prints, and this cell
# has no recorded execution -- confirm that spark.py really writes here, or
# point this at the Spark job's own output location.
results_file = "/user/hive/warehouse/results/000000_0"
print_hdfs_output(results_file)