In [13]:
## Los datos son leidos del sistema HDFS de Hadoop.
## Los resultdos son guardados en una carpeta del sistema Hadoop.
## El script se almacena en un archivo en el disco duro, para su uso posterior.

In [14]:
## Copia Archivos a sistema HDFS
##
## Se usan un directorio temporal en el HDFS. La siguiente
## instrucción muestra el contenido del dicho directorio
##
!hdfs dfs -ls /tmp

Found 3 items
drwxrwx---   - root supergroup          0 2019-12-13 19:19 /tmp/hadoop-yarn
drwxrwxrwx   - root supergroup          0 2019-12-13 19:30 /tmp/hive
drwxr-xr-x   - root supergroup          0 2019-12-13 20:15 /tmp/wordcount


In [18]:
##
## Crea la carpeta wordcount en el hdfs
##
!hdfs dfs -mkdir /tmp/wordcount

mkdir: `/tmp/wordcount': File exists


In [19]:
##
## Verifica la creación de la carpeta
##
!hdfs dfs -ls /tmp/

Found 3 items
drwxrwx---   - root supergroup          0 2019-12-13 19:19 /tmp/hadoop-yarn
drwxrwxrwx   - root supergroup          0 2019-12-13 19:30 /tmp/hive
drwxr-xr-x   - root supergroup          0 2019-12-13 20:15 /tmp/wordcount


In [20]:
##
## Copia los archvios del directorio local wordcount/
## al directorio /tmp/wordcount/ en el hdfs
##
!hdfs dfs -copyFromLocal wordcount/*  /tmp/wordcount/

In [21]:
##
## Verifica que los archivos esten copiados
## en el hdfs
##
!hdfs dfs -ls /tmp/wordcount

Found 3 items
-rw-r--r--   1 root supergroup       1082 2019-12-13 20:41 /tmp/wordcount/text0.txt
-rw-r--r--   1 root supergroup        349 2019-12-13 20:41 /tmp/wordcount/text1.txt
-rw-r--r--   1 root supergroup        435 2019-12-13 20:41 /tmp/wordcount/text2.txt


In [22]:
####################################################################################################################
## Generación del script y ajuste del código

In [23]:
%%writefile wordcount.hql

DROP TABLE IF EXISTS docs;
DROP TABLE IF EXISTS word_counts;

CREATE TABLE docs (line STRING);

LOAD DATA INPATH "/tmp/wordcount/" OVERWRITE INTO TABLE docs;

CREATE TABLE word_counts
AS
    SELECT word, count(1) AS count
    FROM
        (SELECT explode(split(line, '\\s')) AS word FROM docs) w
GROUP BY
    word
ORDER BY
    word;

INSERT OVERWRITE DIRECTORY '/tmp/output'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
SELECT * FROM word_counts;


Overwriting wordcount.hql


In [24]:
## Ejecucion
!hive -S -e 'source wordcount.hql'

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/local/hive/lib/log4j-slf4j-impl-2.6.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/local/hadoop/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]


In [25]:
## Donde -S indica que Hive se ejecute en modo silencioso; -e que se ejecute la expresión source wordcount.hql. Lo anterior es equivalente a abrir Hive y luego ejecutar:
##      hive> source 'wordcount.hql'

In [26]:
## Visualización de los resultados

In [27]:
## Se lista el contenido del archivo.
!hdfs dfs -ls /tmp/output

Found 1 items
-rwxrwxrwx   1 root supergroup       1652 2019-12-13 20:42 /tmp/output/000000_0


In [28]:
## se visualiza la cabecera del archivo.
!hdfs dfs -cat /tmp/output/000000_0 | head

,1
(DA),1
(see,1
Analytics,2
Analytics,,1
Big,1
Data,3
Especially,1
Organizations,1
Since,1


In [29]:
## Copia de los resultados a la maquina local (vagrant)

In [30]:
!hadoop fs -copyToLocal /tmp/output output
!ls output/*

output/000000_0


In [31]:
!cat output/000000_0

,1
(DA),1
(see,1
Analytics,2
Analytics,,1
Big,1
Data,3
Especially,1
Organizations,1
Since,1
Specifically,,1
The,2
a,1
about,1
aid,1
algorithms,1
analysis,,1
analysis.,1
analytics,8
analytics,,8
analytics.,1
analyze,1
and,15
application,1
apply,1
are,1
areas,2
assortment,1
be,1
big,1
business,4
by,2
call,1
can,2
certain,1
changes.,1
cognitive,1
commercial,1
communication,1
computation,1
computer,2
conclusions,1
contain,,1
credit,1
current,1
data,4
data),,1
data.,1
decision,1
decisions,2
describe,,1
descriptive,1
discovery,,1
disprove,1
draw,1
effects,1
enable,1
enterprise,1
evaluate,1
events,,1
examining,1
extensive,1
field,1
for,1
force,1
fraud,1
gaining,1
given,1
goal,1
harness,1
historical,1
hypotheses.,1
improve,2
improvements,1
in,5
include,1
increasingly,1
industries,1
information,1
information,,1
interpretation,,1
involves,1
is,3
knowledge,1
make,2
management,,1
marketing,2
mathematics.,1
may,1
meaningful,1
methods,1
mix,1
modeling,,2
models,,1
more-informed,1
most,1
of,8
often,1

In [None]:
## Otra forma de extraer los resultados es: $ hive -S -e 'SELECT * FROM word_counts;' > result.csv

In [32]:
!rm -rf output wordcount *.log