Skip to content
Browse files

adding Hive example

  • Loading branch information...
1 parent f27e7df commit 9d50c958d9009a7bd40bad0fa42aaf0c188123cf @ceteri ceteri committed Oct 16, 2012
Showing with 25 additions and 6 deletions.
  1. +11 −6 part2/src/scripts/wc.pig
  2. +14 −0 part2/src/scripts/wc.sql
View
17 part2/src/scripts/wc.pig
@@ -1,15 +1,20 @@
-docPipe = LOAD '$docPath' USING PigStorage('\t', 'tagsource') AS (doc_id, text);
+-- kudos to Dmitriy Ryaboy
+
+docPipe = LOAD '$docPath'
+ USING PigStorage('\t', 'tagsource') AS (doc_id, text);
docPipe = FILTER docPipe BY doc_id != 'doc_id';
--- specify a regex operation to split the "document" text lines into a token stream
-tokenPipe = FOREACH docPipe GENERATE doc_id, FLATTEN(TOKENIZE(text, ' [](),.')) AS token;
+-- specify the delimiter characters that split "document" text lines into a token stream
+tokenPipe = FOREACH docPipe
+ GENERATE doc_id, FLATTEN(TOKENIZE(text, ' [](),.')) AS token;
tokenPipe = FILTER tokenPipe BY token MATCHES '\\w.*';
--- DUMP tokenPipe;
-- determine the word counts
tokenGroups = GROUP tokenPipe BY token;
-wcPipe = FOREACH tokenGroups GENERATE group AS token, COUNT(tokenPipe) AS count;
+wcPipe = FOREACH tokenGroups
+ GENERATE group AS token, COUNT(tokenPipe) AS count;
-- output
-STORE wcPipe INTO '$wcPath' using PigStorage('\t', 'tagsource');
+STORE wcPipe INTO '$wcPath'
+ USING PigStorage('\t', 'tagsource');
EXPLAIN -out dot/wc_pig.dot -dot wcPipe;
View
14 part2/src/scripts/wc.sql
@@ -0,0 +1,14 @@
+-- Steve Severance
+-- http://stackoverflow.com/questions/10039949/word-count-program-in-hive
+
+CREATE TABLE input (line STRING);
+-- load the local file into the table, replacing any rows already there
+LOAD DATA LOCAL INPATH 'input.tsv'
+OVERWRITE INTO TABLE input;
+-- word count: split the `line` column on spaces, one output row per word
+SELECT
+ word, COUNT(*)
+FROM input
+ LATERAL VIEW explode(split(line, ' ')) lTable AS word
+GROUP BY word
+;

0 comments on commit 9d50c95

Please sign in to comment.
Something went wrong with that request. Please try again.