In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructType, StructField

##### Creating spark session

In [None]:
spark = SparkSession.builder.appName("WordCount").master('local[*]').getOrCreate()

In [2]:
df = spark.read.csv("./data/wordcount.txt")

In [3]:
df.show()

+--------------------+
|                 _c0|
+--------------------+
|hadoop spark scal...|
|hadoop scala flum...|
|mapreduce hadoop ...|
|spark scala trans...|
|spark scala rdd t...|
|hadoop spark scal...|
|hadoop scala flum...|
|mapreduce hadoop ...|
|spark scala trans...|
|spark scala rdd t...|
|hadoop spark scal...|
|hadoop scala flum...|
|mapreduce hadoop ...|
|spark scala trans...|
+--------------------+



#### Counting words in the text file

In [4]:
df1 = df.rdd.flatMap(lambda x:x[0].split(" "))

In [6]:
print(df1.collect())

['hadoop', 'spark', 'scala', 'flatmap', 'map', 'groupby', 'spark', 'spark', 'hadoop', 'scala', 'flume', 'oozie', 'sqoop', 'hive', 'hive', 'spark', 'spark', 'mapreduce', 'hadoop', 'hive', 'hadoop', 'hdfs', 'flatmap', 'rdd', 'scala', 'spark', 'scala', 'transformations', 'actions', 'rdd', 'rdd', 'rdd', 'rdd', 'spark', 'scala', 'rdd', 'transformations', 'actions', 'rdd', 'rdd', 'rdd', 'hadoop', 'spark', 'scala', 'flatmap', 'map', 'groupby', 'spark', 'spark', 'hadoop', 'scala', 'flume', 'oozie', 'sqoop', 'hive', 'hive', 'spark', 'spark', 'mapreduce', 'hadoop', 'hive', 'hadoop', 'hdfs', 'flatmap', 'rdd', 'scala', 'spark', 'scala', 'transformations', 'actions', 'rdd', 'rdd', 'rdd', 'rdd', 'spark', 'scala', 'rdd', 'transformations', 'actions', 'rdd', 'rdd', 'rdd', 'hadoop', 'spark', 'scala', 'flatmap', 'map', 'groupby', 'spark', 'spark', 'hadoop', 'scala', 'flume', 'oozie', 'sqoop', 'hive', 'hive', 'spark', 'spark', 'mapreduce', 'hadoop', 'hive', 'hadoop', 'hdfs', 'flatmap', 'rdd', 'scala', 's

In [7]:
df2 = df1.map(lambda word:(word, 1)).reduceByKey(lambda a,b: a+b)

In [8]:
df2.collect()

[('hadoop', 12),
 ('spark', 20),
 ('scala', 14),
 ('flatmap', 6),
 ('map', 3),
 ('groupby', 3),
 ('flume', 3),
 ('oozie', 3),
 ('sqoop', 3),
 ('hive', 9),
 ('mapreduce', 3),
 ('hdfs', 3),
 ('rdd', 23),
 ('transformations', 5),
 ('actions', 5)]

In [20]:
schema = StructType([
    StructField("Word", StringType(), False),
    StructField("Freq", IntegerType(),False)
])
data_freq = spark.createDataFrame(df2, schema=schema)

In [21]:
data_freq.show()

+---------------+----+
|           Word|Freq|
+---------------+----+
|         hadoop|  12|
|          spark|  20|
|          scala|  14|
|        flatmap|   6|
|            map|   3|
|        groupby|   3|
|          flume|   3|
|          oozie|   3|
|          sqoop|   3|
|           hive|   9|
|      mapreduce|   3|
|           hdfs|   3|
|            rdd|  23|
|transformations|   5|
|        actions|   5|
+---------------+----+



#### filter out all words that occurs less than a given threshold (taken 3 as threshold)

In [28]:
filtered_df2 = df2.filter(lambda a : a[1] > 3)
filtered_df2.collect()

[('hadoop', 12),
 ('spark', 20),
 ('scala', 14),
 ('flatmap', 6),
 ('hive', 9),
 ('rdd', 23),
 ('transformations', 5),
 ('actions', 5)]

#### counting letter freq from filtered RDD

In [39]:
df3=  filtered_df2.map(lambda a: a[0])

In [40]:
letters = df3.flatMap(lambda x : list(x))

In [41]:
letters_freq = letters.map(lambda x: (x,1)).reduceByKey(lambda a,b : a+b)

In [42]:
letters_freq.collect()

[('h', 2),
 ('a', 9),
 ('d', 3),
 ('o', 5),
 ('p', 3),
 ('s', 5),
 ('r', 4),
 ('k', 1),
 ('c', 2),
 ('l', 2),
 ('f', 2),
 ('t', 4),
 ('m', 2),
 ('i', 3),
 ('v', 1),
 ('e', 1),
 ('n', 3)]

In [43]:
schema = StructType([
    StructField("Letter", StringType(), False),
    StructField("Freq", IntegerType(),False)
])
letters_freq_df = spark.createDataFrame(letters_freq, schema=schema)

In [44]:
letters_freq_df.show()

+------+----+
|Letter|Freq|
+------+----+
|     h|   2|
|     a|   9|
|     d|   3|
|     o|   5|
|     p|   3|
|     s|   5|
|     r|   4|
|     k|   1|
|     c|   2|
|     l|   2|
|     f|   2|
|     t|   4|
|     m|   2|
|     i|   3|
|     v|   1|
|     e|   1|
|     n|   3|
+------+----+



#### Ending Spark Session

In [45]:
spark.stop()