#### Importing Libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructType, StructField

#### Createing Spark Session

In [2]:
spark = SparkSession.builder.appName("top_max_len_words").master('local[*]').getOrCreate()

In [3]:
df = spark.read.csv("./data/wordcount.txt")

In [4]:
df.show()

+--------------------+
|                 _c0|
+--------------------+
|hadoop spark scal...|
|hadoop scala flum...|
|mapreduce hadoop ...|
|spark scala trans...|
|spark scala rdd t...|
|hadoop spark scal...|
|hadoop scala flum...|
|mapreduce hadoop ...|
|spark scala trans...|
|spark scala rdd t...|
|hadoop spark scal...|
|hadoop scala flum...|
|mapreduce hadoop ...|
|spark scala trans...|
+--------------------+



####  filter out the top 5 words with maximum length

In [9]:
df1 = df.rdd.flatMap(lambda x:x[0].split(" ")).distinct()

In [10]:
print(df1.collect())

['hadoop', 'spark', 'scala', 'flatmap', 'map', 'groupby', 'flume', 'oozie', 'sqoop', 'hive', 'mapreduce', 'hdfs', 'rdd', 'transformations', 'actions']


In [11]:
df2 = df1.map(lambda word:(word, len(word)))

In [12]:
print(df2.collect())

[('hadoop', 6), ('spark', 5), ('scala', 5), ('flatmap', 7), ('map', 3), ('groupby', 7), ('flume', 5), ('oozie', 5), ('sqoop', 5), ('hive', 4), ('mapreduce', 9), ('hdfs', 4), ('rdd', 3), ('transformations', 15), ('actions', 7)]


In [15]:
sorted_by_length = df2.sortBy(lambda x : x[1], False)

In [16]:
sorted_by_length.collect()

[('transformations', 15),
 ('mapreduce', 9),
 ('flatmap', 7),
 ('groupby', 7),
 ('actions', 7),
 ('hadoop', 6),
 ('spark', 5),
 ('scala', 5),
 ('flume', 5),
 ('oozie', 5),
 ('sqoop', 5),
 ('hive', 4),
 ('hdfs', 4),
 ('map', 3),
 ('rdd', 3)]

In [17]:
top_five_max_len_words = sorted_by_length.top(5)

In [18]:
top_five_max_len_words

[('transformations', 15), ('sqoop', 5), ('spark', 5), ('scala', 5), ('rdd', 3)]

In [20]:
schema = StructType([
    StructField("Word", StringType(), False),
    StructField("Length", IntegerType(),False)
])

In [22]:
result = spark.createDataFrame(top_five_max_len_words, schema=schema)

In [23]:
result.show()

+---------------+------+
|           Word|Length|
+---------------+------+
|transformations|    15|
|          sqoop|     5|
|          spark|     5|
|          scala|     5|
|            rdd|     3|
+---------------+------+



#### Ending Session

In [24]:
spark.stop()