"""<br>
    @Author: Deven Gupta<br>
    @Date: 3-09-2024<br>
    @Last Modified by: Deven Gupta<br>
    @Last Modified time: 3-09-2024<br>
    @Title : Python program in pyspark for wordcount using multiple file format<br>
<br>
"""<br>

In [23]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('pyspark').master("local[*]").getOrCreate()
from pyspark.sql.functions import explode, split, col, concat_ws, lower

In [7]:
spark

## <Center><p style="background:white ; color:blue ; font-weight:bold">Wordcount using Text file</p></Center>

In [8]:
df_text=spark.read.text("Files/input.txt")
df_text.show()


words_df = df_text.select(explode(split(col("value"), " ")).alias("word"))
word_counts_df = words_df.groupBy("word").count()
word_counts_df.show()


+--------------------+
|               value|
+--------------------+
|Hello my name is ...|
|my brother name i...|
|and my friends na...|
+--------------------+

+-------+-----+
|   word|count|
+-------+-----+
| aayush|    1|
|   name|    3|
|  sujal|    1|
|friends|    1|
| prayag|    1|
|      .|    1|
|     is|    2|
|   shiv|    1|
|  Hello|    1|
|     my|    3|
|    and|    2|
|    are|    1|
|  deven|    1|
|      ,|    1|
|brother|    1|
|       |    1|
+-------+-----+



##   <Center><p style="background:white ; color:blue ; font-weight:bold">WordCount using CSV File</p></Center>

In [27]:
df_csv=spark.read.csv("Files/input.csv",header=True,inferSchema=True)
df_csv.printSchema()
df_csv.show()


combined_df = df_csv.withColumn("combined_text", concat_ws(" ", col("text1"), col("text2")))
combined_df.show()

words_df = combined_df.select(explode(split(lower(col("combined_text")), " ")).alias("word"))
word_counts_df = words_df.groupBy("word").count()
word_counts_df.show()



root
 |-- id: integer (nullable = true)
 |-- text1: string (nullable = true)
 |-- text2: string (nullable = true)
 |-- author: string (nullable = true)

+---+--------------------+--------------------+------+
| id|               text1|               text2|author|
+---+--------------------+--------------------+------+
|  1|This is a sample ...|Another part of t...| Alice|
|  2|      More text here|     Additional text|   Bob|
+---+--------------------+--------------------+------+

+---+--------------------+--------------------+------+--------------------+
| id|               text1|               text2|author|       combined_text|
+---+--------------------+--------------------+------+--------------------+
|  1|This is a sample ...|Another part of t...| Alice|This is a sample ...|
|  2|      More text here|     Additional text|   Bob|More text here Ad...|
+---+--------------------+--------------------+------+--------------------+

+----------+-----+
|      word|count|
+----------+-----+
| 

## <Center><p style="background:white ; color:blue ; font-weight:bold">Wordcount using JSON</p></Center>

In [20]:
df_json = spark.read.json("Files/input.json")
df_json.printSchema()
df_json.show()

words_df = df_json.select(explode(split((col("text")), " ")).alias("word"))
word_counts_df = words_df.groupBy("word").count()
word_counts_df.show()

root
 |-- _corrupt_record: string (nullable = true)
 |-- text: string (nullable = true)

+---------------+--------------------+
|_corrupt_record|                text|
+---------------+--------------------+
|              [|                NULL|
|           NULL|Hello my name is ...|
|              ]|                NULL|
+---------------+--------------------+

+-------+-----+
|   word|count|
+-------+-----+
| aayush|    1|
|   name|    3|
|  sujal|    1|
|friends|    1|
| prayag|    1|
|      .|    1|
|     is|    2|
|   shiv|    1|
|  Hello|    1|
|     my|    3|
|    and|    2|
|    are|    1|
|  deven|    1|
|      ,|    1|
|brother|    1|
|       |    1|
+-------+-----+



In [28]:
spark.stop()