# Word Count Problem using Pyspark to work with different file format inputs

In [2]:
import os
os.environ["SPARK_LOCAL_IP"] = "10.0.2.15"


In [1]:
import findspark
findspark.init()

# Reading Text file and performing word count problem
 

In [21]:
from operator import add
from pyspark.sql import SparkSession
from pyspark.sql.functions import split

spark = SparkSession.builder.master("local").appName('Firstprogram').getOrCreate()

# Read the input file and create DataFrame
df = spark.read.text("/home/hdoop/Documents/python/pyspark/input1.txt")

lines = df.rdd.map(lambda r:r[0])

# Split lines into words and perform word count
counts = lines.flatMap(lambda line: line.split(" ")) \
                            .map(lambda word: (word, 1)) \
                            .reduceByKey(add)
# Printing each word with its respective count
output = counts.collect()

for (word,count) in output:
    print ("%s:%i" % (word,count))

# Stop SparkSession
spark.stop()


this:1
is:3
a:2
sample:1
input:1
text:2
file:3
for:1
wordcount:2
program:2
being:1
implemented:1
using:1
pyspark:1
will:1
be:1
stored:1
on:1
hdfs:2
distributed:1
system:1


# Reading CSV file and performing word count problem

In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split

# Create SparkSession
csv_spark = SparkSession.builder.appName("CSVReader").getOrCreate()

# File path
csv_file_path = "data_input.csv"

# Read CSV file
df = csv_spark.read.csv(csv_file_path, header=True, inferSchema=True)
df.show()

text_column = 'Name'

df = df.select(text_column)

# Split the text into words
words_df = df.select(explode(split(df[text_column], " ")).alias("word"))

word_counts = words_df.groupBy("word").count()

# Show the word count results
word_counts.show()

# Stop SparkSession
csv_spark.stop()



                                                                                

+--------+
|    Name|
+--------+
| shubham|
|  shirke|
|shreyash|
|   hello|
|   world|
|       I|
|      am|
|  world |
|      to|
|   hello|
+--------+



                                                                                

+--------+-----+
|    word|count|
+--------+-----+
|shreyash|    1|
|   hello|    2|
| shubham|    1|
|  shirke|    1|
|   world|    2|
|       I|    1|
|        |    1|
|      am|    1|
|      to|    1|
+--------+-----+



# Read Json file and perfrom word count Problem

In [2]:
# Import Libraries
from pyspark.sql import SparkSession

from pyspark.sql.functions import to_json ,explode,split


json_spark = SparkSession.builder.appName('JsonReader').getOrCreate()

json_file_path = "json_input.json"
df = json_spark.read.json(json_file_path)

df.printSchema()
df.show()

text_column = 'word'

df = df.select(text_column)

words_df = df.select(explode(split(df[text_column], " ")).alias("word"))

word_counts = words_df.groupBy("word").count()


#word_counts.show()
output = word_counts.toJSON().collect()

print("Output In Json Format")
for i in output:
    print(i)



json_spark.stop()

root
 |-- word: string (nullable = true)

+----------+
|      word|
+----------+
|     apple|
|    banana|
|    cherry|
|      date|
|elderberry|
|    banana|
|elderberry|
|     guaua|
|     apple|
|    orange|
+----------+

Output In Json Format
{"word":"guaua","count":1}
{"word":"orange","count":1}
{"word":"elderberry","count":2}
{"word":"apple","count":2}
{"word":"cherry","count":1}
{"word":"banana","count":2}
{"word":"date","count":1}
