"""<br>
    @Author: Deven Gupta<br>
    @Date: 4-09-2024<br>
    @Last Modified by: Deven Gupta<br>
    @Last Modified time: 4-09-2024<br>
    @Title : Perform word count using SparkContext (text, CSV and JSON files)<br>
<br>
"""

In [45]:
from pyspark import SparkContext,SparkConf

In [46]:
# Name the Spark application.
conf = SparkConf().setAppName("WordCount")

# getOrCreate() returns the already-running SparkContext when one exists, so
# re-running this notebook cell no longer fails with
# "Cannot run multiple SparkContexts at once". NOTE: when a context already
# exists it is reused as-is and `conf` is not applied to it.
sc = SparkContext.getOrCreate(conf=conf)

In [47]:
# Bare expression: as the last statement of a notebook cell this displays the
# SparkContext object so it can be inspected.
sc

## <Center><h3 style="background:white;color:blue;font-weight:bold">WordCount using Text file (SparkContext)</h3></Center>

In [5]:
# Read the input text file into an RDD (one RDD element per line of the file)
input_rdd = sc.textFile("/Files/file.txt")

In [7]:
# Bring every line of the RDD back to the driver as a Python list for inspection.
# NOTE(review): collect() materialises the whole dataset on the driver, so this
# is only appropriate for small sample files.
input_rdd.collect()

['Hello my name is deven and my friend name is ayush and prayag and shiv']

In [8]:
# Tokenise: break every line on whitespace and flatten into a single RDD of words
words = input_rdd.flatMap(lambda text_line: text_line.split())

# Pair every token with an initial count of 1 -> (token, 1)
word_pairs = words.map(lambda token: (token, 1))

# Aggregate: add up the 1s of identical tokens to get the total per word
word_counts = word_pairs.reduceByKey(lambda total, increment: total + increment)


In [9]:
# Fetch the (word, count) pairs to the driver and print one "word: count" line each
results = word_counts.collect()
for entry in results:
    word, count = entry
    print(f"{word}: {count}")


Hello: 1
name: 2
is: 2
deven: 1
ayush: 1
prayag: 1
shiv: 1
my: 2
and: 3
friend: 1


## <Center><h3 style="background:white;color:blue;font-weight:bold">WordCount using CSV file (SparkContext)</h3></Center>



In [12]:
# Read the input csv file into an RDD of raw text lines (no CSV parsing happens here)
csv_rdd = sc.textFile("/Files/CRUD.csv")
# Show all lines on the driver; the first one is the column header
csv_rdd.collect()

['EmployeeID,FirstName,LastName,Emp_Dept,Salary',
 '1,John,Doe,Comps,50000',
 '2,Jane,Smith,IT,60000',
 '3,Bob,Johnson,IT,55000',
 '4,Alice,Williams,Mech,70000',
 '5,Charlie,Brown,Comps,45000']

In [16]:
# The first line of the file carries the column names rather than data
header = csv_rdd.first()
# Keep only the lines that differ from the header line
data_rdd = csv_rdd.filter(lambda row: row != header)

# Tokenise: cut every remaining line at the commas and flatten into one RDD of fields
words = data_rdd.flatMap(lambda row: row.split(","))

# Pair every field with an initial count of 1 -> (field, 1)
word_pairs = words.map(lambda field: (field, 1))

# Aggregate: add up the 1s of identical fields to get the total per value
word_counts = word_pairs.reduceByKey(lambda total, increment: total + increment)


In [20]:
# Fetch the (word, count) pairs to the driver and print one "word: count" line each
results = word_counts.collect()
for entry in results:
    word, count = entry
    print(f"{word}: {count}")


1: 1
Doe: 1
Jane: 1
Smith: 1
60000: 1
Johnson: 1
55000: 1
4: 1
Williams: 1
Mech: 1
70000: 1
45000: 1
John: 1
Comps: 2
50000: 1
2: 1
IT: 2
3: 1
Bob: 1
Alice: 1
5: 1
Charlie: 1
Brown: 1


## <Center><h3 style="background:white;color:blue;font-weight:bold">WordCount using JSON file (SparkContext)</h3></Center>

In [72]:
import json

# Read JSON file as text RDD: each element is one raw line of the file,
# still an unparsed string at this point
json_rdd = sc.textFile("/Files/file.json")
# Show the raw lines on the driver
json_rdd.collect()

['{"text":"Hello my name is Deven Hello my name is shiv Hello my name is ayush"}']

In [73]:
# Decode each raw line as a standalone JSON document (one Python object per line)
parsed_rdd = json_rdd.map(json.loads)

In [74]:
# Take the 'text' value of every parsed record, split it on whitespace and
# flatten everything into a single RDD of words
texts = parsed_rdd.flatMap(lambda record: record['text'].split())

# Pair every token with an initial count of 1 -> (token, 1)
word_pairs = texts.map(lambda token: (token, 1))

# Aggregate: add up the 1s of identical tokens to get the total per word
word_counts = word_pairs.reduceByKey(lambda total, increment: total + increment)

In [75]:
# Fetch the (word, count) pairs to the driver and print one "word: count" line each
results = word_counts.collect()
for entry in results:
    word, count = entry
    print(f"{word}: {count}")

Hello: 3
name: 3
is: 3
Deven: 1
shiv: 1
ayush: 1
my: 3


In [76]:
# Shut down the SparkContext now that all word counts are done; RDDs created
# from it can no longer be evaluated after this call.
sc.stop()