# **Mount Drive**

In [32]:
from google.colab import drive
import os
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/Colab Notebooks/Spark')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### In order for Python to find the Spark, download the findspark library and start it with findspark.init() function.

In [33]:
# !pip install pyspark

### In order to work with RDDs, we need to create a SparkContext.

In [34]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
import time


## Since we write local [*] in the master, it will use all cores in our machine. If we said local [4] it will work with 4 cores.

## getOrCreate is used to create a SparkSession if not present.

In [35]:
spark=SparkSession.builder\
    .master("local[*]")\
    .appName("CardCount")\
    .getOrCreate()

In [36]:
sc=spark.sparkContext

## Read Data - in.csv

In [37]:
veri_dosyasi="Names_Count.txt"

In [38]:
card_count_rdd = sc.textFile(veri_dosyasi)

In [39]:
# card_count_rdd.take(100)
card_count_rdd.take(5)

['Ahmed', 'Ali', 'Mohamed', 'Hussein', 'Hussein']

In [40]:
card_count_rdd.count()

4096

## Apply ReduceByKey

In [50]:
# Define the map function
def map_name_to_pair(name):
    return (name, 1)

# Define the reduce function
def reduce_counts(a, b):
    return a + b

# Map
map_start_time = time.time()
mapped_rdd = card_count_rdd.map(map_name_to_pair)
map_end_time = time.time()
# Calculate map operation time and format to 5 decimal places
map_time = (map_end_time - map_start_time) * 1000
print("Map time:", "{:.5f}".format(map_time), "ms")

# Reduce
reduce_start_time = time.time()
name_counts = mapped_rdd.reduceByKey(reduce_counts)
reduce_end_time = time.time()
# Calculate reduce operation time and format to 5 decimal places
reduce_time = (reduce_end_time - reduce_start_time) * 1000
print("Reduce time:", "{:.5f}".format(reduce_time), "ms")


print("Reduce time:", "{:.5f}".format(map_time + reduce_time), "ms")

Map time: 8.29220 ms
Reduce time: 82.71527 ms
Reduce time: 91.00747 ms


In [42]:
# Collect the results to the driver
results = name_counts.collect()

In [43]:
# Print the results
for name, count in results:
    print(f"{name}: {count}")

Ahmed: 3
Hussein: 1348
Amgad: 3
Zaki: 968
Salah: 3
Rami: 785
Ali: 980
Mohamed: 6


In [44]:
# Save the results to a text file
# output_directory = "./output"
# name_counts.saveAsTextFile(output_directory)

In [45]:
# Save in a single file
collected_data = name_counts.collect()
formatted_strings = [f"('{key}', {value})" for key, value in collected_data]
output_string = '\n'.join(formatted_strings)
output_file_path = './output.txt'
with open(output_file_path, 'w') as output_file:
    output_file.write(output_string)