In [None]:
import time
from itertools import combinations

from pyspark import SparkContext,SparkConf

# Stop any SparkContext left over from a previous run of this cell so that a
# fresh one can be created below.
try:
    sc.stop()
except Exception:
    # Best-effort cleanup: on a fresh kernel `sc` is not defined yet (NameError).
    # Catching Exception instead of using a bare `except:` keeps
    # KeyboardInterrupt / SystemExit propagating.
    pass

# create a SparkContext
conf = SparkConf().setAppName("PageCounts")
sc = SparkContext(conf=conf)

# load the page view statistics data into an RDD of text lines;
# the Problem 1-3 cells iterate over this `rdd`
rdd = sc.textFile("pagecounts-20160101-000000_parsed.out")

In [None]:
#Problem 1 -> Get min, max, and average of page size
#
# Iterates over every line of `rdd` in a plain Python loop, folds the page
# size (4th whitespace-separated field) into running statistics, and writes
# the result to "sparkLoops.txt" (the file is truncated here; later cells append).

# start the timer
start_time = time.time()

# running statistics; inf / -inf are sentinels that any real size replaces
min_size = float('inf')
max_size = float('-inf')
total_size = 0
count = 0

# loop through each line in the RDD
for line in rdd.toLocalIterator():
    fields = line.split()
    if len(fields) < 4:
        # not enough columns to contain a page size
        continue
    try:
        size = int(fields[3])
    except ValueError:
        # skip lines with invalid page size
        continue
    if size < 0:
        # negative sizes are ignored
        continue

    # update min, max, and total size
    min_size = min(min_size, size)
    max_size = max(max_size, size)
    total_size += size
    count += 1

# compute the average page size
if count > 0:
    avg_size = total_size / count
else:
    # No valid rows at all: report None instead of leaking the inf / -inf
    # sentinels and a made-up average of 0 into the output file.
    min_size = max_size = avg_size = None

# stop the timer; elapsed_time stays a float (seconds)
end_time = time.time()
elapsed_time = end_time - start_time

with open("sparkLoops.txt", "w", encoding="utf-8") as f:
    f.write("Spark Loops\n")
    f.write("---------------------- Problem 1 ------------------ \n")
    f.write("Elapsed time:" + str(elapsed_time) + " seconds \n")
    f.write("Min page size: {}\n".format(min_size))
    f.write("Max page size: {}\n".format(max_size))
    f.write("Avg page size: {}\n".format(avg_size))
    f.write("______________________________________________________\n")

In [None]:
#Problem 2 -> Get the number of pages that start with "The" and are not part of the English project
#
# Iterates over every line of `rdd` in a plain Python loop; field 0 is compared
# against the project code "en" and field 1 is treated as the page title.
# The result is appended to "sparkLoops.txt".

# start the timer
start_time = time.time()

# initialize variables
num_the_pages = 0      # titles starting with "The_" (any project); not written to the report
num_non_en_pages = 0   # subset of the above whose project code is not "en"

# loop through each line in the RDD
for line in rdd.toLocalIterator():
    # split the line into fields
    fields = line.split()

    # Blank or truncated lines have no title field; skip them instead of
    # raising IndexError on fields[1].
    if len(fields) < 2:
        continue

    # check if the page title starts with "The"
    # NOTE(review): the "The_" prefix does not match a title that is exactly
    # "The" — confirm this is intended.
    if not fields[1].startswith("The_"):
        continue
    num_the_pages += 1

    # check if the page is not part of the English project
    if fields[0] != "en":
        num_non_en_pages += 1

# end the timer; elapsed_time stays a float (seconds)
end_time = time.time()
elapsed_time = end_time - start_time

with open("sparkLoops.txt", "a", encoding="utf-8") as f:
    f.write("---------------------- Problem 2 ------------------ \n")
    f.write("Elapsed time:" + str(elapsed_time) + " seconds \n")
    f.write("Number of non-English pages starting with 'The': {}\n".format(num_non_en_pages))
    f.write("______________________________________________________\n")

In [None]:
#Problem 3 -> Get the number of unique terms in the page titles
#
# Iterates over every line of `rdd` in a plain Python loop, splits the title
# (2nd whitespace-separated field) on "_" and collects the pieces in a set.
# The result is appended to "sparkLoops.txt".

# start the timer
start_time = time.time()

# initialize variables
unique_terms = set()

# loop through each line in the RDD
for line in rdd.toLocalIterator():
    fields = line.split()

    # Blank or truncated lines have no title field; skip them instead of
    # raising IndexError on fields[1].
    if len(fields) < 2:
        continue

    # split the title into terms and add them all to the set.
    # Note: leading, trailing or doubled underscores produce an empty-string
    # term, which is counted like any other term.
    unique_terms.update(fields[1].split('_'))

# count the number of unique terms
num_terms = len(unique_terms)

# end the timer; elapsed_time stays a float (seconds)
end_time = time.time()
elapsed_time = end_time - start_time

with open("sparkLoops.txt", "a", encoding="utf-8") as f:
    f.write("---------------------- Problem 3 ------------------ \n")
    f.write("Elapsed time:" + str(elapsed_time) + " seconds \n")
    f.write("Number of unique  terms in the page titles :{}\n".format(num_terms))
    f.write("______________________________________________________\n")

In [None]:
#Problem 4 -> Get how many times a certain title appears in the dataset
#
# Recreates the SparkContext, iterates over every line of the input file in a
# plain Python loop and counts how many lines carry each title (2nd
# space-separated field). The counts are appended to "sparkLoops.txt".

# Stop the existing SparkContext (if it exists)
try:
    sc.stop()
except Exception:
    # Best-effort cleanup: `sc` may be undefined on a fresh kernel (NameError).
    # Catching Exception instead of using a bare `except:` keeps
    # KeyboardInterrupt / SystemExit propagating.
    pass

# Configure and initialize Spark
conf = SparkConf().setAppName("PageStatistics")
sc = SparkContext(conf=conf)

# start the timer
start_time = time.time()

# Load the page view statistics into an RDD of text lines
page_views = sc.textFile("pagecounts-20160101-000000_parsed.out")

# title -> number of lines in which that title appears
title_counts = {}
# Loop over each line in the page view statistics
for line in page_views.toLocalIterator():
    # Split the line into fields
    fields = line.split(" ")
    # A blank line splits to [''], which has no title field; skip such lines
    # instead of raising IndexError on fields[1].
    if len(fields) < 2:
        continue
    title = fields[1]
    # First occurrence starts at 1, later occurrences increment the count
    title_counts[title] = title_counts.get(title, 0) + 1

# end the timer; elapsed_time stays a float (seconds)
end_time = time.time()
elapsed_time = end_time - start_time

# Write the title counts to a file
with open("sparkLoops.txt", "a", encoding="utf-8") as f:
    f.write("---------------------- Problem 4 ------------------ \n")
    f.write("Elapsed time:" + str(elapsed_time) + " seconds \n")
    f.write("Number of times a certain title appears in the dataset :\n")
    # `occurrences` (not `count`) so the Problem 1 row counter is not clobbered
    for title, occurrences in title_counts.items():
        f.write("{}: {}\n".format(title, occurrences))
    f.write("______________________________________________________\n")

# Stop the SparkContext to release resources
sc.stop()

In [None]:
#Problem 5 -> Get pairs of pages with the same title and combination of pairs
#
# Recreates the SparkContext, iterates over every line of the input file in a
# plain Python loop, groups the records by title (2nd space-separated field)
# and emits every unordered pair of records that share a title. The output is
# appended to "sparkLoops.txt".

# Stop the existing SparkContext (if it exists)
try:
    sc.stop()
except Exception:
    # Best-effort cleanup: `sc` may be undefined on a fresh kernel (NameError).
    # Catching Exception instead of using a bare `except:` keeps
    # KeyboardInterrupt / SystemExit propagating.
    pass

conf = SparkConf().setAppName("PageCounts")
sc = SparkContext(conf=conf)

# Start the timer
start_time = time.time()

lines = sc.textFile("pagecounts-20160101-000000_parsed.out")

# title -> list of (field0, title, int(field2), int(field3)) records, built in
# a single pass instead of materialising intermediate parsed/converted lists
combined_data_dict = {}
for line in lines.toLocalIterator():
    fields = line.split(" ")
    if len(fields) < 4:
        # not enough columns for a full record
        continue
    try:
        record = (fields[0], fields[1], int(fields[2]), int(fields[3]))
    except ValueError:
        # skip rows whose numeric columns are not integers, as Problem 1 does
        continue
    combined_data_dict.setdefault(record[1], []).append(record)

# Format the output: one "Title:" header per title (even when it has no pair),
# followed by one line per pair of records sharing that title.
# combinations(value, 2) yields (value[i], value[j]) for i < j in the same
# order as a nested index loop.
output_data = []
for key, value in combined_data_dict.items():
    output_data.append("Title: {}\n".format(key))
    for pair in combinations(value, 2):
        output_data.append("{{{}}}\n".format("\t".join(map(str, pair))))

# end the timer; elapsed_time stays a float (seconds)
end_time = time.time()
elapsed_time = end_time - start_time

with open("sparkLoops.txt", "a", encoding="utf-8") as f:
    f.write("---------------------- Problem 5 ------------------ \n")
    f.write("Elapsed time:" + str(elapsed_time) + " seconds \n")
    f.writelines(output_data)

# Stop the Spark context to release resources
sc.stop()