# Step1: Install Libraries

In [198]:
!pip install wikipedia
!pip install pyspark



# Step2: Import Libraries

In [199]:
import wikipedia
import pyspark
import os
import sys
import shutil
import re

# Step3: Get the Wikipedia dataset

In [261]:
# Fetch the article by its exact title. auto_suggest=False stops the
# `wikipedia` package from substituting a search suggestion for the title,
# which can resolve to a different page or fail with PageError.
article_raw = wikipedia.page('Python (programming language)', auto_suggest=False).content
# Tiny hand-written sample, handy for checking the n-gram logic by eye.
dataset2 = ["to be or not to be this is the problem"]

# `\s+` already matches newlines and tabs, so a single pass is enough to
# collapse every run of whitespace into one space.
article_text = re.sub(r'\s+', ' ', article_raw.strip())
# Wrap the text in a one-element list so it can be handed to sc.parallelize,
# which turns each list element into one RDD record.
dataset1 = [article_text]

# Step4: Define Ngram functions

In [262]:
def ngrams(line, N):
    """Return every contiguous word N-gram found in ``line``.

    The line is split on whitespace and a window of ``N`` words is slid over
    the result one word at a time.

    Args:
        line: Text to tokenize (split on any whitespace).
        N: Number of words per n-gram; must be at least 1.

    Returns:
        A list of tuples, each holding ``N`` consecutive words, in order of
        appearance. The list is empty when the line has fewer than ``N`` words.

    Raises:
        ValueError: If ``N`` is smaller than 1. Without this check ``N == 0``
            would yield a list of empty tuples and a negative ``N`` would yield
            meaningless slices.
    """
    if N < 1:
        raise ValueError("N must be a positive integer, got %r" % (N,))
    words = line.split()
    # range() is empty when len(words) < N, so short lines give [].
    return [tuple(words[start:start + N]) for start in range(len(words) - N + 1)]

def generate_ngrams(rdd, n):
    """Count the word n-grams in an RDD of text lines, most frequent first.

    Args:
        rdd: RDD whose elements are strings; each one is passed to ``ngrams``.
        n: Number of words per n-gram.

    Returns:
        An RDD of ``(ngram_tuple, count)`` pairs sorted by count, descending.
    """
    # Explode every line into its individual n-gram tuples.
    grams = rdd.flatMap(lambda text: ngrams(text, n))
    # Pair each n-gram with 1 and add the ones up per distinct n-gram.
    totals = grams.map(lambda gram: (gram, 1)).reduceByKey(lambda a, b: a + b)
    # Order by the count (second element of each pair), largest first.
    return totals.sortBy(lambda pair: pair[1], ascending=False)


# Step5: Set Paths

In [217]:
# Base working directory; the other paths are derived from it.
home = "/content/"
# Directory handed to Spark as its event-log location.
log = f"{home}log/"
# Path Spark writes the n-gram counts to (Spark creates it as a directory).
output_dir = f"{home}output.txt"

# Step6: Make directory for log folder

In [218]:
# Derive the directory from `log` (set in the paths cell) instead of repeating
# the literal, so the folder created here is always the one Spark is told to
# write its event log to. normpath only drops the trailing slash.
path = os.path.normpath(log)
os.makedirs(path, exist_ok=True)

# Step7: Set variables

In [219]:
# Number of local worker threads; used to build the "local[N]" master URL.
worker_count = 2
# Measure the actual text in UTF-8 bytes. sys.getsizeof(dataset2) would only
# report the memory footprint of the list object itself, not of the strings
# it holds, so it says nothing about the amount of data.
# NOTE(review): this sizes dataset2, while the run cell processes dataset1 - confirm which is intended.
file_size = sum(len(text.encode("utf-8")) for text in dataset2)
# Target amount of data per partition, in the same unit as file_size (bytes).
partition_size = 30
# Integer division, with a floor of one partition for very small inputs.
partition_count = max(1, file_size // partition_size)

# Step8: Manage and configure the SparkContext

In [220]:
# Spark configuration: application name, a local master running
# `worker_count` threads, and event logging written under `log`.
conf = (
    pyspark.SparkConf()
    .setAppName("word_counter")
    .setMaster(f"local[{worker_count}]")
    .set("spark.eventLog.enabled", "true")
    .set("spark.eventLog.dir", log)
)

# Build (or reuse) the session with the memory settings plus the conf above,
# then keep a handle on its SparkContext for the RDD API.
spark = (
    pyspark.sql.SparkSession.builder
    .config("spark.executor.memory", "10g")
    .config("spark.driver.memory", "10g")
    .config(conf=conf)
    .getOrCreate()
)
sc = spark.sparkContext

# Step9: Run the program

In [264]:
rdd = sc.parallelize(dataset1)
# The counts are consumed twice below (collect, then saveAsTextFile). cache()
# keeps the sorted result after the first action so the second one reuses it
# instead of re-running the pipeline; this also guarantees that n-grams with
# equal counts appear in the same order in the printout and in the saved file.
counts = generate_ngrams(rdd, 2).cache()
words = counts.collect()
print(words)

# saveAsTextFile fails if the target path already exists, so remove the
# output directory left behind by a previous run.
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)

# coalesce(1) merges everything into one partition so a single part file is written.
counts.coalesce(1).saveAsTextFile(output_dir)



# Step10: Clean up the temporary output files

In [265]:
# Move the single part file written by Spark to its final name. The paths are
# built from `output_dir` and `home` so this step follows any change to them,
# and a missing part file raises instead of being silently ignored by the shell.
shutil.move(os.path.join(output_dir, "part-00000"), os.path.join(home, "result.txt"))

In [266]:
# Delete what is left of Spark's output directory (_SUCCESS marker, .crc files).
# The existence check keeps the cell safe to re-run.
if os.path.isdir(output_dir):
    shutil.rmtree(output_dir)