In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.4/spark-3.2.4-bin-hadoop3.2.tgz
!tar xf spark-3.2.4-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
import os
# Environment variables naming the Java installation and the unpacked Spark
# distribution directory; set in one call so the pair stays together.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.2.4-bin-hadoop3.2",
    }
)

In [3]:
import findspark
# Runs before the pyspark import in the next cell.
# NOTE(review): presumably init() relies on the SPARK_HOME set above to make
# pyspark importable - confirm against the findspark documentation.
findspark.init()
# Bare last expression on purpose: the notebook displays the value returned by
# find() as the cell output.
findspark.find()

'/content/spark-3.2.4-bin-hadoop3.2'

In [30]:
# Importing libraries
from pyspark import SparkContext
from itertools import combinations
import json

# Reuse the SparkContext that is already running, or start one if none exists.
sparkCtx = SparkContext.getOrCreate()

# Load the lottery file as an RDD of text lines.
LOTTERY_PATH = "/content/Lottery.txt"
lotteryData = sparkCtx.textFile(LOTTERY_PATH)

#Function to split and prepare JSON strings
def prepare_jsons(chunk):
    """Split a stream of text pieces into brace-balanced fragments.

    ``chunk`` may be any iterable of strings. When a plain string is passed
    (as happens when this function is used with ``RDD.flatMap`` over lines),
    it is walked character by character. Pieces are accumulated until the
    number of opening and closing curly braces seen so far is equal, at which
    point the accumulated text is emitted as one fragment.

    Braces that appear inside double-quoted JSON strings (including strings
    containing escaped quotes) are ignored, so a value such as ``"}"`` no
    longer splits an object in the middle.

    Args:
        chunk: Iterable of ``str`` pieces (a single ``str`` yields characters).

    Returns:
        list[str]: Fragments in input order. Text outside any object comes back
        as individual brace-free pieces (e.g. ``'['`` or ``','``); callers are
        expected to discard fragments that are not valid JSON. Trailing text
        whose braces never balance is dropped.
    """
    jsons = []
    pieces = []        # parts of the fragment currently being assembled
    depth = 0          # opening minus closing braces seen outside string literals
    in_string = False  # True while scanning inside a double-quoted JSON string
    escaped = False    # True when the previous in-string character was a backslash

    for part in chunk:
        pieces.append(part)
        for ch in part:
            if in_string:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
            elif ch == '"':
                in_string = True
            elif ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
        # A running depth replaces re-counting the whole accumulated text on
        # every step, which was quadratic in the fragment length.
        if depth == 0:
            jsons.append("".join(pieces))
            pieces = []
    return jsons

# Process and split into individual JSON strings.
# flatMap calls prepare_jsons once per RDD element and flattens the returned
# lists; each element here is a single line of the input file, so the function
# iterates over that line's characters.
jsonStringsRDD = lotteryData.flatMap(prepare_jsons)

#Convert string to JSON objects
def to_json(string):
    """Parse one JSON document, best-effort.

    Designed for use with ``flatMap``: a successfully parsed value is returned
    wrapped in a one-element list, and unparseable input yields an empty list
    so it simply disappears from the result.

    Args:
        string: Text to parse with ``json.loads``.

    Returns:
        list: ``[parsed_value]`` on success, ``[]`` when the input is not
        valid JSON or is not a type ``json.loads`` accepts.
    """
    try:
        return [json.loads(string)]
    except (ValueError, TypeError):
        # json.JSONDecodeError subclasses ValueError; TypeError covers input
        # that is not str/bytes. Unlike a bare ``except``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return []

# Parse every candidate string; to_json returns an empty list for text that
# fails to parse, so flatMap drops those fragments from the result.
jsonObjectsRDD = jsonStringsRDD.flatMap(to_json)

# Extract numbers and generate triples
def generate_triples(json_object):
    """Yield every 3-number combination from one record.

    The numbers are read from ``json_object["data"]["rakamlar"]``, a single
    string with the values separated by ``#``.

    The numbers are sorted before combining so that the same three numbers
    always produce the same tuple, regardless of the order in which a record
    lists them. Without this, identical sets could be counted under different
    keys. Records whose numbers are already in ascending order give exactly
    the same triples as before.

    Args:
        json_object: Mapping containing ``["data"]["rakamlar"]`` as described.

    Returns:
        Iterator of 3-tuples of number strings; empty when the record holds
        fewer than three numbers.

    Raises:
        KeyError: If ``"data"`` or ``"rakamlar"`` is missing.
    """
    numbers = sorted(json_object["data"]["rakamlar"].split('#'))
    return combinations(numbers, 3)

# Flatten the per-record combinations into one RDD with one element per triple.
triplesRDD = jsonObjectsRDD.flatMap(generate_triples)

# Count how many times each triple occurs (the counts are returned to the
# driver as a dict), then order the (triple, count) pairs by descending count.
triplesCount = triplesRDD.countByValue()
sortedTriples = sorted(triplesCount.items(), key=lambda x: x[1], reverse=True)

# Display results. Guard against an empty result (e.g. no record could be
# parsed) instead of failing with an IndexError on sortedTriples[0].
if sortedTriples:
    topTriple = sortedTriples[0]
    print(f"The top recurring combination is: {topTriple[0]}, seen {topTriple[1]} times.")
else:
    topTriple = None
    print("No combinations were generated; check the input file and its format.")

# Verification: show a few generated triples and a few raw number lists.
print("Few Combination Examples:", triplesRDD.take(3))
print("Few Number Sets:", jsonObjectsRDD.map(lambda x: x["data"]["rakamlar"].split('#')).take(3))


The top recurring combination is: ('23', '39', '47'), seen 7 times.
Few Combination Examples: [('05', '15', '20'), ('05', '15', '32'), ('05', '15', '34')]
Few Number Sets: [['05', '15', '20', '32', '34', '47'], ['03', '14', '16', '23', '27', '43'], ['06', '22', '26', '31', '38', '44']]
