In [18]:
from pyspark.sql.functions import col, lit
import numpy as np
import math
import heapq

In [19]:


def filter_data(data, target_class, target_col):
    return data.filter(col(target_col) = target_class)
#4: 	data+ ← FilterData() #ottieni dati con solo la label di interesse (potrebbe essere pre-processabile + distribuita)
#5: 	for all sequence in data+ do
#6: 		scoresucb.add(sequence, ∞) # idem per questo
#7: 	end for



def seq_scout(data, data_plus,target_class, numeric_values, top_k, iterations, theta, alpha):
    
    # create priority queue for patterns to be stored
    pi = PriorityQueue(k=top_k, theta=theta) #TODO: la funzione non filtra se theta=1
    
    # create priority queue for storing each class sequence and its UCB score
    scores = PriorityQueue(data_plus)
    N = 1
    while N<iterations:
        seq, quality, Ni = scores.popNext() # pop the sequence to be generalized
        
        # generalize the sequence and add it to the patterns
        gen_seq, new_qual = play_arm(seq, data, target_class, numeric_values, alpha)
        pi.add(gen_seq, new_qual)
        
        # update the quality and put back the sequence in the priority queue
        updated_quality = (Ni * mean_quality + quality) / (Ni + 1)
        ucb_score = compute_ucb(updated_quality, Ni + 1, N)
        scores.add(sequence, ucb_score, Ni + 1, updated_quality)
        
        N += 1
    
    return pi.topKNonRedundant()


#1: function SEQSCOUT(budget)
#2: 	π ← PriorityQueue()
#3: 	scores ← PriorityQueue() # ! sfruttare i dataframe distribuiti di spark

#8: 	|while budget do 
#9: 	|	seq, qual, Ni ← scores.bestUCB()
#10: |	seqp, qualp ← PlayArm(seq) #calcolo qualità parallelizzabile
#11: |	π.add(seqp, qualp)
#12: |	scores.update(seq,Ni*qual+qualp/Ni+1 , Ni + 1)
#3: |end while # while eseguito per ogni top esempio - non parallelizzabile?
#4:  
#15: return π.topKNonRedundant() # filtering (remove similar starting from the beginning)
#16: end function

#- il filtering dei dati penso possa essere fatto automaticamente con una bella filter
#- controlla come funziona la max del DataFrame
#- possibile parallelizzazione 1 per ogni classe (a livello di container -> 7 esecutori max)
#- possibile parallelizzazione sul calcolo della metrica come map + reduce
#- priority queue con i dataframe distribuiti non ha senso, ma pi può essere implementata easy come una lista
#	che flitra automaticamente i migliori k

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (3485452347.py, line 2)

In [20]:
# Create example data and heapify
a = list(range(10))
print(a)
a.reverse()
heapq.heapify(a)
print(a)
heapq.heappush(a,2.7)
print(a)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[0, 1, 3, 2, 5, 4, 7, 9, 6, 8]
[0, 1, 3, 2, 2.7, 4, 7, 9, 6, 8, 5]


In [38]:
def read_dataset(path):
    DISCRETE_INPUTS = {'up', 'accelerate', 'slow', 'goal', 'left', 'boost', 'camera', 'down', 'right', 'slide', 'jump'}
    data = []
    with open(path, "r") as file:
        dict_headers = next(file).split()
        new_line = dict()
        for line in file:
            if len(line.split()) <= 1:
                if new_line:
                    data.append(new_line)
                new_line = {"sequence": [] ,"class": line.strip()}
            else:
                if len(dict_headers) != len(line.split()):
                    raise ValueError('Number of data and variables do not match')

                numerics = {}
                buttons = []

                for i, value in enumerate(line.split()):
                    if dict_headers[i] in DISCRETE_INPUTS:
                        if value == '1':
                            buttons.append(dict_headers[i])
                    else:
                        numerics[dict_headers[i]] = float(value)

                state = [buttons, numerics]
                new_line["sequence"].append(state)
        data.append(new_line)
    return data


In [39]:
data = read_dataset("/vagrant/rocket_league_skillshots.data")
# in case of bigger datasets, single splits could be generated on different nodes
# and after joined as single json file

In [40]:
import json
spark = SparkSession.builder.appName("abalone").getOrCreate()
with open("source.json", "w") as s:
    s.write(json.dumps(data))

In [64]:
df = spark.read.format("json").load("source.json")

In [68]:
df.printSchema()

root
 |-- class: string (nullable = true)
 |-- sequence: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)



In [None]:
schema = StructType([
    StructField("class", IntegerType(), False),
    StructField("sequence", ArrayType(
        ArrayType(ArrayType
        )
    ,False), False)
])

In [67]:
df.select(col("sequence")).show()

+--------------------+
|            sequence|
+--------------------+
|[[["right","jump"...|
|[[["boost","right...|
|[[["right","jump"...|
|[[["right","slide...|
|[[["right"], {"Ba...|
|[[["boost","right...|
|[[["down","right"...|
|[[["right"], {"Ba...|
|[[["right","jump"...|
|[[["slide"], {"Ba...|
|[[["boost","right...|
|[[["right"], {"Ba...|
|[[["right","slide...|
|[[["right","slide...|
|[[["right","slide...|
|[[["right","slide...|
|[[["down","slide"...|
|[[["right"], {"Ba...|
|[[["right"], {"Ba...|
|[[["right"], {"Ba...|
+--------------------+
only showing top 20 rows



In [70]:
df = df.withColumn("UCB", lit(-np.inf)).withColumn("Ni", lit(0)).withColumn("WRAcc", lit(0))


In [71]:
df.show()

+-----+--------------------+---------+---+-----+
|class|            sequence|      UCB| Ni|WRAcc|
+-----+--------------------+---------+---+-----+
|    6|[[["right","jump"...|-Infinity|  0|    0|
|   -1|[[["boost","right...|-Infinity|  0|    0|
|   -1|[[["right","jump"...|-Infinity|  0|    0|
|   -1|[[["right","slide...|-Infinity|  0|    0|
|   -1|[[["right"], {"Ba...|-Infinity|  0|    0|
|    6|[[["boost","right...|-Infinity|  0|    0|
|    1|[[["down","right"...|-Infinity|  0|    0|
|    7|[[["right"], {"Ba...|-Infinity|  0|    0|
|    1|[[["right","jump"...|-Infinity|  0|    0|
|    6|[[["slide"], {"Ba...|-Infinity|  0|    0|
|    2|[[["boost","right...|-Infinity|  0|    0|
|    1|[[["right"], {"Ba...|-Infinity|  0|    0|
|    7|[[["right","slide...|-Infinity|  0|    0|
|    6|[[["right","slide...|-Infinity|  0|    0|
|    2|[[["right","slide...|-Infinity|  0|    0|
|    1|[[["right","slide...|-Infinity|  0|    0|
|    2|[[["down","slide"...|-Infinity|  0|    0|
|    1|[[["right"], 

In [63]:
df.schema

StructType([StructField('class', StringType(), True), StructField('sequence', ArrayType(ArrayType(StringType(), True), True), True), StructField('UCB', DoubleType(), False), StructField('Ni', IntegerType(), False), StructField('WRAcc', IntegerType(), False)])

In [72]:
a = df.select(df.columns[2], df.columns[3], df.columns[4],df.columns[1]).take(1)
a = a[0]["sequence"]
#tuple([tuple([frozenset(i[0]), tuple(sorted(i[1].items()))]) for i in a])

In [73]:
a

[['["right","jump"]',
  '{"BallAcceleration":1636.7987723122642,"Time":0.0,"DistanceWall":3498.01,"DistanceCeil":2012.98,"DistanceBall":299.6682700921136,"PlayerSpeed":104267.42623178152,"BallSpeed":99035.84933750001}'],
 ['["boost","right","jump"]',
  '{"BallAcceleration":3198.029396508704,"Time":0.13889319999999827,"DistanceWall":3494.08,"DistanceCeil":2012.98,"DistanceBall":229.89677966426592,"PlayerSpeed":124248.03198843835,"BallSpeed":102233.87873400871}'],
 ['["boost"]',
  '{"BallAcceleration":0.0,"Time":0.1736165000000014,"DistanceWall":3494.08,"DistanceCeil":2012.98,"DistanceBall":237.35059911447462,"PlayerSpeed":124248.03198843835,"BallSpeed":102968.35898954592}'],
 ['["right"]',
  '{"BallAcceleration":9914.766241818943,"Time":0.3125095999999985,"DistanceWall":3500.08,"DistanceCeil":2012.98,"DistanceBall":151.8809207899399,"PlayerSpeed":115248.01600895349,"BallSpeed":112883.12523136486}'],
 ['["right","jump"]',
  '{"BallAcceleration":5907.747166307177,"Time":0.6250190999999994

In [28]:
class PriorityQueue(object):
    def __init__(self, data=None, k=None,theta=None, cap_length=False):
        self.k = k
        self.theta=theta
        self.cap_length=cap_length if k is not None else False
        if data is not None:  
            self.heap = [tuple(x) for x in data.collect()]
            print(type(self.heap[0][-1]))
            self.seq_set = set([i[-1] for i in self.heap])
            heapq.heapify(self.heap)
            if cap_length and len(self.heap)>self.k:
                heap = heapq.nlargest(self.k, self.heap)
        else:
            self.heap = []
            self.seq_set = set()

    def add(self, elem):
        if elem[-1] not in seq_set:
            heapq.heappush(self.heap, elem)
            if cap_length and len(self.heap)>self.k:
                heap = heapq.nlargest(self.k, self.heap)
                #TODO add filtering if necessary
    def get_first(self):
        return heapq.heappop(self.heap)
    
    def get_top_k(self):
        if theta == 1:
            return heap.nlargest(self.k)
        else:
            return 0
            #TODO add filtering
            
    
        

In [29]:
data = df.select(df.columns[2], df.columns[3], df.columns[4],df.columns[1]).limit(2)
data.show()
pq = PriorityQueue(data)

+---------+---+-----+--------------------+
|      UCB| Ni|WRAcc|            sequence|
+---------+---+-----+--------------------+
|-Infinity|  0|    0|[[[right, jump], ...|
|-Infinity|  0|    0|[[[boost, right, ...|
+---------+---+-----+--------------------+

<class 'list'>


TypeError: unhashable type: 'list'

In [75]:
a = {"a":4,"c":2 ,"b":1}
sorted(a.items())

[('a', 4), ('b', 1), ('c', 2)]