In [8]:
from pyspark.sql.functions import col, lit
import numpy as np
import math
import heapq

In [3]:
#
# cose da implementare:
#  dataframe con valori unici dei numerici indicizzati
#  funzione per calcolare la qualità dei pattern
#  adattare il tutto sul distribuito
#  funzione per codificare il dataset
#  funzioni per il training + grid search
#  funzioni per l'esplorazione dei dati

3

In [19]:


def filter_data(data, target_class, target_col): #t
    return data.filter(col(target_col) = target_class)

def seq_scout(data, data_plus,target_class, numerics_domains, top_k, iterations, theta, alpha): #t
    
    # create priority queue for patterns to be stored
    pi = PriorityQueue(k=top_k, theta=theta) 
    
    # create priority queue for storing each class sequence and its UCB score
    scores = PriorityQueue(data_plus)
    N = 1
    while N<iterations:
        _, Ni, quality, sequence = scores.pop_first() # pop the sequence to be generalized
        
        # generalize the sequence and add it to the patterns
        gen_seq, new_qual = play_arm(seq, data, target_class, numerics_domains, alpha)
        pi.add((new_qual, to_imm_pattern(gen_seq)))
        
        # update the quality and put back the sequence in the priority queue
        updated_quality = (Ni * mean_quality + quality) / (Ni + 1)
        ucb_score = compute_ucb(updated_quality, Ni + 1, N)
        scores.add(ucb_score, Ni + 1, updated_quality, sequence)
        
        N += 1
    
    return pi.get_top_k() # priority queue filters automatically if theta <1

def play_arm(sequence, data, target_class, numerics_domains, alpha): #ŧ
    sequence = mutable_seq_copy(sequence)
    # get the number of button pressed in the sequence
    tot_num_inputs = len([len(input_set[0]) for input_set in sequence])
    
    # get a random number of input to be removed
    input_to_remove = random.randint(0, tot_num_inputs-1)
    
    for i in range(input_to_remove):
        selected_state_idx = random.randint(0, len(sequence)-1)
        selected_state = sequence[selected_state_idx][0] # we take the input itemset
        
        selected_state.remove(random.sample(selected_state, 1)[0]) # remove an element
        
        if len(selected_state) == 0: # if the state looses all the inputs, then it is removed
            sequence.pop(selected_state_idx)
    
    for _, numerics in sequence:
        for kind, value in numerics.items():
            # first we decide whether to remove the constraint or not
            if random.random() < alpha:
                numerics[kind] = [-float('inf'), float('inf')]
            else:
                # possible improvement is to sample directly from the total span (it's faster, but values could be clustered)
                i_value = numerics_domains.select(col(kind) == value).head()["idx"]
                df_len = numerics_domains.tail()["idx"]
                
                left_idx = random.randint(0, i_value)
                right_idx = random.randint(i_value, df_len)
                left_value = numerics_domains.select(col("idx") == left_idx).head()[kind]
                right_value = numerics_domains.select(col("idx") == right_idx).head()[kind]
                
                numerics[kind] = [left_value, right_value]


    # now we compute the quality measure
    quality = compute_WRAcc(data, sequence, target_class)

    return sequence, quality

def compute_ucb(score, Ni, N): #t
    # we choose C = 0.5
    return (score + 0.25) * 2 + 0.5 * math.sqrt(2 * math.log(N) / Ni)



#1: function SEQSCOUT(budget)
#2: 	π ← PriorityQueue()
#3: 	scores ← PriorityQueue() # ! sfruttare i dataframe distribuiti di spark

#8: 	|while budget do 
#9: 	|	seq, qual, Ni ← scores.bestUCB()
#10: |	seqp, qualp ← PlayArm(seq) #calcolo qualità parallelizzabile
#11: |	π.add(seqp, qualp)
#12: |	scores.update(seq,Ni*qual+qualp/Ni+1 , Ni + 1)
#3: |end while # while eseguito per ogni top esempio - non parallelizzabile?
#4:  
#15: return π.topKNonRedundant() # filtering (remove similar starting from the beginning)
#16: end function

#- il filtering dei dati penso possa essere fatto automaticamente con una bella filter
#- controlla come funziona la max del DataFrame
#- possibile parallelizzazione 1 per ogni classe (a livello di container -> 7 esecutori max)
#- possibile parallelizzazione sul calcolo della metrica come map + reduce
#- priority queue con i dataframe distribuiti non ha senso, ma pi può essere implementata easy come una lista
#	che flitra automaticamente i migliori k

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (3485452347.py, line 2)

In [9]:
def read_dataset(path):
    DISCRETE_INPUTS = {'up', 'accelerate', 'slow', 'goal', 'left', 'boost', 'camera', 'down', 'right', 'slide', 'jump'}
    data = []
    with open(path, "r") as file:
        dict_headers = next(file).split()
        new_line = dict()
        for line in file:
            if len(line.split()) <= 1:
                if new_line:
                    data.append(new_line)
                new_line = {"input_sequence": [] ,"num_sequence":[],"class": line.strip()}
            else:
                if len(dict_headers) != len(line.split()):
                    raise ValueError('Number of data and variables do not match')

                numerics = {}
                buttons = []

                for i, value in enumerate(line.split()):
                    if dict_headers[i] in DISCRETE_INPUTS:
                        if value == '1':
                            buttons.append(dict_headers[i])
                    else:
                        numerics[dict_headers[i]] = float(value)

                #state = [buttons, numerics]
                new_line["input_sequence"].append(buttons)
                new_line["num_sequence"].append(numerics)
        data.append(new_line)
    return data


In [10]:
data = read_dataset("/vagrant/rocket_league_skillshots.data")
# in case of bigger datasets, single splits could be generated on different nodes
# and after joined as single json file

In [11]:
import json
spark = SparkSession.builder.appName("abalone").getOrCreate()
with open("source.json", "w") as s:
    s.write(json.dumps(data))

23/02/04 23:40:29 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [12]:
df = spark.read.format("json").load("source.json")

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [13]:
df.printSchema()

root
 |-- class: string (nullable = true)
 |-- input_sequence: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- num_sequence: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- BallAcceleration: double (nullable = true)
 |    |    |-- BallSpeed: double (nullable = true)
 |    |    |-- DistanceBall: double (nullable = true)
 |    |    |-- DistanceCeil: double (nullable = true)
 |    |    |-- DistanceWall: double (nullable = true)
 |    |    |-- PlayerSpeed: double (nullable = true)
 |    |    |-- Time: double (nullable = true)



In [14]:
df.show()

+-----+--------------------+--------------------+
|class|      input_sequence|        num_sequence|
+-----+--------------------+--------------------+
|    6|[[right, jump], [...|[{1636.7987723122...|
|   -1|[[boost, right, j...|[{0.0, 33685.8395...|
|   -1|[[right, jump], [...|[{124246.29375405...|
|   -1|[[right, slide, j...|[{-8210.634011562...|
|   -1|[[right], [boost,...|[{1197.5360615055...|
|    6|[[boost, right, j...|[{14578.192522981...|
|    1|[[down, right], [...|[{0.0, 170001.715...|
|    7|[[right], [right,...|[{4250.8600994742...|
|    1|[[right, jump], [...|[{-8323.881952792...|
|    6|[[slide], [right]...|[{31754.862957944...|
|    2|[[boost, right, j...|[{-301.7738125810...|
|    1|[[right], [right,...|[{0.0, 25761.9695...|
|    7|[[right, slide], ...|[{0.0, 123286.730...|
|    6|[[right, slide, j...|[{6963.6507997466...|
|    2|[[right, slide, j...|[{-4735.002465468...|
|    1|[[right, slide, j...|[{-9574.096758853...|
|    2|[[down, slide], [...|[{-1985.327081700...|


In [15]:
len(df.select("num_sequence").take(1)[0]["num_sequence"])

32

In [16]:
df.select("input_sequence").take(1)[0]["input_sequence"]

[['right', 'jump'],
 ['boost', 'right', 'jump'],
 ['boost'],
 ['right'],
 ['right', 'jump'],
 ['boost'],
 ['right'],
 ['boost', 'right'],
 ['right', 'jump'],
 ['right'],
 ['boost', 'right'],
 ['right'],
 ['right', 'jump'],
 ['right'],
 ['right', 'slide'],
 ['right'],
 ['right', 'slide'],
 ['right', 'slide', 'jump'],
 ['right', 'jump'],
 ['right', 'slide'],
 ['goal', 'slide'],
 ['goal'],
 ['accelerate', 'camera'],
 ['accelerate'],
 ['accelerate', 'camera', 'right'],
 ['accelerate', 'right'],
 ['boost', 'right'],
 ['right'],
 ['slow', 'right', 'slide'],
 ['right', 'slide', 'jump'],
 ['right', 'jump'],
 ['boost', 'right']]

In [17]:
df = df.withColumn("UCB", lit(-np.inf)).withColumn("Ni", lit(0)).withColumn("WRAcc", lit(0))


In [18]:
df.show()

+-----+--------------------+--------------------+---------+---+-----+
|class|      input_sequence|        num_sequence|      UCB| Ni|WRAcc|
+-----+--------------------+--------------------+---------+---+-----+
|    6|[[right, jump], [...|[{1636.7987723122...|-Infinity|  0|    0|
|   -1|[[boost, right, j...|[{0.0, 33685.8395...|-Infinity|  0|    0|
|   -1|[[right, jump], [...|[{124246.29375405...|-Infinity|  0|    0|
|   -1|[[right, slide, j...|[{-8210.634011562...|-Infinity|  0|    0|
|   -1|[[right], [boost,...|[{1197.5360615055...|-Infinity|  0|    0|
|    6|[[boost, right, j...|[{14578.192522981...|-Infinity|  0|    0|
|    1|[[down, right], [...|[{0.0, 170001.715...|-Infinity|  0|    0|
|    7|[[right], [right,...|[{4250.8600994742...|-Infinity|  0|    0|
|    1|[[right, jump], [...|[{-8323.881952792...|-Infinity|  0|    0|
|    6|[[slide], [right]...|[{31754.862957944...|-Infinity|  0|    0|
|    2|[[boost, right, j...|[{-301.7738125810...|-Infinity|  0|    0|
|    1|[[right], [ri

In [19]:
a = df.select(df.columns[3], df.columns[4],df.columns[5], df.columns[1], df.columns[2]).show()

+---------+---+-----+--------------------+--------------------+
|      UCB| Ni|WRAcc|      input_sequence|        num_sequence|
+---------+---+-----+--------------------+--------------------+
|-Infinity|  0|    0|[[right, jump], [...|[{1636.7987723122...|
|-Infinity|  0|    0|[[boost, right, j...|[{0.0, 33685.8395...|
|-Infinity|  0|    0|[[right, jump], [...|[{124246.29375405...|
|-Infinity|  0|    0|[[right, slide, j...|[{-8210.634011562...|
|-Infinity|  0|    0|[[right], [boost,...|[{1197.5360615055...|
|-Infinity|  0|    0|[[boost, right, j...|[{14578.192522981...|
|-Infinity|  0|    0|[[down, right], [...|[{0.0, 170001.715...|
|-Infinity|  0|    0|[[right], [right,...|[{4250.8600994742...|
|-Infinity|  0|    0|[[right, jump], [...|[{-8323.881952792...|
|-Infinity|  0|    0|[[slide], [right]...|[{31754.862957944...|
|-Infinity|  0|    0|[[boost, right, j...|[{-301.7738125810...|
|-Infinity|  0|    0|[[right], [right,...|[{0.0, 25761.9695...|
|-Infinity|  0|    0|[[right, slide], ..

In [20]:
#a = df.select(df.columns[3], df.columns[4],df.columns[5], df.columns[1], df.columns[2]).take(1)
a = df.select(df.columns[1], df.columns[2]).take(1)
a = a[0]
list(a[0])
tuple([tuple([frozenset(a[0][i]), tuple(sorted(a[1][i].asDict().items()))]) for i in range(len(a[0]))])

((frozenset({'jump', 'right'}),
  (('BallAcceleration', 1636.7987723122642),
   ('BallSpeed', 99035.84933750001),
   ('DistanceBall', 299.6682700921136),
   ('DistanceCeil', 2012.98),
   ('DistanceWall', 3498.01),
   ('PlayerSpeed', 104267.42623178152),
   ('Time', 0.0))),
 (frozenset({'boost', 'jump', 'right'}),
  (('BallAcceleration', 3198.029396508704),
   ('BallSpeed', 102233.87873400871),
   ('DistanceBall', 229.89677966426592),
   ('DistanceCeil', 2012.98),
   ('DistanceWall', 3494.08),
   ('PlayerSpeed', 124248.03198843835),
   ('Time', 0.13889319999999827))),
 (frozenset({'boost'}),
  (('BallAcceleration', 0.0),
   ('BallSpeed', 102968.35898954592),
   ('DistanceBall', 237.35059911447462),
   ('DistanceCeil', 2012.98),
   ('DistanceWall', 3494.08),
   ('PlayerSpeed', 124248.03198843835),
   ('Time', 0.1736165000000014))),
 (frozenset({'right'}),
  (('BallAcceleration', 9914.766241818943),
   ('BallSpeed', 112883.12523136486),
   ('DistanceBall', 151.8809207899399),
   ('Distanc

In [31]:
def import_imm_sequence(seq):
    return tuple([tuple([frozenset(seq[0][i]), tuple(sorted(seq[1][i].asDict().items()))]) for i in range(len(seq[0]))])
def mutable_seq_copy(seq):
    copy = []
    for i in seq:
        input_set = set(i[0])
        num_dict = {j[0] : j[1] for j in i[1]}
        copy.append([input_set, num_dict])
    return copy
        
def to_imm_pattern(pattern): #t
    return tuple([tuple([frozenset(i[0]), tuple(sorted([(key, tuple(value)) for key, value in i[1].items()]))]) for i in
                  pattern])

In [22]:
class PriorityQueue(object):
    def __init__(self, data=None, k=1,theta=1, cap_length=False):
        self.k = k
        self.theta=theta
        self.cap_length=cap_length if k is not None else False
        if data is not None:  
            self.heap = [tuple([x[0], x[1], x[2], import_imm_sequence(x[3:])]) for x in data.collect()]
            self.seq_set = set([i[-1] for i in self.heap])
            heapq.heapify(self.heap)
            if cap_length and len(self.heap)>self.k:
                heap = heapq.nlargest(self.k, self.heap)
        else:
            self.heap = []
            self.seq_set = set()

    def add(self, elem):
        if elem[-1] not in self.seq_set:
            heapq.heappush(self.heap, elem)
            self.seq_set.add(elem[-1])
            if self.cap_length and len(self.heap)>self.k:
                self.heap = heapq.nsmallest(self.k, self.heap)
                #TODO add filtering if necessary
    def pop_first(self):
        return heapq.heappop(self.heap)
    
    def get_top_k(self):
        if self.theta == 1:
            return heapq.nsmallest(self.k, self.heap)
        else:
            return 0
            #TODO add filtering
    
    
    
        

In [23]:
data = df.select(df.columns[3], df.columns[4],df.columns[5], df.columns[1], df.columns[2]).limit(2)
data.show()
pq = PriorityQueue(data, k=2)

+---------+---+-----+--------------------+--------------------+
|      UCB| Ni|WRAcc|      input_sequence|        num_sequence|
+---------+---+-----+--------------------+--------------------+
|-Infinity|  0|    0|[[right, jump], [...|[{1636.7987723122...|
|-Infinity|  0|    0|[[boost, right, j...|[{0.0, 33685.8395...|
+---------+---+-----+--------------------+--------------------+



In [24]:
data = df.select(df.columns[3], df.columns[4],df.columns[5], df.columns[1], df.columns[2]).take(3)
data = data[2]

In [25]:
b = pq.get_top_k()