In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import math
import numpy as np
import re
import os
import pandas as pd
import os
from zipfile import ZipFile, BadZipFile
import json
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances


## SEQUENCE -> SINGLE SCRIPT

In [3]:
scripts = [
    'hola guapete como estas',
    'yo muy bien gracias majete',
    'me alegro te veo el martes si dios',
    'okay yo seguro que llegaré lo mas pronto posible',
]

scripts = ['<sos> ' + script + ' <eos>' for script in scripts]
print(scripts)

['<sos> hola guapete como estas <eos>', '<sos> yo muy bien gracias majete <eos>', '<sos> me alegro te veo el martes si dios <eos>', '<sos> okay yo seguro que llegaré lo mas pronto posible <eos>']


In [4]:
MAX_SEQ_LEN = 128 # max num of words per phrase for translate

In [5]:
def build_vocab(scripts):
    """
    Construye un vocabulario a partir de una lista de scripts, creando dos diccionarios:
    - `block2idx`: Mapea cada bloque (palabra o token) a un índice único, con índices 0,1 y 2 reservados para '<pad>', '<unk>' y <SCRIPT_END>.
    - `idx2block`: Mapea cada índice de vuelta a su bloque correspondiente.

    La función cuenta la frecuencia de cada bloque, los ordena por frecuencia en orden descendente, y asigna índices a los bloques. 
    """
    blocks = [block for script in scripts for block in script.split() ]
    block_count = Counter(blocks)
    sorted_block_counts = sorted(block_count.items(), key=lambda x:x[1], reverse=True)
    block2idx = {
        '<pad>': 0,
        '<unk>': 1,
        '<SCRIPT_END>': 2
    }
    for idx, (block, _) in enumerate(sorted_block_counts, 3):
        block2idx[block] = idx
    idx2block = {idx: block for block, idx in block2idx.items()}
    return block2idx, idx2block


class TripletDataset(Dataset):
    def __init__(self, src_sentences, src_block2idx):
        self.src_sentences = src_sentences  # Anchor (source)
        self.src_block2idx = src_block2idx
        
    def __len__(self):
        return len(self.src_sentences)  # Usamos la longitud del conjunto positivo
    
    def __getitem__(self, idx):
        # Obtener las oraciones (anchor, positive, negative)
        src_sentence = self.src_sentences[idx]

        # Convertir cada oración en índices
        src_idxs = [self.src_block2idx.get(block, self.src_block2idx['<unk>']) for block in src_sentence.split()]
       
        # Retornar los tensores (anchor, positive, negative)
        return torch.tensor(src_idxs)

def collate_fn(batch):
    src_batch, pos_batch, neg_batch = zip(*batch)
    src_batch = [torch.tensor(seq) for seq in src_batch] 
    pos_batch = [torch.tensor(seq) for seq in pos_batch]
    neg_batch = [torch.tensor(seq) for seq in neg_batch]
    #print(trg_batch)
    # Hacemos el padding sin truncar primero
    #trg_batch = torch.nn.utils.rnn.pad_sequence(trg_batch, batch_first=True, padding_value=0)
    #src_batch = torch.nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=0)
    #neg_batch = torch.nn.utils.rnn.pad_sequence(neg_batch, batch_first=True, padding_value=0)
    
    # Ahora truncamos las secuencias después del padding a MAX_SEQ_LEN
    #trg_batch = trg_batch[:, :MAX_SEQ_LEN]
    #src_batch = src_batch[:, :MAX_SEQ_LEN]
    #neg_batch = neg_batch[:, :MAX_SEQ_LEN]

    src_batch = [torch.nn.functional.pad(seq[:MAX_SEQ_LEN], (0, MAX_SEQ_LEN - len(seq[:MAX_SEQ_LEN])), value=0) for seq in src_batch]
    pos_batch = [torch.nn.functional.pad(seq[:MAX_SEQ_LEN], (0, MAX_SEQ_LEN - len(seq[:MAX_SEQ_LEN])), value=0) for seq in pos_batch]
    neg_batch = [torch.nn.functional.pad(seq[:MAX_SEQ_LEN], (0, MAX_SEQ_LEN - len(seq[:MAX_SEQ_LEN])), value=0) for seq in neg_batch]

    src_batch = torch.stack(src_batch)    
    pos_batch = torch.stack(pos_batch)
    neg_batch = torch.stack(neg_batch)
    return src_batch, pos_batch, neg_batcH

In [6]:
src_block2idx, src_idx2block = build_vocab(scripts)
src_vocab_size = len(src_block2idx)

print(src_block2idx)

{'<pad>': 0, '<unk>': 1, '<SCRIPT_END>': 2, '<sos>': 3, '<eos>': 4, 'yo': 5, 'hola': 6, 'guapete': 7, 'como': 8, 'estas': 9, 'muy': 10, 'bien': 11, 'gracias': 12, 'majete': 13, 'me': 14, 'alegro': 15, 'te': 16, 'veo': 17, 'el': 18, 'martes': 19, 'si': 20, 'dios': 21, 'okay': 22, 'seguro': 23, 'que': 24, 'llegaré': 25, 'lo': 26, 'mas': 27, 'pronto': 28, 'posible': 29}


In [7]:
dataset = TripletDataset(scripts, src_block2idx)
print("Len of scripts:", len(dataset))
print("First phrase:")
print(scripts[0])
print("First phrase converted to index:")
print(dataset[0])

Len of scripts: 4
First phrase:
<sos> hola guapete como estas <eos>
First phrase converted to index:
tensor([3, 6, 7, 8, 9, 4])


In [8]:
BATCH_SIZE = 1

dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
for idx, batch in enumerate(dataloader):
    print(batch)

ValueError: too many values to unpack (expected 3)

## SEQUENCE -> ALL SCRIPTS OF PROJECT

In [9]:
scripts = [
    'hola guapete como estas',
    'yo muy bien gracias majete',
    'me alegro te veo el martes si dios',
    'okay yo seguro que llegaré lo mas pronto posible',
]

scripts = ' <SCRIPT_END> '.join(scripts)
scripts = '<sos> ' + scripts + ' <eos>'
print(scripts)

<sos> hola guapete como estas <SCRIPT_END> yo muy bien gracias majete <SCRIPT_END> me alegro te veo el martes si dios <SCRIPT_END> okay yo seguro que llegaré lo mas pronto posible <eos>


In [10]:
metrics = pd.read_csv('metrics_attr.csv')

# Filter by Action Genre
metrics_action = metrics[(metrics['Main Genre'] == 'Action')]
metrics_non_action = metrics[(metrics['Main Genre'] != 'Action')]


# Check the filenames for collect
filenames_action = list(metrics_action['Name'])
filenames_non_action = list(metrics_non_action['Name'])
# Create txt for made the shell script
with open('filenames_action_global.txt', 'w') as names:
     for name in filenames_action:
        names.write(name + '\n')


import math
def load_json_project(path_projectsb3):
    try:
        zip_file = ZipFile(path_projectsb3, "r")
        json_project = json.loads(zip_file.open("project.json").read())
        return json_project
    except BadZipFile:
        print('Bad zipfile')

def process(json_project):
    seq_num = 0
    dict_total_blocks = {}
    
    list_total_blocks = []

    for key, list_info in json_project.items():
        if key == "targets":
            for dict_target in list_info:
                target_name = dict_target.get('name')
                if target_name:
                    dict_total_blocks[target_name] = {}
                    dict_total_blocks[target_name][f'Seq_{seq_num}'] = []
                blocks = dict_target.get('blocks')
                if blocks:
                    for block_id, block_info in blocks.items():
                        if isinstance(block_info, dict):
                            topLevel = block_info.get('topLevel')
                            if topLevel:
                                seq_num += 1
                                dict_total_blocks[target_name][f'Seq_{seq_num}'] = []
                            opcode = block_info.get('opcode')
                            if opcode:
                                
                                dict_total_blocks[target_name][f'Seq_{seq_num}'].append(opcode)
                            list_total_blocks.append(block_info)
                            #dict_total_blocks[block_id] = block_info
    return dict_total_blocks


# ----------------- SCRIPTS GLOBAL (local context) ----------------------------------
dict_total_blocks = {}
scripts_global = []
list_total_blocks = []
print(len(filenames_action))
for project in filenames_action:
    sb3_path = os.path.join('.','sb3_action_global',project)
    if os.path.isfile(sb3_path):
        #print(project)
        json_project = load_json_project(sb3_path)
        dict_total_blocks = process(json_project)

        for sprite, seqs in dict_total_blocks.items():
            for idx, block_list in seqs.items():
                if block_list != []:
                    scripts_global.append(" ".join(block_list))

# ----------------- SCRIPTS TARGET ----------------------------------

scripts_train1 = []
scripts_train2 = []
for idx, project in enumerate(filenames_action):
    sb3_path = os.path.join('.','sb3_action_global',project)
    if os.path.isfile(sb3_path):
        #print(project)
        json_project = load_json_project(sb3_path)
        dict_total_blocks = process(json_project)

        for sprite, seqs in dict_total_blocks.items():
            for block_list in seqs.values():
                if block_list != []:
                    if idx < math.floor(len(filenames_action)/2):
                        scripts_train1.append(" ".join(block_list))
                    else:
                        scripts_train2.append(" ".join(block_list))


# ----------------- SCRIPTS NEGTATIVE  ----------------------------------
scripts_train3 = []
for project in filenames_non_action:
    sb3_path = os.path.join('.','sb3_non_action',project)
    if os.path.isfile(sb3_path):
        #print(project)
        json_project = load_json_project(sb3_path)
        dict_total_blocks = process(json_project)

        for sprite, seqs in dict_total_blocks.items():
            for idx, block_list in seqs.items():
                if block_list != []:
                    scripts_train3.append(" ".join(block_list))

# ----------------- SCRIPTS GLOBAL (global context)  ----------------------------------

dict_total_blocks = {}
scripts_global_global = []
list_total_blocks = []
print(len(filenames_action))
print(filenames_action[0])
for project in filenames_action:
    
    sb3_path = os.path.join('.','sb3_action_global',project)
    if os.path.isfile(sb3_path):
        #print(project)
        json_project = load_json_project(sb3_path)
        dict_total_blocks = process(json_project)

        for sprite, seqs in dict_total_blocks.items():
            global_seqs = []
            #print("project:", project)
            #print(seqs)
            #print("Sprite",sprite)
            
            for idx, block_list in seqs.items():
                if block_list != []:
                    global_seqs.append(" ".join(block_list))
                    global_seqs.append("<SCRIPT_END>")
            if global_seqs != []:
                global_seqs.pop(-1)
            
            scripts_global_global.append(" ".join(global_seqs))


# ----------------- SCRIPTS TARGET (POSITIV) (global context)  ----------------------------------

scripts_train1_global = []
scripts_train2_global = []
dict_total_blocks = {}
list_total_blocks = []
print(len(filenames_action))
for idx, project in enumerate(filenames_action):
    
    sb3_path = os.path.join('.','sb3_action_global',project)
    if os.path.isfile(sb3_path):
        #print(project)
        json_project = load_json_project(sb3_path)
        dict_total_blocks = process(json_project)

        for sprite, seqs in dict_total_blocks.items():
            global_seqs = []
            #print("project:", project)
            #print(seqs)
            #print("Sprite",sprite)
            
            for block_list in seqs.values():
                if block_list != []:  
                    global_seqs.append(" ".join(block_list))
                    global_seqs.append("<SCRIPT_END>")
            if global_seqs != []:
                global_seqs.pop(-1)
        
            if int(idx) < math.floor(len(filenames_action)/2):
                scripts_train1_global.append(" ".join(global_seqs))
            else:
                scripts_train2_global.append(" ".join(global_seqs))

# ----------------- SCRIPTS NEGATIVE (global context) -----------------------------

dict_total_blocks = {}
scripts_train3_global= []
list_total_blocks = []
print(len(filenames_non_action))
for project in filenames_non_action:
    
    sb3_path = os.path.join('.','sb3_non_action',project)
    if os.path.isfile(sb3_path):
        #print(project)
        json_project = load_json_project(sb3_path)
        dict_total_blocks = process(json_project)

        for sprite, seqs in dict_total_blocks.items():
            global_seqs = []
            #print("project:", project)
            #print(seqs)
            #print("Sprite",sprite)
            
            for idx, block_list in seqs.items():
                if block_list != []:
                    global_seqs.append(" ".join(block_list))
                    global_seqs.append("<SCRIPT_END>")
            if global_seqs != []:
                global_seqs.pop(-1)
            
            scripts_train3_global.append(" ".join(global_seqs))



# scripts_global -> global scripts (local context)... each index of the list its an scripts
# scripts_train1 -> src
# scritps_train2 -> trg
# scritps_train3 -> negative
# scripts_global_global -> global scripts (global context)... each index of the list its all project scripts 

# --------------------- LOCAL CONTEXT ---------------------------------------
scripts_global = ['<sos> ' + script + ' <eos>' for script in scripts_global]
scripts_train1 = ['<sos> ' + script + ' <eos>' for script in scripts_train1]
scripts_train2 = ['<sos> ' + script + ' <eos>' for script in scripts_train2]
scripts_train3 = ['<sos> ' + script + ' <eos>' for script in scripts_train3]

# --------------------- GLOBAL CONTEXT ---------------------------------------
scripts_global_global = ['<sos> ' + script + ' <eos>' for script in scripts_global_global]
scripts_train1_global = ['<sos> ' + script + ' <eos>' for script in scripts_train1_global]
scripts_train2_global = ['<sos> ' + script + ' <eos>' for script in scripts_train2_global]
scripts_train3_global = ['<sos> ' + script + ' <eos>' for script in scripts_train3_global]

print(scripts_global_global)
print("lenssss-------")
print(len(scripts_global))
print(len(scripts_train1))
print(len(scripts_train2))
print(len(scripts_train3))

print(len(scripts_global_global))
print(len(scripts_train1_global))
print(len(scripts_train2_global))
print(len(scripts_train3_global))

print("-----MAX SEQ LEN-------------")
for script in scripts_global_global:
    print(script)
    long_seq = len(script.split())
    print(long_seq)



312
312
Abby and Grace's project.sb3
312
328


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [13]:

for script in scripts_train1_global:
    print(script)
    long_seq = len(script.split())
    print(long_seq)

<sos> event_whenflagclicked data_showvariable data_setvariableto <SCRIPT_END> event_whenflagclicked data_showvariable data_setvariableto control_repeat_until operator_equals control_wait data_changevariableby sound_playuntildone sound_sounds_menu data_hidevariable data_hidevariable looks_switchbackdropto looks_backdrops <SCRIPT_END> event_whenbackdropswitchesto data_showvariable <SCRIPT_END> event_whenflagclicked control_forever sound_playuntildone sound_sounds_menu <eos>
27
<sos> event_whenflagclicked control_forever control_wait looks_show motion_gotoxy motion_glidesecstoxy looks_hide control_wait <SCRIPT_END> event_whenthisspriteclicked looks_hide sound_play sound_sounds_menu data_changevariableby <SCRIPT_END> event_whenbackdropswitchesto control_forever looks_hide <SCRIPT_END> event_whenflagclicked looks_switchbackdropto looks_backdrops <eos>
24
<sos> event_whenflagclicked control_forever control_wait looks_show motion_gotoxy motion_glidesecstoxy looks_hide control_wait <SCRIPT_END

## SKIPGRAM NOTES

In [9]:
class SkipGramData(Dataset):
    def __init__(self, corpus, window_sz= 4):
        super().__init__
        self.corpus = corpus
        self.window = window_sz
        self.vocab = list(set(token.lower() for sentence in self.corpus for token in sentence.split())) # set elimina dups y no mantiene el orden
        self.word2idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}
        self.data = self.gen_dataset()

    def gen_dataset(self):
        # Metod for enventanado
        data = []
        for sentence in self.corpus:
            text = sentence.lower().split()
            #print(text)
            for center_idx, center_word in enumerate(text):
                for offset in range(-self.window, self.window +1):
                    context_idx = center_idx + offset
                    if context_idx < 0 or context_idx >= len(text) or context_idx == center_idx: continue
                    context_word = text[context_idx]
                    data.append((self.word2idx[center_word], self.word2idx[context_word]))
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [10]:
dataset = SkipGramData(scripts)

In [11]:
print(dataset.word2idx)

{'muy': 0, 'te': 1, '<sos>': 2, 'gracias': 3, 'llegaré': 4, 'bien': 5, 'majete': 6, 'mas': 7, 'alegro': 8, 'el': 9, 'hola': 10, 'posible': 11, 'seguro': 12, 'lo': 13, 'martes': 14, 'pronto': 15, '<eos>': 16, 'si': 17, 'estas': 18, 'yo': 19, 'me': 20, 'que': 21, 'guapete': 22, 'como': 23, 'okay': 24, 'dios': 25, 'veo': 26}


In [12]:
print(scripts[0])
for i in range(0, 20):
    print(dataset.idx2word[dataset[i][0]],"/",dataset.idx2word[dataset[i][1]])

<sos> hola guapete como estas <eos>
<sos> / hola
<sos> / guapete
<sos> / como
<sos> / estas
hola / <sos>
hola / guapete
hola / como
hola / estas
hola / <eos>
guapete / <sos>
guapete / hola
guapete / como
guapete / estas
guapete / <eos>
como / <sos>
como / hola
como / guapete
como / estas
como / <eos>
estas / <sos>


In [13]:
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
for center, context in dataloader:
    center_vector = torch.zeros(len(dataloader.dataset.vocab)) 
    center_vector[center] = 1.0
    center_vector = center_vector.unsqueeze(0)  
    print("Center vector:")
    print(center_vector)
    print("Context:")
    print(context)

Center vector:
tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]])
Context:
tensor([3])
Center vector:
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 1., 0., 0., 0., 0., 0.]])
Context:
tensor([7])
Center vector:
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0., 0., 0., 0., 0., 0., 0.]])
Context:
tensor([24])
Center vector:
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 1., 0., 0.]])
Context:
tensor([19])
Center vector:
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]])
Context:
tensor([17])
Center vector:
tensor([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]])
Context:
tensor([5])
Center vector