In [1]:
import tensorflow as tf
import numpy as np
from tqdm import tqdm
import glob
import random
from data_loader import full_load_map, data_dir, load_map, encode_empty, encode_to_array, Note
from concurrent.futures import ProcessPoolExecutor
import json
import shutil
import os

In [2]:
random_seed = 6969

random.seed(random_seed)
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

In [3]:
def data_generator_single_process(map_folders):
    for map_folder in map_folders:
        try:
            results = full_load_map(map_folder.decode('UTF-8'))
            for result in results:
                x_context_prev_audio, x_context_prev_notes, x_context_audio, y_context_notes = result
                yield np.array(x_context_prev_audio), np.array(x_context_prev_notes), np.array(x_context_audio), np.sum(np.max(y_context_notes, axis=2), axis=1, keepdims=True)/25, np.sum(np.sum(y_context_notes, axis=2), axis=1, keepdims=True)/25, np.array(y_context_notes)
        except Exception as exc:
            if str(exc) != "'_version'" and str(exc) != 'not v2':
                    print(exc)

def data_generator_multi_process(map_folders):
    max_workers = 8
    items_in_queue = max_workers * 2
    queued_maps = items_in_queue
    cancel = False
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        map_tasks = [executor.submit(full_load_map, map_folder) for map_folder in map_folders[:items_in_queue]]
        while len(map_tasks) > 0:
            map_task = map_tasks.pop(0)
            try:
                if cancel:
                    map_task.cancel()
                    continue
                results = map_task.result()
                for result in results:
                    x_context_prev_audio, x_context_prev_notes, x_context_audio, y_context_notes = result
                    x_context_prev_audio = np.array(x_context_prev_audio, dtype=np.float32)
                    x_context_prev_notes = np.array(x_context_prev_notes, dtype=np.float32)
                    x_context_audio = np.array(x_context_audio, dtype=np.float32)
                    y_context_notes = np.array(y_context_notes, dtype=np.float32)
                    yield (x_context_prev_audio), (x_context_prev_notes), (x_context_audio), np.sum(np.max(y_context_notes, axis=2), axis=1, keepdims=True)/25, np.sum(np.sum(y_context_notes, axis=2), axis=1, keepdims=True)/25, (y_context_notes)
            except InterruptedError as ke:
                cancel = True
            except Exception as exc:
                if str(exc) != "'_version'" and str(exc) != 'not v2':
                    print(exc)
            finally:
                if not cancel:
                    queued_maps += 1
                    if queued_maps < len(map_folders):
                        map_tasks.append(executor.submit(full_load_map, map_folders[queued_maps]))
                
# def data_generator(map_folders):
#     try:
#         with ProcessPoolExecutor(max_workers=4) as executor:
#             results = executor.map(load_map_asd, map_folders)

#             for result in results:
#                 print(result)
#                 if result is not None:
#                     yield result
#     except Exception as e:
#         traceback.print_exc()
#         print(e)

In [4]:
def create_ds_for_files(map_folders, batch_size, cache=False, shuffle=False):
    ds = tf.data.Dataset.from_generator(data_generator_multi_process, args=[map_folders], output_signature=(
        tf.TensorSpec(shape=(None, 87, 129), dtype=tf.float32),
        tf.TensorSpec(shape=(None, 50, 217), dtype=tf.float32),
        tf.TensorSpec(shape=(None, 87, 129), dtype=tf.float32),
        tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
        tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
        tf.TensorSpec(shape=(None, 50, 217), dtype=tf.float32),
        # tf.TensorSpec(shape=(None, 1025, 44), dtype=tf.float32),
        # tf.TensorSpec(shape=(None, 35), dtype=tf.float32),
    ))
    ds = ds.flat_map(lambda x1, x2, x3, x4, x5, y: tf.data.Dataset.from_tensor_slices((x1, x2, x3, x4, x5, y)))

    if cache:
        ds = ds.cache()
    if shuffle:
        ds = ds.shuffle(5000, reshuffle_each_iteration=True)
        # ds = ds.shuffle(len([v for v in ds]), reshuffle_each_iteration=True)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(256)
    return ds

In [None]:
# I currently cache the entire dataset, since the data loading part is quite compute intensive. Added a limit of 50 maps to avoid running out of ram on a test run.
maps = [path.replace("\\", "/") for path in glob.glob(f"{data_dir}/*")][:50]
random.shuffle(maps)

In [6]:
batch_size = 64
train_ds = create_ds_for_files(maps, batch_size, True, True)
# val_ds = create_ds_for_files(maps[:50], batch_size, True, False)

In [7]:
def audio_block():
    input_audio = tf.keras.Input(shape=(87, 129, 1), dtype="float32")
    l = input_audio
    l = tf.keras.layers.Conv2D(32, 3, activation="relu", padding='same')(l)
    l = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), padding='same')(l)
    l = tf.keras.layers.Conv2D(32, 3, activation="relu", padding='same')(l)
    l = tf.keras.layers.Reshape((44, -1))(l)
    l = tf.keras.layers.ZeroPadding1D(3)(l)
    l = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(l)
    l = tf.keras.layers.LSTM(64, return_sequences=True)(l)
    return tf.keras.Model(input_audio, l)

In [8]:
def make_model():
    input_prev_audio = tf.keras.Input(shape=(87, 129, 1), dtype="float32")
    input_prev_notes = tf.keras.Input(shape=(50, 217), dtype="float32")
    input_audio = tf.keras.Input(shape=(87, 129, 1), dtype="float32")
    input_intensity_1 = tf.keras.Input(shape=(1), dtype="float32")
    input_intensity_2 = tf.keras.Input(shape=(1), dtype="float32")

    audio_l = audio_block()

    l_prev_audio = audio_l(input_prev_audio)
    l_audio = audio_l(input_audio)
    
    l_prev = tf.keras.layers.Concatenate(axis=2)([l_prev_audio, input_prev_notes])
    l_prev = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True))(l_prev)
    l_prev = tf.keras.layers.LSTM(32)(l_prev)
    
    
    l_intensity_1 = tf.keras.layers.RepeatVector(50)(input_intensity_1)
    l_intensity_2 = tf.keras.layers.RepeatVector(50)(input_intensity_2)
    l_prev = tf.keras.layers.RepeatVector(50)(l_prev)
    l = tf.keras.layers.Concatenate(axis=2)([l_audio, l_prev, l_intensity_1, l_intensity_1, l_intensity_1, l_intensity_2, l_intensity_2, l_intensity_2])
    l = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(l)
    l = tf.keras.layers.LSTM(128, return_sequences=True)(l)
    l = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(217, activation="sigmoid"))(l)
    output = l

    model = tf.keras.Model(inputs = [input_prev_audio, input_prev_notes, input_audio, input_intensity_1, input_intensity_2], outputs = output)
    return model

In [9]:
model = make_model()
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 87, 129, 1)  0           []                               
                                ]                                                                 
                                                                                                  
 input_1 (InputLayer)           [(None, 87, 129, 1)  0           []                               
                                ]                                                                 
                                                                                                  
 model (Functional)             (None, 50, 64)       1157216     ['input_1[0][0]',                
                                                                  'input_3[0][0]']          

In [10]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [11]:
train_timing_loss_metric = tf.keras.metrics.Mean(name='train_timing_loss')
train_positioning_loss_metric = tf.keras.metrics.Mean(name='train_positioning_loss')
train_loss_metric = tf.keras.metrics.Mean(name='train_loss')

val_timing_loss_metric = tf.keras.metrics.Mean(name='val_timing_loss')
val_positioning_loss_metric = tf.keras.metrics.Mean(name='val_positioning_loss')
val_loss_metric = tf.keras.metrics.Mean(name='val_loss')

loss_fn = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)

In [12]:
out_value_counts = tf.constant([0]*217, dtype=tf.float32)

for v_batch in tqdm(train_ds):
    out_value_counts = out_value_counts + tf.reduce_sum(v_batch[-1], axis=[0, 1])
# for i, ovc in enumerate(out_value_counts):
#     print(f"{(i-1)} {ovc}")
note_poss_loss_balance = tf.expand_dims(tf.expand_dims(1/tf.maximum(out_value_counts[1:] * 0.000169, 1), 0), 0)
note_poss_loss_balance

1979it [00:05, 358.76it/s]


-1 517127.0
0 4694.0
1 23756.0
2 17569.0
3 8446.0
4 2427.0
5 26532.0
6 3123.0
7 536.0
8 5016.0
9 60.0
10 32.0
11 15.0
12 21657.0
13 8832.0
14 765.0
15 51694.0
16 1196.0
17 800.0
18 12045.0
19 543.0
20 398.0
21 148.0
22 39.0
23 2.0
24 1076.0
25 4227.0
26 224.0
27 430.0
28 209.0
29 109.0
30 76.0
31 13.0
32 17.0
33 17.0
34 14.0
35 5.0
36 83.0
37 95.0
38 18.0
39 509.0
40 31.0
41 61.0
42 1605.0
43 228.0
44 211.0
45 690.0
46 457.0
47 52.0
48 847.0
49 24681.0
50 9004.0
51 2701.0
52 658.0
53 1645.0
54 542.0
55 72.0
56 146.0
57 23.0
58 47.0
59 6.0
60 99.0
61 332.0
62 64.0
63 77.0
64 62.0
65 815.0
66 73.0
67 49.0
68 1109.0
69 4.0
70 51.0
71 38.0
72 2106.0
73 3799.0
74 131.0
75 261.0
76 91.0
77 216.0
78 33.0
79 30.0
80 46.0
81 0.0
82 14.0
83 0.0
84 283.0
85 2172.0
86 182.0
87 9002.0
88 450.0
89 634.0
90 22513.0
91 418.0
92 441.0
93 1096.0
94 413.0
95 6.0
96 2661.0
97 4900.0
98 3713.0
99 4922.0
100 543.0
101 2512.0
102 2791.0
103 167.0
104 520.0
105 266.0
106 147.0
107 15.0
108 112.0
109 57.0
110 

<tf.Tensor: shape=(1, 1, 216), dtype=float32, numpy=
array([[[1.        , 0.24908063, 0.33679548, 0.7005872 , 1.        ,
         0.22301973, 1.        , 1.        , 1.        , 1.        ,
         1.        , 1.        , 0.27322155, 0.66996825, 1.        ,
         0.11446512, 1.        , 1.        , 0.4912544 , 1.        ,
         1.        , 1.        , 1.        , 1.        , 1.        ,
         1.        , 1.        , 1.        , 1.        , 1.        ,
         1.        , 1.        , 1.        , 1.        , 1.        ,
         1.        , 1.        , 1.        , 1.        , 1.        ,
         1.        , 1.        , 1.        , 1.        , 1.        ,
         1.        , 1.        , 1.        , 1.        , 0.23974553,
         0.6571701 , 1.        , 1.        , 1.        , 1.        ,
         1.        , 1.        , 1.        , 1.        , 1.        ,
         1.        , 1.        , 1.        , 1.        , 1.        ,
         1.        , 1.        , 1.        , 1.   

### Custom loss
Calculates 2 separate values:
 - timing loss - simple loss based on when the notes were placed
 - positioning loss - loss based on the position and direction of the placed note, adjusted for the number of notes that appear in different positions to avoid a massive bias towards placing most commonly appearing notes

In [13]:
@tf.function
def custom_loss(y, predictions):
    loss_matrix = tf.square(tf.abs(y - predictions))
    timing_loss_matrix = loss_matrix[:, :, 0]
    positioning_loss_matrix = loss_matrix[:, :, 1:] * y[:, :, :1] * (y[:, :, 1:] * note_poss_loss_balance + 0.0169)
    timing_loss = tf.reduce_mean(timing_loss_matrix)
    positioning_loss = tf.reduce_sum(positioning_loss_matrix) / tf.reduce_sum(y[:, :, 1:]) * 0.5
    loss = timing_loss + positioning_loss
    return timing_loss, positioning_loss, loss

In [14]:
@tf.function
def train_step(model, optimizer, data):
    x1, x2, x3, x4, x5, y = data
    
    with tf.GradientTape() as tape:
        predictions = model([x1, x2, x3, x4, x5], training=True)
        with tf.device('/CPU:0'):
            timing_loss, positioning_loss, loss = custom_loss(y, predictions)
        
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_timing_loss_metric(timing_loss)
    train_positioning_loss_metric(positioning_loss)
    train_loss_metric(loss)

@tf.function
def val_step(model, data):
    x1, x2, x3, x4, x5, y = data
    
    predictions = model([x1, x2, x3, x4, x5], training=False)
    timing_loss, positioning_loss, loss = custom_loss(y, predictions)
        
    val_timing_loss_metric(timing_loss)
    val_positioning_loss_metric(positioning_loss)
    val_loss_metric(loss)

In [50]:
for epoch in range(0, 5):
    train_timing_loss_metric.reset_states()
    train_positioning_loss_metric.reset_states()
    train_loss_metric.reset_states()
    
    val_timing_loss_metric.reset_states()
    val_positioning_loss_metric.reset_states()
    val_loss_metric.reset_states()

    
    with tqdm(train_ds.enumerate(), unit="batch") as _tqdm:
        _tqdm.set_description(f"Epoch train: {epoch}")
        for step, data in _tqdm:
            train_step(model, optimizer, data)
            _tqdm.set_postfix(
                timing_loss=train_timing_loss_metric.result().numpy(),
                positioning_loss=train_positioning_loss_metric.result().numpy(),
                loss=train_loss_metric.result().numpy(),
            )
    
    if 'val_ds' in locals() or 'val_ds' in globals():
        with tqdm(val_ds.enumerate(), unit="batch") as _tqdm:
            _tqdm.set_description(f"Epoch val: {epoch}")
            for step, data in _tqdm:
                val_step(model, data)
                _tqdm.set_postfix(
                    timing_loss=val_timing_loss_metric.result().numpy(),
                    positioning_loss=val_positioning_loss_metric.result().numpy(),
                    loss=val_loss_metric.result().numpy(),
                )#2302 2007

Epoch train: 0: : 1979batch [02:16, 14.51batch/s, loss=0.174, positioning_loss=0.135, timing_loss=0.0385]
Epoch train: 1: : 1979batch [02:15, 14.64batch/s, loss=0.17, positioning_loss=0.133, timing_loss=0.0369]
Epoch train: 2: : 1979batch [02:15, 14.64batch/s, loss=0.166, positioning_loss=0.13, timing_loss=0.0355] 
Epoch train: 3: : 1979batch [02:35, 12.75batch/s, loss=0.161, positioning_loss=0.127, timing_loss=0.034] 
Epoch train: 4: : 1979batch [02:29, 13.21batch/s, loss=0.157, positioning_loss=0.124, timing_loss=0.0326]


In [None]:
# Haven't yet reached results worth saving :D
# model.save("./models/v1")

### Results validation

- specify the correct folder with maps for which you would want to generate the map
- add maps that you want to use for testing, better to avoid using the maps that already exist in the training dataset to avoid false positives of AI learning a specific map
- add an Expert diff if doesn't exist, currently hardcoded to just override the Expert diff to avoid setting up all the metadata

In [None]:
base_validation_path = "./validation"
os.makedirs(base_validation_path, exist_ok=True)

In [51]:
validation_map = "lifesux"
(song_data, segment_duration), diffs = load_map(base_validation_path + validation_map)

In [None]:
# Based on previous note format, ignore
def validate_model_old(song_data, segment_duration, threshhold, diff):
    context_length = 1
    note_time_delta = 0.05
    note_count = 10
    note_length = 35

    note_iterator = 0

    context_steps = int(context_length / segment_duration) + 1
    step_size = int(note_time_delta / segment_duration) + 1
    
    generated_notes = diff.notes[:10]
    max_val = 0
    for i in tqdm(range(context_steps, song_data.shape[1] - context_steps, step_size)):
        curr_time = i * segment_duration
        
        while len(generated_notes) > note_iterator and generated_notes[note_iterator].time < curr_time:
            note_iterator += 1

        x_precontext_audio = song_data[:, i-context_steps:i]
        x_postcontext_audio = song_data[:, i:i+context_steps]
        
        notes = []
        for _j in range(note_count):
            j = 10 - _j
            curr_iter = note_iterator - j
            if curr_iter < 0 or generated_notes[curr_iter].time < curr_time - context_length:
                encode_empty(notes)
            else:
                encode_to_array(generated_notes[curr_iter], notes, curr_time)
        
        prediction = np.array(model([np.array([x_precontext_audio]), np.array([notes]), np.array([x_postcontext_audio])])[0], dtype=np.float32)

        if max_val < np.max(prediction[1:]):
            max_val = np.max(prediction[1:])
            print(max_val)
            
        
        if np.max(prediction[1:]) > threshhold:
            print("threshhold")
            predicted_note_time = np.minimum(np.maximum(0, prediction[0]), note_time_delta)
            type0line = prediction[1:5]
            type0layer = prediction[5:8]
            type0direction = prediction[8:17]
            type0bombdir = prediction[17]
            
            type1line = prediction[18:22]
            type1layer = prediction[22:25]
            type1direction = prediction[25:34]
            type1bombdir = prediction[34]
            
            type0sum = np.max(type0line) + np.max(type0layer) + np.max(type0direction)
            type1sum = np.max(type1line) + np.max(type1layer) + np.max(type1direction)
            # bombtype = 
            
            if type0sum > type1sum:
                generated_notes.append(Note(predicted_note_time + curr_time, np.argmax(type0line), np.argmax(type0layer), 0, np.argmax(type0direction)))
            else:
                generated_notes.append(Note(predicted_note_time + curr_time, np.argmax(type1line), np.argmax(type1layer), 1, np.argmax(type1direction)))
            generated_notes.sort(key=lambda note: note.time)
            threshhold += 0.015
        else:
            threshhold -= 0.01
    return generated_notes

In [56]:
def validate_model(song_data, segment_duration, timing_threshhold, positioning_threshhold, intensity_1, intensity_2):
    context_length = 1
    prediction_note_count = context_length * 50

    context_steps = int(context_length / segment_duration) + 1
    step_size = context_steps
    
    generated_notes = []
    max_val_timing = 0
    max_val_positioning = 0
    
    prev_note_segment = [[0]*217 for i in range(prediction_note_count)]
    prev_audio_segment = song_data[:context_steps, :]
    
    for i in tqdm(range(context_steps, song_data.shape[0] - context_steps, step_size)):
        curr_time = i * segment_duration
        
        x_context_prev_audio = prev_audio_segment
        x_context_prev_notes = prev_note_segment
        x_context_audio = song_data[i:i+context_steps, :]
        prediction = np.array(model([np.array([x_context_prev_audio]), np.array([x_context_prev_notes]), np.array([x_context_audio]), np.array([intensity_1]), np.array([intensity_2])], training=False)[0], dtype=np.float32)
        x_context_prev_audio = x_context_audio
        prev_note_segment = [[0]*217 for i in range(prediction_note_count)]

        # I use them to find values that would generate a reasonable number of notes.
        # Small adjustments to the model and it's loss function can significantly shift the actual number values.
        if max_val_timing < np.max(prediction[:, 0]):
            max_val_timing = np.max(prediction[:, 0])
            print(f"max_timing: {max_val_timing}")
        if max_val_positioning < np.max(prediction[:, 1:]):
            max_val_positioning = np.max(prediction[:, 1:])
            print(f"max_positioning: {max_val_positioning}")
        
        for j in range(prediction_note_count):
            curr_note_time = curr_time + j * 0.02
            prediction_timing = prediction[j][0]
            if prediction_timing < timing_threshhold:
                continue
            prediction_positioning = prediction[j][1:]
            
            # Place only 1 note per timing or to place many notes. More than 1 note can get super repetitive more easily, but both are of quite poor quality so far.
            max_one_note_per_placement = True
            if max_one_note_per_placement:
                note_prediction_iter = np.argmax(prediction_positioning)
                prediction_positioning_enumerated = [(note_prediction_iter, prediction_positioning[note_prediction_iter])]
            else:
                prediction_positioning_enumerated = [(i, note_prediction) for i, note_prediction in enumerate(prediction_positioning) if note_prediction > positioning_threshhold]

            for note_prediction_iter, note_prediction in prediction_positioning_enumerated:
                    line_layer = note_prediction_iter % 3
                    line_index = int(note_prediction_iter / 3) % 4
                    direction = int(note_prediction_iter / 12) % 9
                    color = int(note_prediction_iter / 108)
                    generated_notes.append(Note(curr_note_time, line_index, line_layer, color, direction))
                    prev_note_segment[j][0] = 1
                    prev_note_segment[j][1 + note_prediction_iter] = 1
    
    generated_notes.sort(key=lambda note: note.time)
    return generated_notes

intensity_timings_per_second = 9 # model input for number of correct timings per second
base_intensity_notes_per_second = intensity_timings_per_second * 2 # model input for sum of '1's in the prediction segment. Increasing this value should result in more stacks and sliders.
generated_notes = validate_model(song_data, segment_duration, timing_threshhold=0.35, positioning_threshhold=0.8, intensity_1=intensity_timings_per_second/25, intensity_2=base_intensity_notes_per_second/25)

if len(generated_notes) > 0:
    average_notes_per_second = len(generated_notes)/generated_notes[-1].time
else:
    average_notes_per_second = -1

print(f"Average nps: {average_notes_per_second}")

  1%|          | 2/171 [00:00<00:13, 12.65it/s]

tim: 0.8456912636756897
pos: 0.8791457414627075
tim: 0.9187914133071899
pos: 0.9310385584831238
tim: 0.9562196731567383


  4%|▎         | 6/171 [00:00<00:13, 12.55it/s]

pos: 0.9365254044532776
tim: 0.9650704860687256


  7%|▋         | 12/171 [00:00<00:12, 12.90it/s]

tim: 0.9686254262924194
tim: 0.9719038605690002


 12%|█▏        | 20/171 [00:01<00:11, 13.17it/s]

pos: 0.9486384987831116


 20%|█▉        | 34/171 [00:02<00:10, 13.49it/s]

tim: 0.9722039103507996
tim: 0.9756019711494446


 22%|██▏       | 38/171 [00:02<00:09, 13.46it/s]

tim: 0.9804081916809082


 30%|███       | 52/171 [00:03<00:08, 13.71it/s]

tim: 0.9809088706970215


 33%|███▎      | 56/171 [00:04<00:08, 13.82it/s]

pos: 0.9522011280059814


 54%|█████▍    | 92/171 [00:06<00:05, 13.85it/s]

tim: 0.9848037958145142


100%|██████████| 171/171 [00:12<00:00, 13.60it/s]

7.5992377264321425





In [57]:
with open(base_validation_path + validation_map + "/Info.dat", "rb") as f:
    info_json = json.load(f)
    bpm = info_json["_beatsPerMinute"]
    
with open(base_validation_path + validation_map + "/ExpertStandard.dat", "rb") as f:
    diff_json = json.load(f)

diff_json["_notes"] = [{"_time": note.time / 60 * bpm, "_lineIndex": int(note.lineIndex), "_lineLayer": int(note.lineLayer), "_type": int(note.type), "_cutDirection": int(note.direction)} for note in generated_notes]
with open(base_validation_path + validation_map + "/ExpertStandard.dat", "w") as f:
    json.dump(diff_json, f)
    
shutil.make_archive(base_validation_path + validation_map, 'zip', base_validation_path + validation_map)

'E:\\bs-map-generator\\src\\validation\\mazule.zip'