# Predicting Bach Chorales

### Data Extraction

In [64]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.layers import Reshape


In [22]:
import pandas as pd
import pickle
import numpy as np
import os
import sys
from sklearn.model_selection import train_test_split

In [None]:
data_root = "data/"
def get_data(first=False):
    df = pd.DataFrame()
    if not first:
        df = pickle.load(open(data_root + "bach.pkl", "rb"))
        return df
    folders = ["test", "train", "valid"]
    for folder in folders:
        for file in os.listdir(data_root + "jsb_chorales/" + folder):
            if file.endswith(".csv"):
                df = pd.concat([df, pd.read_csv(data_root + "jsb_chorales/" + folder + "/" + file)])
    pickle.dump(df, open(data_root + "bach.pkl", "wb"))
    return df

window_size = 48
def get_Xy(df):
    # 48 time steps, y: 49th
    X = []
    y = []
    for i in range(len(df) - window_size):
        X.append(df.iloc[i:i+window_size].values)
        y.append(df.iloc[i+window_size].values)
    return np.array(X), np.array(y)

In [31]:
df = get_data(True)
X, y = get_Xy(df)

In [32]:
X.shape, y.shape

((92488, 48, 4), (92488, 4))

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((73990, 48, 4), (73990, 4), (18498, 48, 4), (18498, 4))

In [None]:
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2])
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2])

y_train = y_train.reshape(y_train.shape[0], y_train.shape[1], 1)
y_test = y_test.reshape(y_test.shape[0], y_test.shape[1], 1)

In [86]:
X_train[0]

array([[69, 66, 61, 54],
       [71, 68, 62, 47],
       [71, 68, 62, 47],
       [71, 68, 62, 47],
       [71, 68, 62, 47],
       [68, 65, 61, 49],
       [68, 65, 61, 49],
       [68, 65, 61, 49],
       [68, 65, 61, 49],
       [68, 65, 61, 49],
       [68, 65, 61, 49],
       [68, 65, 61, 49],
       [68, 65, 61, 49],
       [69, 66, 61, 54],
       [69, 66, 61, 54],
       [69, 66, 61, 54],
       [69, 66, 61, 54],
       [71, 68, 64, 52],
       [71, 68, 64, 52],
       [71, 68, 64, 52],
       [71, 68, 64, 52],
       [73, 69, 64, 57],
       [73, 69, 64, 57],
       [73, 69, 64, 57],
       [73, 69, 64, 57],
       [71, 64, 64, 56],
       [71, 64, 64, 56],
       [73, 64, 64, 56],
       [73, 64, 64, 56],
       [74, 69, 62, 54],
       [74, 69, 62, 54],
       [74, 69, 62, 54],
       [74, 69, 62, 54],
       [73, 69, 64, 52],
       [73, 69, 64, 52],
       [73, 69, 64, 52],
       [73, 69, 64, 52],
       [71, 69, 66, 50],
       [71, 69, 66, 50],
       [71, 69, 66, 50],


In [68]:
model = Sequential([
    Input(shape=(48, 4)),
    GRU(256, return_sequences=True),
    GRU(256, return_sequences=False), 
    Dense(4), 
])

In [69]:
model.summary()

In [87]:
model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=1, batch_size=128, validation_split=0.2)

[1m356/463[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m10s[0m 99ms/step - accuracy: 0.0033 - loss: 8.2309

KeyboardInterrupt: 

In [83]:
model.predict(X_test[0:1])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step


array([[71.2924  , 65.73276 , 60.622593, 53.837933]], dtype=float32)

In [111]:
def next_chord(chord):
    nc_unrounded = model.predict(chord.reshape(1, window_size, 4), verbose=0)
    nc = np.round(nc_unrounded).reshape(1, 4)
    return nc

def extend_sequence(sequence, length):
    for i in range(length):
        next_chord_result = next_chord(sequence[-window_size:])
        sequence = np.concatenate((sequence, next_chord_result), axis=0)
    return sequence

extended = extend_sequence(X_test[0], 100)

In [112]:
extended

array([[69., 62., 54., 50.],
       [69., 62., 54., 50.],
       [69., 62., 54., 50.],
       [69., 62., 54., 50.],
       [69., 62., 54., 50.],
       [69., 62., 54., 50.],
       [69., 62., 54., 50.],
       [67., 59., 52., 52.],
       [67., 59., 52., 52.],
       [67., 59., 52., 52.],
       [67., 59., 52., 52.],
       [66., 62., 57., 50.],
       [66., 62., 57., 50.],
       [66., 62., 57., 50.],
       [66., 62., 57., 50.],
       [66., 62., 57., 50.],
       [66., 62., 57., 50.],
       [66., 62., 57., 50.],
       [66., 62., 57., 50.],
       [67., 62., 59., 43.],
       [67., 62., 59., 43.],
       [67., 62., 59., 43.],
       [67., 62., 59., 43.],
       [67., 62., 59., 43.],
       [67., 62., 59., 43.],
       [67., 62., 59., 43.],
       [67., 62., 59., 43.],
       [67., 62., 59., 43.],
       [67., 62., 59., 43.],
       [67., 62., 59., 43.],
       [67., 62., 59., 43.],
       [67., 62., 59., 55.],
       [67., 62., 59., 55.],
       [67., 62., 59., 55.],
       [67., 6

In [113]:
#save as csv
df = pd.DataFrame(extended)
df.to_csv("extended_sequence.csv", index=False, header=False)