# Generative Modeling of Financial Data with Matrix Product States
This notebook applies the MPS training method from the paper "Unsupervised Generative Modeling Using Matrix Product States" to a financial dataset.


<ins>Unsupervised Generative Modeling Using Matrix Product States </ins>\
Zhao-Yu Han, Jun Wang, Heng Fan, Lei Wang, and Pan Zhang \
Phys. Rev. X 8, 031012 – Published 17 July 2018 \
[https://doi.org/10.1103/PhysRevX.8.031012](https://doi.org/10.1103/PhysRevX.8.031012)

In [None]:
# Switch the working directory so that relative paths used later in the notebook
# (e.g. 'data/currencies.parquet') are resolved from this folder.
# NOTE(review): this is an absolute, environment-specific path; adjust it when
# running outside of that workspace.
%cd /workspaces/quantum-research/
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# NOTE(review): scienceplots is not referenced by name; presumably it is imported
# for its side effect of registering the 'science'/'ieee' styles used below - confirm.
import scienceplots
# Apply the plotting styles globally for all figures in this notebook.
plt.style.use(['science','ieee','no-latex'])

## Preprocess dataset

In [None]:
# Load the currency price table and preview its first rows.
df = pd.read_parquet('data/currencies.parquet')
print(df.head())

# JPYUSD gets its own panel; all remaining pairs share the second panel.
panels = {
    'JPYUSD': df['JPYUSD'],
    'Other currencies': df.drop(columns='JPYUSD'),
}
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
for panel_ax, (panel_title, panel_data) in zip(ax, panels.items()):
    panel_data.plot(ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.show()


Our dataset contains the daily mid price $P_i$ of each of the four currency pairs.

1. Compute Log Returns: $L_i = \log\left(\frac{P_{i+1}-P_i}{P_i}+1\right)$

2. Apply Standard Scaler: $S_i = \frac{L_i - \mu_L}{\sigma_L}$

In [None]:
# df = df[["EURUSD", "GBPUSD"]]  # uncomment to use only EURUSD and GBPUSD -> faster computation
num_features = df.shape[1]

# Report the number of missing values per column
print("NaN values:")
print(df.isna().sum())

# Log returns: log(1 + percentage change). Rows containing NaN (such as the
# first row, which has no predecessor) are dropped.
# NOTE: `df` is rebound from prices to scaled returns below, so re-running this
# cell without reloading the data transforms the already-transformed frame again.
log_returns = np.log(1 + df.pct_change()).dropna()

# Standard scaler: subtract the column mean and divide by the column standard deviation
df = (log_returns - log_returns.mean()) / log_returns.std()

# plot
df.plot()

In [None]:
# Draw one histogram per column of the scaled returns
hist_options = {'bins': 1000, 'figsize': (10, 3)}
df.hist(**hist_options)
plt.show()

## Binarize

In [None]:
from mps.utils import real_to_binary

# Encode every real value with a fixed number of bits.
# NOTE(review): the second return value presumably holds the min/max used for
# the conversion (judging by its name) - confirm in mps.utils.
bits_per_feature = 4
binarized = real_to_binary(df.to_numpy(), bits_per_feature)
df_binary, conv_min_max = binarized
print("Shape of the binary dataframe:", df_binary.shape)

In [None]:
print("Length of the dataset:", len(df_binary))

# Distinct rows together with how often each one occurs. Letting np.unique
# count the occurrences avoids rescanning the whole dataset once per unique row.
unique_samples, counts = np.unique(df_binary, axis=0, return_counts=True)
print("Number of unique samples:", len(unique_samples))

# Empirical probability of each unique sample (relative frequency)
prob = counts / len(df_binary)

# Shannon entropy in bits; every entry of prob is > 0 because only observed
# rows are listed, so log2 is well defined.
shannon_entropy = -np.sum(prob * np.log2(prob))
print("Shannon entropy of the dataset:", shannon_entropy)

In [None]:
from mps.utils import sample_info, array_to_str

def get_features_for_quasi_dist(samples_dict, bits_per_feature, num_features):
    """Split a distribution over full bit strings into one distribution per feature.

    The samples are passed through ``sample_info`` to obtain a 2-D bit array and
    a matching sequence of weights. For every feature, the columns
    ``[i * bits_per_feature, (i + 1) * bits_per_feature)`` are converted to
    strings with ``array_to_str`` and the weights of rows sharing the same
    string are summed.

    Args:
        samples_dict: Input forwarded unchanged to ``sample_info``.
            NOTE(review): presumably a mapping from bit string to count or
            probability - confirm against ``mps.utils.sample_info``.
        bits_per_feature: Number of consecutive bit columns that make up one feature.
        num_features: Number of features to extract.

    Returns:
        A list with ``num_features`` dictionaries, each mapping a feature's bit
        string to its accumulated weight.

    Raises:
        AssertionError: If the weights of a feature do not sum to 1.0 after
            rounding to 12 decimal places.
    """
    bit_matrix, weights = sample_info(samples_dict)
    per_feature = []
    for feature_idx in range(num_features):
        start = feature_idx * bits_per_feature
        stop = start + bits_per_feature
        labels = array_to_str(bit_matrix[:, start:stop]).tolist()

        marginal = {}
        for row_idx, label in enumerate(labels):
            weight = weights[row_idx]
            # Accumulate the weight of every row that shares this feature value
            marginal[label] = marginal[label] + weight if label in marginal else weight

        # Each marginal must still be a normalized distribution
        assert round(sum(marginal.values()), 12) == 1.0
        per_feature.append(marginal)

    return per_feature

In [None]:
from collections import Counter
from qiskit.visualization import plot_histogram

# Count the occurrences of each binarized training row, then split the
# resulting distribution into one dictionary per feature.
target_str = array_to_str(df_binary)
target_dict = Counter(target_str)
bin_feat_dicts = get_features_for_quasi_dist(
    samples_dict=target_dict,
    bits_per_feature=bits_per_feature,
    num_features=num_features,
)

In [None]:
%matplotlib inline

for i, binary_feature in enumerate(bin_feat_dicts):
    fig = plot_histogram(binary_feature, figsize=(10, 3), title=df.columns[i], bar_labels=False)
    plt.show()

## Train MPS

In [None]:
from mps.mps import MPS

# Build the MPS; its size argument is bits_per_feature for every column of df.
# NOTE(review): this presumably equals the number of bits per binarized sample
# (df_binary.shape[1]) - confirm.
m = MPS(bits_per_feature * df.shape[1])
# NOTE(review): presumably brings the MPS into left-canonical form - confirm in mps.mps.
m.left_cano()
# Hand the binarized dataset to the model.
m.designate_data(df_binary)
# NOTE(review): presumably prepares cached quantities needed for training;
# the call order is kept exactly as the author wrote it.
m.init_cumulants()

# Hyper-parameters are set as attributes before train() is called.
# NOTE(review): their meaning is inferred from the names only (truncation
# cutoff, gradient step length, descent steps per update) - confirm in mps.mps.
m.cutoff = 5e-5
m.descent_step_length = 0.05
m.descent_steps = 10
# NOTE(review): the argument is presumably the number of training loops - confirm.
m.train(10)

## Evaluation

Negative Log Likelihood (NLL) Loss Function
- $|\mathcal{T}|$: Size of training set
- $\nu$: Binary sample from the training set $\mathcal{T}$; $\mathbb{P}(\nu)$ is the probability the MPS assigns to it

$$\mathcal{L} = - \frac{1}{|\mathcal{T}|} \sum_{\nu \in \mathcal{T}} \ln(\mathbb{P}(\nu))$$

In [None]:
# Loss values recorded on the model, plotted on a logarithmic y-axis
loss = np.array(m.Loss)
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(loss)
loss_ax.set(xlabel='Iteration', ylabel='Loss', yscale='log')
plt.show()

In [None]:
# Draw samples from the trained MPS, one row per sample. The sample count is
# defined once so the buffer size and the loop cannot drift apart.
num_samples = 1000

# The buffer is pre-filled with NaN so that a row that was never written is easy to spot.
# NOTE(review): the NaN fill makes this a float array, so generated bits are
# stored as 0.0/1.0 - confirm array_to_str treats them like the rows of df_binary.
samples_gen = np.full((num_samples, df_binary.shape[1]), np.nan)
for i in range(num_samples):
    samples_gen[i] = m.generate_sample_1()

In [None]:
# Summarize the generated samples the same way as the target data: count the
# occurrences of each row, then split into one dictionary per feature.
generated_str = array_to_str(samples_gen)
generated_dict = Counter(generated_str)
gen_samples_dicts = get_features_for_quasi_dist(
    samples_dict=generated_dict,
    bits_per_feature=bits_per_feature,
    num_features=num_features,
)

In [None]:
%matplotlib inline

for i, generated_feature in enumerate(gen_samples_dicts):
    fig = plot_histogram([bin_feat_dicts[i], generated_feature], figsize=(10, 3), title=df.columns[i], bar_labels=False)
    plt.show()