# VCSEL Data Analysis with OptoTransformer
## Histogram Construction and Data Distribution Analysis

In this notebook, we will analyze VCSEL data using OptoTransformer.

### Main Objectives
1. **Loading and Preparing Data**:
   - Load data from a database or file.
   - Preprocess data and bring it to the required format.

2. **Histogram Construction**:
   - Construct histograms of the **EIGEN_ENERGY_1**, **EIGEN_ENERGY_2**, **Q** (quality factor), and **TMG** (threshold material gain) parameters.
   - Analyze main trends in the distribution.

In [None]:
import os
import yaml
import logging
import argparse
from argparse import Namespace
from src.utils.logging_config import setup_logging, log_config, log_model_params
from scripts.setup_utils import check_paths, validate_config, validate_model_params, default_model_type
from src.constants import EIGEN_ENERGY, QUALITY_FACTOR, THRESHOLD_GAIN
from src.models.create_model import create_model
from src.data.data_processing import process_data
from src.data.data_loader import load_data
from sklearn.model_selection import train_test_split
from src.data.vcsel_dataset import VCSELDataset
from src.training.train_model import train_model
from src.training.train_model_k_fold import train_model_k_fold
from src.training.test_model import test_model
from src.predict.predict import load_prediction_samples, predict

In [None]:
# Move to the parent directory; later cells open their config files by
# relative path.
# NOTE(review): presumably the parent is the project root — confirm.
# The guard makes the cell idempotent: re-running it in the same kernel no
# longer climbs one more directory level each time.
if "original_dir" not in globals():
    os.chdir('..')
    original_dir = os.getcwd()
print(f"Original Working Directory: {original_dir}")

# SINGLE LAYER

## Plot distribution of output energies

In [None]:
# Read the single-layer configuration file.
with open("config_single.yaml", 'r') as config_file:
    config = yaml.safe_load(config_file)

# Look up each configuration section once, then flatten the values into a
# single argparse-style namespace.
runtime_cfg = config['runtime']
model_cfg = config['model']
predictions_cfg = config['predictions']
training_cfg = config['training']
scheduler_cfg = training_cfg['scheduler_params']

args = Namespace(
    # values from the 'runtime' section
    train=runtime_cfg['train'],
    predict=runtime_cfg['predict'],
    load_weights=runtime_cfg['load_weights'],
    # values from the 'data' and 'model' sections
    data_path=config['data']['path'],
    model_path=model_cfg['params_path'],
    weight_path=model_cfg['weight_path'],
    # values from the 'predictions' section
    predictions_dir=predictions_cfg['directory'],
    samples_file=predictions_cfg['samples_file'],
    # values from the 'training' section
    batch_size=training_cfg['batch_size'],
    num_epochs=training_cfg['num_epochs'],
    learning_rate=training_cfg['learning_rate'],
    use_scheduler=training_cfg['use_scheduler'],
    scheduler_factor=scheduler_cfg['factor'],
    scheduler_patience=scheduler_cfg['patience'],
    # fixed for this section of the notebook, not read from the file
    model_type="eigen_energy",
)

In [None]:
# Load the raw samples for the model type selected in `args`.
data = load_data(args.data_path, args.model_type)

# Two-stage split with a fixed seed: 10% is held out for evaluation, then 10%
# of the remainder is held out for validation.
split_kwargs = {"test_size": 0.1, "random_state": 42}
train_data, evaluate_data = train_test_split(data, **split_kwargs)
train_data, validate_data = train_test_split(train_data, **split_kwargs)

# Process each split (train first) and wrap it in a dataset.
process_args = (args.model_path, args.model_type)

train_data = process_data(train_data, *process_args, is_train=True)
train_dataset = VCSELDataset(train_data)

validate_data = process_data(validate_data, *process_args, is_validate=True)
validate_dataset = VCSELDataset(validate_data)

evaluate_data = process_data(evaluate_data, *process_args, is_test=True)
evaluate_dataset = VCSELDataset(evaluate_data)

In [None]:
def _scaled_energies(dataset, token_index):
    """Return output token *token_index* of every sample, multiplied by 10."""
    return [sample["output_tokens"][token_index].item() * 10 for sample in dataset]


train_eigen_energy_1 = _scaled_energies(train_dataset, 0)
train_eigen_energy_2 = _scaled_energies(train_dataset, 1)

validate_eigen_energy_1 = _scaled_energies(validate_dataset, 0)
validate_eigen_energy_2 = _scaled_energies(validate_dataset, 1)

evaluate_eigen_energy_1 = _scaled_energies(evaluate_dataset, 0)
evaluate_eigen_energy_2 = _scaled_energies(evaluate_dataset, 1)

# Concatenate the splits in the order train, validate, evaluate.
combined_eigen_energy_1 = [*train_eigen_energy_1, *validate_eigen_energy_1, *evaluate_eigen_energy_1]
combined_eigen_energy_2 = [*train_eigen_energy_2, *validate_eigen_energy_2, *evaluate_eigen_energy_2]

# Dump each combined list to a text file, one value per line.
with open('SINGLE_eigen_energy_1.txt', 'w') as energy_1_file:
    energy_1_file.writelines(f"{value}\n" for value in combined_eigen_energy_1)

with open('SINGLE_eigen_energy_2.txt', 'w') as energy_2_file:
    energy_2_file.writelines(f"{value}\n" for value in combined_eigen_energy_2)


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

def plot_distribution(dataset):
    """Show side-by-side histograms of the first two output tokens of *dataset*.

    Each sample is indexed with ``"output_tokens"``; tokens 0 and 1 are
    multiplied by 10 and plotted as EIGEN_ENERGY_1 and EIGEN_ENERGY_2.
    NOTE(review): the factor 10 presumably undoes a scaling applied during
    data processing — confirm.
    """
    # One pass over the dataset per token index, token 0 first.
    energies_per_token = [
        [sample["output_tokens"][token_index].item() * 10 for sample in dataset]
        for token_index in (0, 1)
    ]

    plt.figure(figsize=(10, 5))

    # Panel numbers are 1-based and coincide with the energy's suffix.
    for panel, energies in enumerate(energies_per_token, start=1):
        plt.subplot(1, 2, panel)
        plt.hist(energies, bins=20, alpha=0.7)
        plt.title(f"Distribution of EIGEN_ENERGY_{panel}")
        plt.xlabel("Energy, eV")
        plt.ylabel("Frequency")

    plt.tight_layout()
    plt.show()

### Distribution for Datasets

In [None]:
# Plot the energy histograms for each split in turn: train, validate, evaluate.
for split_dataset in (train_dataset, validate_dataset, evaluate_dataset):
    plot_distribution(split_dataset)

## Plot distribution of output Quality Factor

In [None]:
# Read the single-layer configuration file.
with open("config_single.yaml", 'r') as config_file:
    config = yaml.safe_load(config_file)

# Look up each configuration section once, then flatten the values into a
# single argparse-style namespace.
runtime_cfg = config['runtime']
model_cfg = config['model']
predictions_cfg = config['predictions']
training_cfg = config['training']
scheduler_cfg = training_cfg['scheduler_params']

args = Namespace(
    # values from the 'runtime' section
    train=runtime_cfg['train'],
    predict=runtime_cfg['predict'],
    load_weights=runtime_cfg['load_weights'],
    # values from the 'data' and 'model' sections
    data_path=config['data']['path'],
    model_path=model_cfg['params_path'],
    weight_path=model_cfg['weight_path'],
    # values from the 'predictions' section
    predictions_dir=predictions_cfg['directory'],
    samples_file=predictions_cfg['samples_file'],
    # values from the 'training' section
    batch_size=training_cfg['batch_size'],
    num_epochs=training_cfg['num_epochs'],
    learning_rate=training_cfg['learning_rate'],
    use_scheduler=training_cfg['use_scheduler'],
    scheduler_factor=scheduler_cfg['factor'],
    scheduler_patience=scheduler_cfg['patience'],
    # fixed for this section of the notebook, not read from the file
    model_type="quality_factor",
)

In [None]:
# Load the raw samples for the model type selected in `args`.
data = load_data(args.data_path, args.model_type)

# Two-stage split with a fixed seed: 10% is held out for evaluation, then 10%
# of the remainder is held out for validation.
split_kwargs = {"test_size": 0.1, "random_state": 42}
train_data, evaluate_data = train_test_split(data, **split_kwargs)
train_data, validate_data = train_test_split(train_data, **split_kwargs)

# Process each split (train first) and wrap it in a dataset.
process_args = (args.model_path, args.model_type)

train_data = process_data(train_data, *process_args, is_train=True)
train_dataset = VCSELDataset(train_data)

validate_data = process_data(validate_data, *process_args, is_validate=True)
validate_dataset = VCSELDataset(validate_data)

evaluate_data = process_data(evaluate_data, *process_args, is_test=True)
evaluate_dataset = VCSELDataset(evaluate_data)

In [None]:
from src.data.utilities import log_denormalize
def _quality_factors(dataset):
    """Return log_denormalize applied to the single output token of every sample."""
    return [log_denormalize(sample["output_tokens"].item()) for sample in dataset]


train_quality_factor = _quality_factors(train_dataset)
validate_quality_factor = _quality_factors(validate_dataset)
evaluate_quality_factor = _quality_factors(evaluate_dataset)

# Concatenate the splits in the order train, validate, evaluate.
combined_quality_factor = [*train_quality_factor, *validate_quality_factor, *evaluate_quality_factor]

# Dump the combined list to a text file, one value per line.
with open('SINGLE_quality_factor.txt', 'w') as quality_factor_file:
    quality_factor_file.writelines(f"{value}\n" for value in combined_quality_factor)


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from src.data.utilities import log_denormalize

def plot_distribution(dataset, title):
    """Plot a histogram of the quality factor for every sample in *dataset*.

    Each sample's single ``output_tokens`` value is passed through
    ``log_denormalize`` before being histogrammed.

    Args:
        dataset: Iterable of samples, each indexable with ``"output_tokens"``
            and yielding a value that supports ``.item()``.
        title: Name of the split, inserted into the figure title.
    """
    quality_factor = [log_denormalize(sample["output_tokens"].item()) for sample in dataset]

    plt.figure(figsize=(10, 5))

    plt.hist(quality_factor, bins=35, alpha=0.7)
    plt.title(f"Distribution of Quality Factor for {title} dataset")
    plt.xlabel("Q")
    plt.ylabel("Frequency")

    # Finalise and render the figure here, as the other plotting helpers in
    # this notebook do, instead of relying on the inline backend to flush
    # open figures at the end of the cell.
    plt.tight_layout()
    plt.show()


### Distribution for datasets

In [None]:
# Plot the quality-factor histogram for each split, labelled by split name.
for split_name, split_dataset in (
    ("train", train_dataset),
    ("validate", validate_dataset),
    ("test", evaluate_dataset),
):
    plot_distribution(split_dataset, split_name)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from src.data.utilities import log_denormalize

%matplotlib inline

def plot_log_distribution(dataset, title):
    """Plot a histogram of log10 of the quality factor for *dataset*.

    Each sample's single ``output_tokens`` value is passed through
    ``log_denormalize``; the base-10 logarithm of the result is histogrammed.
    The smallest and largest log10 values are printed before plotting.

    Args:
        dataset: Iterable of samples, each indexable with ``"output_tokens"``
            and yielding a value that supports ``.item()``.
        title: Name of the split, inserted into the figure title.
    """
    quality_factor = [log_denormalize(sample["output_tokens"].item()) for sample in dataset]
    log_quality_factor = np.log10(quality_factor)

    plt.figure(figsize=(10, 5))

    # Report the range of the plotted values (minimum, then maximum).
    print(min(log_quality_factor))
    print(max(log_quality_factor))

    plt.hist(log_quality_factor, bins=35, alpha=0.7)
    plt.title(f"Distribution of log of Quality Factor for {title} dataset")
    # The plotted quantity is log10(Q), not Q itself.
    plt.xlabel("log10(Q)")
    plt.ylabel("Frequency")

    # Finalise and render the figure here, as the other plotting helpers in
    # this notebook do.
    plt.tight_layout()
    plt.show()

### Distribution for log datasets

In [None]:
# Plot the log quality-factor histogram for each split, labelled by split name.
for split_name, split_dataset in (
    ("train", train_dataset),
    ("validate", validate_dataset),
    ("test", evaluate_dataset),
):
    plot_log_distribution(split_dataset, split_name)

## Plot distribution of output Threshold Material Gain

In [None]:
# Read the single-layer configuration file.
with open("config_single.yaml", 'r') as config_file:
    config = yaml.safe_load(config_file)

# Look up each configuration section once, then flatten the values into a
# single argparse-style namespace.
runtime_cfg = config['runtime']
model_cfg = config['model']
predictions_cfg = config['predictions']
training_cfg = config['training']
scheduler_cfg = training_cfg['scheduler_params']

args = Namespace(
    # values from the 'runtime' section
    train=runtime_cfg['train'],
    predict=runtime_cfg['predict'],
    load_weights=runtime_cfg['load_weights'],
    # values from the 'data' and 'model' sections
    data_path=config['data']['path'],
    model_path=model_cfg['params_path'],
    weight_path=model_cfg['weight_path'],
    # values from the 'predictions' section
    predictions_dir=predictions_cfg['directory'],
    samples_file=predictions_cfg['samples_file'],
    # values from the 'training' section
    batch_size=training_cfg['batch_size'],
    num_epochs=training_cfg['num_epochs'],
    learning_rate=training_cfg['learning_rate'],
    use_scheduler=training_cfg['use_scheduler'],
    scheduler_factor=scheduler_cfg['factor'],
    scheduler_patience=scheduler_cfg['patience'],
    # fixed for this section of the notebook, not read from the file
    model_type="threshold_material_gain",
)

In [None]:
# Load the raw samples for the model type selected in `args`.
data = load_data(args.data_path, args.model_type)

# Two-stage split with a fixed seed: 10% is held out for evaluation, then 10%
# of the remainder is held out for validation.
split_kwargs = {"test_size": 0.1, "random_state": 42}
train_data, evaluate_data = train_test_split(data, **split_kwargs)
train_data, validate_data = train_test_split(train_data, **split_kwargs)

# Process each split (train first) and wrap it in a dataset.
process_args = (args.model_path, args.model_type)

train_data = process_data(train_data, *process_args, is_train=True)
train_dataset = VCSELDataset(train_data)

validate_data = process_data(validate_data, *process_args, is_validate=True)
validate_dataset = VCSELDataset(validate_data)

evaluate_data = process_data(evaluate_data, *process_args, is_test=True)
evaluate_dataset = VCSELDataset(evaluate_data)

In [None]:
from src.data.utilities import convert_k_to_gain

def _energy_k_gain(dataset):
    """Return (energies, k values, gains) for every sample of *dataset*.

    Energies are output token 0 multiplied by 10, k values are output token 1
    as stored, and each gain is the sign-flipped result of
    ``convert_k_to_gain(energy, k)``.
    """
    energies = [sample["output_tokens"][0].item() * 10 for sample in dataset]
    k_values = [sample["output_tokens"][1].item() for sample in dataset]
    gains = [-1*convert_k_to_gain(energy, k_value) for energy, k_value in zip(energies, k_values)]
    return energies, k_values, gains


train_eigen_energy_1, train_k, train_gain = _energy_k_gain(train_dataset)
validate_eigen_energy_1, validate_k, validate_gain = _energy_k_gain(validate_dataset)
evaluate_eigen_energy_1, evaluate_k, evaluate_gain = _energy_k_gain(evaluate_dataset)

# Concatenate the splits in the order train, validate, evaluate.
combined_eigen_energy_1 = [*train_eigen_energy_1, *validate_eigen_energy_1, *evaluate_eigen_energy_1]
combined_gain = [*train_gain, *validate_gain, *evaluate_gain]

# Dump each combined list to a text file, one value per line.
with open('SINGLE_eigen_energy.txt', 'w') as energy_file:
    energy_file.writelines(f"{value}\n" for value in combined_eigen_energy_1)

with open('SINGLE_gain.txt', 'w') as gain_file:
    gain_file.writelines(f"{value}\n" for value in combined_gain)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

def plot_distribution(dataset, title):
    """Show side-by-side histograms of energy and k for *dataset*.

    Left panel: output token 0 multiplied by 10 (axis "Energy, eV").
    Right panel: output token 1 as stored (axis "k"); its title includes
    *title*, the name of the split.
    """
    energies = [sample["output_tokens"][0].item() * 10 for sample in dataset]
    k_values = [sample["output_tokens"][1].item() for sample in dataset]

    # (values, panel title, x-axis label) for the left and right panels.
    panels = (
        (energies, "Distribution of Energy", "Energy, eV"),
        (k_values, f"Distribution of Threshold Material Gain for {title} dataset", "k"),
    )

    plt.figure(figsize=(10, 5))

    for position, (values, heading, x_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.hist(values, bins=20, alpha=0.7)
        plt.title(heading)
        plt.xlabel(x_label)
        plt.ylabel("Frequency")

    plt.tight_layout()
    plt.show()


### Distribution for datasets

In [None]:
# Plot the energy and k histograms for each split, labelled by split name.
for split_name, split_dataset in (
    ("train", train_dataset),
    ("validate", validate_dataset),
    ("test", evaluate_dataset),
):
    plot_distribution(split_dataset, split_name)

### Distribution for datasets for gain in cm^-1

In [None]:
import matplotlib.pyplot as plt
from src.data.utilities import convert_k_to_gain
%matplotlib inline


def plot_distribution_gain_case(dataset, title):
    """Show side-by-side histograms of energy and converted gain for *dataset*.

    Left panel: output token 0 multiplied by 10 (axis "Energy, eV").
    Right panel: the sign-flipped result of ``convert_k_to_gain(energy, k)``,
    where k is output token 1; its title includes *title*, the split name.
    """
    energies = [sample["output_tokens"][0].item() * 10 for sample in dataset]
    k_values = [sample["output_tokens"][1].item() for sample in dataset]
    gains = [-1*convert_k_to_gain(energy, k_value) for energy, k_value in zip(energies, k_values)]

    # (values, panel title, x-axis label) for the left and right panels.
    panels = (
        (energies, "Distribution of Energy", "Energy, eV"),
        (gains, f"Distribution of Threshold Material Gain for {title} dataset", "Gain, $cm^{-1}$"),
    )

    plt.figure(figsize=(10, 5))

    for position, (values, heading, x_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.hist(values, bins=20, alpha=0.7)
        plt.title(heading)
        plt.xlabel(x_label)
        plt.ylabel("Frequency")

    plt.tight_layout()
    plt.show()



### Distribution for datasets

In [None]:
# Plot the energy and gain histograms for each split, labelled by split name.
for split_name, split_dataset in (
    ("train", train_dataset),
    ("validate", validate_dataset),
    ("test", evaluate_dataset),
):
    plot_distribution_gain_case(split_dataset, split_name)

# DBR

## Plot distribution of output energies

In [None]:
# Read the DBR configuration file.
with open("config_DBR.yaml", 'r') as config_file:
    config = yaml.safe_load(config_file)

# Look up each configuration section once, then flatten the values into a
# single argparse-style namespace.
runtime_cfg = config['runtime']
model_cfg = config['model']
predictions_cfg = config['predictions']
training_cfg = config['training']
scheduler_cfg = training_cfg['scheduler_params']

args = Namespace(
    # values from the 'runtime' section
    train=runtime_cfg['train'],
    predict=runtime_cfg['predict'],
    load_weights=runtime_cfg['load_weights'],
    # values from the 'data' and 'model' sections
    data_path=config['data']['path'],
    model_path=model_cfg['params_path'],
    weight_path=model_cfg['weight_path'],
    # values from the 'predictions' section
    predictions_dir=predictions_cfg['directory'],
    samples_file=predictions_cfg['samples_file'],
    # values from the 'training' section
    batch_size=training_cfg['batch_size'],
    num_epochs=training_cfg['num_epochs'],
    learning_rate=training_cfg['learning_rate'],
    use_scheduler=training_cfg['use_scheduler'],
    scheduler_factor=scheduler_cfg['factor'],
    scheduler_patience=scheduler_cfg['patience'],
    # fixed for this section of the notebook, not read from the file
    model_type="eigen_energy",
)

In [None]:
# Load the raw samples for the model type selected in `args`.
data = load_data(args.data_path, args.model_type)

# Two-stage split with a fixed seed: 10% is held out for evaluation, then 10%
# of the remainder is held out for validation.
split_kwargs = {"test_size": 0.1, "random_state": 42}
train_data, evaluate_data = train_test_split(data, **split_kwargs)
train_data, validate_data = train_test_split(train_data, **split_kwargs)

# Process each split (train first) and wrap it in a dataset.
process_args = (args.model_path, args.model_type)

train_data = process_data(train_data, *process_args, is_train=True)
train_dataset = VCSELDataset(train_data)

validate_data = process_data(validate_data, *process_args, is_validate=True)
validate_dataset = VCSELDataset(validate_data)

evaluate_data = process_data(evaluate_data, *process_args, is_test=True)
evaluate_dataset = VCSELDataset(evaluate_data)

In [None]:
def _scaled_energies(dataset, token_index):
    """Return output token *token_index* of every sample, multiplied by 10."""
    return [sample["output_tokens"][token_index].item() * 10 for sample in dataset]


train_eigen_energy_1 = _scaled_energies(train_dataset, 0)
train_eigen_energy_2 = _scaled_energies(train_dataset, 1)

validate_eigen_energy_1 = _scaled_energies(validate_dataset, 0)
validate_eigen_energy_2 = _scaled_energies(validate_dataset, 1)

evaluate_eigen_energy_1 = _scaled_energies(evaluate_dataset, 0)
evaluate_eigen_energy_2 = _scaled_energies(evaluate_dataset, 1)

# Concatenate the splits in the order train, validate, evaluate.
combined_eigen_energy_1 = [*train_eigen_energy_1, *validate_eigen_energy_1, *evaluate_eigen_energy_1]
combined_eigen_energy_2 = [*train_eigen_energy_2, *validate_eigen_energy_2, *evaluate_eigen_energy_2]

# Dump each combined list to a text file, one value per line.
with open('DBR_eigen_energy_1.txt', 'w') as energy_1_file:
    energy_1_file.writelines(f"{value}\n" for value in combined_eigen_energy_1)

with open('DBR_eigen_energy_2.txt', 'w') as energy_2_file:
    energy_2_file.writelines(f"{value}\n" for value in combined_eigen_energy_2)


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

def plot_distribution(dataset):
    """Show side-by-side histograms of the first two output tokens of *dataset*.

    Each sample is indexed with ``"output_tokens"``; tokens 0 and 1 are
    multiplied by 10 and plotted as EIGEN_ENERGY_1 and EIGEN_ENERGY_2.
    NOTE(review): the factor 10 presumably undoes a scaling applied during
    data processing — confirm.
    """
    # One pass over the dataset per token index, token 0 first.
    energies_per_token = [
        [sample["output_tokens"][token_index].item() * 10 for sample in dataset]
        for token_index in (0, 1)
    ]

    plt.figure(figsize=(10, 5))

    # Panel numbers are 1-based and coincide with the energy's suffix.
    for panel, energies in enumerate(energies_per_token, start=1):
        plt.subplot(1, 2, panel)
        plt.hist(energies, bins=20, alpha=0.7)
        plt.title(f"Distribution of EIGEN_ENERGY_{panel}")
        plt.xlabel("Energy, eV")
        plt.ylabel("Frequency")

    plt.tight_layout()
    plt.show()

### Distribution for Datasets

In [None]:
# Plot the energy histograms for each split in turn: train, validate, evaluate.
for split_dataset in (train_dataset, validate_dataset, evaluate_dataset):
    plot_distribution(split_dataset)

## Plot distribution of output Quality Factor

In [None]:
# Read the DBR configuration file.
with open("config_DBR.yaml", 'r') as config_file:
    config = yaml.safe_load(config_file)

# Look up each configuration section once, then flatten the values into a
# single argparse-style namespace.
runtime_cfg = config['runtime']
model_cfg = config['model']
predictions_cfg = config['predictions']
training_cfg = config['training']
scheduler_cfg = training_cfg['scheduler_params']

args = Namespace(
    # values from the 'runtime' section
    train=runtime_cfg['train'],
    predict=runtime_cfg['predict'],
    load_weights=runtime_cfg['load_weights'],
    # values from the 'data' and 'model' sections
    data_path=config['data']['path'],
    model_path=model_cfg['params_path'],
    weight_path=model_cfg['weight_path'],
    # values from the 'predictions' section
    predictions_dir=predictions_cfg['directory'],
    samples_file=predictions_cfg['samples_file'],
    # values from the 'training' section
    batch_size=training_cfg['batch_size'],
    num_epochs=training_cfg['num_epochs'],
    learning_rate=training_cfg['learning_rate'],
    use_scheduler=training_cfg['use_scheduler'],
    scheduler_factor=scheduler_cfg['factor'],
    scheduler_patience=scheduler_cfg['patience'],
    # fixed for this section of the notebook, not read from the file
    model_type="quality_factor",
)

In [None]:
# Load the raw samples for the model type selected in `args`.
data = load_data(args.data_path, args.model_type)

# Two-stage split with a fixed seed: 10% is held out for evaluation, then 10%
# of the remainder is held out for validation.
split_kwargs = {"test_size": 0.1, "random_state": 42}
train_data, evaluate_data = train_test_split(data, **split_kwargs)
train_data, validate_data = train_test_split(train_data, **split_kwargs)

# Process each split (train first) and wrap it in a dataset.
process_args = (args.model_path, args.model_type)

train_data = process_data(train_data, *process_args, is_train=True)
train_dataset = VCSELDataset(train_data)

validate_data = process_data(validate_data, *process_args, is_validate=True)
validate_dataset = VCSELDataset(validate_data)

evaluate_data = process_data(evaluate_data, *process_args, is_test=True)
evaluate_dataset = VCSELDataset(evaluate_data)

In [None]:
from src.data.utilities import log_denormalize
def _quality_factors(dataset):
    """Return log_denormalize applied to the single output token of every sample."""
    return [log_denormalize(sample["output_tokens"].item()) for sample in dataset]


train_quality_factor = _quality_factors(train_dataset)
validate_quality_factor = _quality_factors(validate_dataset)
evaluate_quality_factor = _quality_factors(evaluate_dataset)

# Concatenate the splits in the order train, validate, evaluate.
combined_quality_factor = [*train_quality_factor, *validate_quality_factor, *evaluate_quality_factor]

# Dump the combined list to a text file, one value per line.
with open('DBR_quality_factor.txt', 'w') as quality_factor_file:
    quality_factor_file.writelines(f"{value}\n" for value in combined_quality_factor)


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from src.data.utilities import log_denormalize

def plot_distribution(dataset, title):
    """Plot a histogram of the quality factor for every sample in *dataset*.

    Each sample's single ``output_tokens`` value is passed through
    ``log_denormalize`` before being histogrammed.

    Args:
        dataset: Iterable of samples, each indexable with ``"output_tokens"``
            and yielding a value that supports ``.item()``.
        title: Name of the split, inserted into the figure title.
    """
    quality_factor = [log_denormalize(sample["output_tokens"].item()) for sample in dataset]

    plt.figure(figsize=(10, 5))

    plt.hist(quality_factor, bins=35, alpha=0.7)
    plt.title(f"Distribution of Quality Factor for {title} dataset")
    plt.xlabel("Q")
    plt.ylabel("Frequency")

    # Finalise and render the figure here, as the other plotting helpers in
    # this notebook do, instead of relying on the inline backend to flush
    # open figures at the end of the cell.
    plt.tight_layout()
    plt.show()


### Distribution for datasets

In [None]:
# Plot the quality-factor histogram for each split, labelled by split name.
for split_name, split_dataset in (
    ("train", train_dataset),
    ("validate", validate_dataset),
    ("test", evaluate_dataset),
):
    plot_distribution(split_dataset, split_name)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from src.data.utilities import log_denormalize

%matplotlib inline

def plot_log_distribution(dataset, title):
    """Plot a histogram of log10 of the quality factor for *dataset*.

    Each sample's single ``output_tokens`` value is passed through
    ``log_denormalize``; the base-10 logarithm of the result is histogrammed.
    The smallest and largest log10 values are printed before plotting.

    Args:
        dataset: Iterable of samples, each indexable with ``"output_tokens"``
            and yielding a value that supports ``.item()``.
        title: Name of the split, inserted into the figure title.
    """
    quality_factor = [log_denormalize(sample["output_tokens"].item()) for sample in dataset]
    log_quality_factor = np.log10(quality_factor)

    plt.figure(figsize=(10, 5))

    # Report the range of the plotted values (minimum, then maximum).
    print(min(log_quality_factor))
    print(max(log_quality_factor))

    plt.hist(log_quality_factor, bins=35, alpha=0.7)
    plt.title(f"Distribution of log of Quality Factor for {title} dataset")
    # The plotted quantity is log10(Q), not Q itself.
    plt.xlabel("log10(Q)")
    plt.ylabel("Frequency")

    # Finalise and render the figure here, as the other plotting helpers in
    # this notebook do.
    plt.tight_layout()
    plt.show()

### Distribution for log datasets

In [None]:
# Plot the log quality-factor histogram for each split, labelled by split name.
for split_name, split_dataset in (
    ("train", train_dataset),
    ("validate", validate_dataset),
    ("test", evaluate_dataset),
):
    plot_log_distribution(split_dataset, split_name)

# VCSEL

## Plot distribution of output energies

In [None]:
# Read the VCSEL configuration file.
with open("config_VCSEL.yaml", 'r') as config_file:
    config = yaml.safe_load(config_file)

# Look up each configuration section once, then flatten the values into a
# single argparse-style namespace.
runtime_cfg = config['runtime']
model_cfg = config['model']
predictions_cfg = config['predictions']
training_cfg = config['training']
scheduler_cfg = training_cfg['scheduler_params']

args = Namespace(
    # values from the 'runtime' section
    train=runtime_cfg['train'],
    predict=runtime_cfg['predict'],
    load_weights=runtime_cfg['load_weights'],
    # values from the 'data' and 'model' sections
    data_path=config['data']['path'],
    model_path=model_cfg['params_path'],
    weight_path=model_cfg['weight_path'],
    # values from the 'predictions' section
    predictions_dir=predictions_cfg['directory'],
    samples_file=predictions_cfg['samples_file'],
    # values from the 'training' section
    batch_size=training_cfg['batch_size'],
    num_epochs=training_cfg['num_epochs'],
    learning_rate=training_cfg['learning_rate'],
    use_scheduler=training_cfg['use_scheduler'],
    scheduler_factor=scheduler_cfg['factor'],
    scheduler_patience=scheduler_cfg['patience'],
    # fixed for this section of the notebook, not read from the file
    model_type="eigen_energy",
)

In [None]:
# Load the raw samples for the model type selected in `args`.
data = load_data(args.data_path, args.model_type)

# Two-stage split with a fixed seed: 10% is held out for evaluation, then 10%
# of the remainder is held out for validation.
split_kwargs = {"test_size": 0.1, "random_state": 42}
train_data, evaluate_data = train_test_split(data, **split_kwargs)
train_data, validate_data = train_test_split(train_data, **split_kwargs)

# Process each split (train first) and wrap it in a dataset.
process_args = (args.model_path, args.model_type)

train_data = process_data(train_data, *process_args, is_train=True)
train_dataset = VCSELDataset(train_data)

validate_data = process_data(validate_data, *process_args, is_validate=True)
validate_dataset = VCSELDataset(validate_data)

evaluate_data = process_data(evaluate_data, *process_args, is_test=True)
evaluate_dataset = VCSELDataset(evaluate_data)

In [None]:
def _scaled_energies(dataset, token_index):
    """Return output token *token_index* of every sample, multiplied by 10."""
    return [sample["output_tokens"][token_index].item() * 10 for sample in dataset]


train_eigen_energy_1 = _scaled_energies(train_dataset, 0)
train_eigen_energy_2 = _scaled_energies(train_dataset, 1)

validate_eigen_energy_1 = _scaled_energies(validate_dataset, 0)
validate_eigen_energy_2 = _scaled_energies(validate_dataset, 1)

evaluate_eigen_energy_1 = _scaled_energies(evaluate_dataset, 0)
evaluate_eigen_energy_2 = _scaled_energies(evaluate_dataset, 1)

# Concatenate the splits in the order train, validate, evaluate.
combined_eigen_energy_1 = [*train_eigen_energy_1, *validate_eigen_energy_1, *evaluate_eigen_energy_1]
combined_eigen_energy_2 = [*train_eigen_energy_2, *validate_eigen_energy_2, *evaluate_eigen_energy_2]

# Dump each combined list to a text file, one value per line.
with open('VCSEL_eigen_energy_1.txt', 'w') as energy_1_file:
    energy_1_file.writelines(f"{value}\n" for value in combined_eigen_energy_1)

with open('VCSEL_eigen_energy_2.txt', 'w') as energy_2_file:
    energy_2_file.writelines(f"{value}\n" for value in combined_eigen_energy_2)


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

def plot_distribution(dataset):
    """Show side-by-side histograms of the first two output tokens of *dataset*.

    Each sample is indexed with ``"output_tokens"``; tokens 0 and 1 are
    multiplied by 10 and plotted as EIGEN_ENERGY_1 and EIGEN_ENERGY_2.
    NOTE(review): the factor 10 presumably undoes a scaling applied during
    data processing — confirm.
    """
    # One pass over the dataset per token index, token 0 first.
    energies_per_token = [
        [sample["output_tokens"][token_index].item() * 10 for sample in dataset]
        for token_index in (0, 1)
    ]

    plt.figure(figsize=(10, 5))

    # Panel numbers are 1-based and coincide with the energy's suffix.
    for panel, energies in enumerate(energies_per_token, start=1):
        plt.subplot(1, 2, panel)
        plt.hist(energies, bins=20, alpha=0.7)
        plt.title(f"Distribution of EIGEN_ENERGY_{panel}")
        plt.xlabel("Energy, eV")
        plt.ylabel("Frequency")

    plt.tight_layout()
    plt.show()

### Distribution for Datasets

In [None]:
# Plot the energy histograms for each split in turn: train, validate, evaluate.
for split_dataset in (train_dataset, validate_dataset, evaluate_dataset):
    plot_distribution(split_dataset)

## Plot distribution of output Quality Factor

In [None]:
# Read the VCSEL configuration file.
with open("config_VCSEL.yaml", 'r') as config_file:
    config = yaml.safe_load(config_file)

# Look up each configuration section once, then flatten the values into a
# single argparse-style namespace.
runtime_cfg = config['runtime']
model_cfg = config['model']
predictions_cfg = config['predictions']
training_cfg = config['training']
scheduler_cfg = training_cfg['scheduler_params']

args = Namespace(
    # values from the 'runtime' section
    train=runtime_cfg['train'],
    predict=runtime_cfg['predict'],
    load_weights=runtime_cfg['load_weights'],
    # values from the 'data' and 'model' sections
    data_path=config['data']['path'],
    model_path=model_cfg['params_path'],
    weight_path=model_cfg['weight_path'],
    # values from the 'predictions' section
    predictions_dir=predictions_cfg['directory'],
    samples_file=predictions_cfg['samples_file'],
    # values from the 'training' section
    batch_size=training_cfg['batch_size'],
    num_epochs=training_cfg['num_epochs'],
    learning_rate=training_cfg['learning_rate'],
    use_scheduler=training_cfg['use_scheduler'],
    scheduler_factor=scheduler_cfg['factor'],
    scheduler_patience=scheduler_cfg['patience'],
    # fixed for this section of the notebook, not read from the file
    model_type="quality_factor",
)

In [None]:
# Load the raw samples for the model type selected in `args`.
data = load_data(args.data_path, args.model_type)

# Two-stage split with a fixed seed: 10% is held out for evaluation, then 10%
# of the remainder is held out for validation.
split_kwargs = {"test_size": 0.1, "random_state": 42}
train_data, evaluate_data = train_test_split(data, **split_kwargs)
train_data, validate_data = train_test_split(train_data, **split_kwargs)

# Process each split (train first) and wrap it in a dataset.
process_args = (args.model_path, args.model_type)

train_data = process_data(train_data, *process_args, is_train=True)
train_dataset = VCSELDataset(train_data)

validate_data = process_data(validate_data, *process_args, is_validate=True)
validate_dataset = VCSELDataset(validate_data)

evaluate_data = process_data(evaluate_data, *process_args, is_test=True)
evaluate_dataset = VCSELDataset(evaluate_data)

In [None]:
from src.data.utilities import log_denormalize
def _quality_factors(dataset):
    """Return log_denormalize applied to the single output token of every sample."""
    return [log_denormalize(sample["output_tokens"].item()) for sample in dataset]


train_quality_factor = _quality_factors(train_dataset)
validate_quality_factor = _quality_factors(validate_dataset)
evaluate_quality_factor = _quality_factors(evaluate_dataset)

# Concatenate the splits in the order train, validate, evaluate.
combined_quality_factor = [*train_quality_factor, *validate_quality_factor, *evaluate_quality_factor]

# Dump the combined list to a text file, one value per line.
with open('VCSEL_quality_factor.txt', 'w') as quality_factor_file:
    quality_factor_file.writelines(f"{value}\n" for value in combined_quality_factor)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from src.data.utilities import log_denormalize

def plot_distribution(dataset, title):
    """Plot a histogram of the quality factor for every sample in *dataset*.

    Each sample's single ``output_tokens`` value is passed through
    ``log_denormalize`` before being histogrammed.

    Args:
        dataset: Iterable of samples, each indexable with ``"output_tokens"``
            and yielding a value that supports ``.item()``.
        title: Name of the split, inserted into the figure title.
    """
    quality_factor = [log_denormalize(sample["output_tokens"].item()) for sample in dataset]

    plt.figure(figsize=(10, 5))

    plt.hist(quality_factor, bins=35, alpha=0.7)
    plt.title(f"Distribution of Quality Factor for {title} dataset")
    plt.xlabel("Q")
    plt.ylabel("Frequency")

    # Finalise and render the figure here, as the other plotting helpers in
    # this notebook do, instead of relying on the inline backend to flush
    # open figures at the end of the cell.
    plt.tight_layout()
    plt.show()


### Distribution for datasets

In [None]:
# Plot the quality-factor histogram for each split, labelled by split name.
for split_name, split_dataset in (
    ("train", train_dataset),
    ("validate", validate_dataset),
    ("test", evaluate_dataset),
):
    plot_distribution(split_dataset, split_name)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from src.data.utilities import log_denormalize

%matplotlib inline

def plot_log_distribution(dataset, title):
    """Plot a histogram of log10 of the quality factor for *dataset*.

    Each sample's single ``output_tokens`` value is passed through
    ``log_denormalize``; the base-10 logarithm of the result is histogrammed.
    The smallest and largest log10 values are printed before plotting.

    Args:
        dataset: Iterable of samples, each indexable with ``"output_tokens"``
            and yielding a value that supports ``.item()``.
        title: Name of the split, inserted into the figure title.
    """
    quality_factor = [log_denormalize(sample["output_tokens"].item()) for sample in dataset]
    log_quality_factor = np.log10(quality_factor)

    plt.figure(figsize=(10, 5))

    # Report the range of the plotted values (minimum, then maximum).
    print(min(log_quality_factor))
    print(max(log_quality_factor))

    plt.hist(log_quality_factor, bins=35, alpha=0.7)
    plt.title(f"Distribution of log of Quality Factor for {title} dataset")
    # The plotted quantity is log10(Q), not Q itself.
    plt.xlabel("log10(Q)")
    plt.ylabel("Frequency")

    # Finalise and render the figure here, as the other plotting helpers in
    # this notebook do.
    plt.tight_layout()
    plt.show()

### Distribution for log datasets

In [None]:
# Plot the log quality-factor histogram for each split, labelled by split name.
for split_name, split_dataset in (
    ("train", train_dataset),
    ("validate", validate_dataset),
    ("test", evaluate_dataset),
):
    plot_log_distribution(split_dataset, split_name)

## Plot distribution of output Threshold Material Gain

In [None]:
# Read the VCSEL configuration file.
with open("config_VCSEL.yaml", 'r') as config_file:
    config = yaml.safe_load(config_file)

# Look up each configuration section once, then flatten the values into a
# single argparse-style namespace.
runtime_cfg = config['runtime']
model_cfg = config['model']
predictions_cfg = config['predictions']
training_cfg = config['training']
scheduler_cfg = training_cfg['scheduler_params']

args = Namespace(
    # values from the 'runtime' section
    train=runtime_cfg['train'],
    predict=runtime_cfg['predict'],
    load_weights=runtime_cfg['load_weights'],
    # values from the 'data' and 'model' sections
    data_path=config['data']['path'],
    model_path=model_cfg['params_path'],
    weight_path=model_cfg['weight_path'],
    # values from the 'predictions' section
    predictions_dir=predictions_cfg['directory'],
    samples_file=predictions_cfg['samples_file'],
    # values from the 'training' section
    batch_size=training_cfg['batch_size'],
    num_epochs=training_cfg['num_epochs'],
    learning_rate=training_cfg['learning_rate'],
    use_scheduler=training_cfg['use_scheduler'],
    scheduler_factor=scheduler_cfg['factor'],
    scheduler_patience=scheduler_cfg['patience'],
    # fixed for this section of the notebook, not read from the file
    model_type="threshold_material_gain",
)

In [None]:
# Load the raw samples for the model type selected in `args`.
data = load_data(args.data_path, args.model_type)

# Two-stage split with a fixed seed: 10% is held out for evaluation, then 10%
# of the remainder is held out for validation.
split_kwargs = {"test_size": 0.1, "random_state": 42}
train_data, evaluate_data = train_test_split(data, **split_kwargs)
train_data, validate_data = train_test_split(train_data, **split_kwargs)

# Process each split (train first) and wrap it in a dataset.
process_args = (args.model_path, args.model_type)

train_data = process_data(train_data, *process_args, is_train=True)
train_dataset = VCSELDataset(train_data)

validate_data = process_data(validate_data, *process_args, is_validate=True)
validate_dataset = VCSELDataset(validate_data)

evaluate_data = process_data(evaluate_data, *process_args, is_test=True)
evaluate_dataset = VCSELDataset(evaluate_data)

In [None]:
from src.data.utilities import convert_k_to_gain

def _energy_k_gain(dataset):
    """Return (energies, k values, gains) for every sample of *dataset*.

    Energies are output token 0 multiplied by 10, k values are output token 1
    as stored, and each gain is the sign-flipped result of
    ``convert_k_to_gain(energy, k)``.
    """
    energies = [sample["output_tokens"][0].item() * 10 for sample in dataset]
    k_values = [sample["output_tokens"][1].item() for sample in dataset]
    gains = [-1*convert_k_to_gain(energy, k_value) for energy, k_value in zip(energies, k_values)]
    return energies, k_values, gains


train_eigen_energy_1, train_k, train_gain = _energy_k_gain(train_dataset)
validate_eigen_energy_1, validate_k, validate_gain = _energy_k_gain(validate_dataset)
evaluate_eigen_energy_1, evaluate_k, evaluate_gain = _energy_k_gain(evaluate_dataset)

# Concatenate the splits in the order train, validate, evaluate.
combined_eigen_energy_1 = [*train_eigen_energy_1, *validate_eigen_energy_1, *evaluate_eigen_energy_1]
combined_gain = [*train_gain, *validate_gain, *evaluate_gain]

# Dump each combined list to a text file, one value per line.
with open('VCSEL_eigen_energy.txt', 'w') as energy_file:
    energy_file.writelines(f"{value}\n" for value in combined_eigen_energy_1)

with open('VCSEL_gain.txt', 'w') as gain_file:
    gain_file.writelines(f"{value}\n" for value in combined_gain)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

def plot_distribution(dataset, title):
    """Show side-by-side histograms of energy and k for *dataset*.

    Left panel (20 bins): output token 0 multiplied by 10 (axis "Energy, eV").
    Right panel (50 bins): output token 1 as stored (axis "k"); its title
    includes *title*, the name of the split.
    """
    energies = [sample["output_tokens"][0].item() * 10 for sample in dataset]
    k_values = [sample["output_tokens"][1].item() for sample in dataset]

    # (values, bin count, panel title, x-axis label) for each panel.
    panels = (
        (energies, 20, "Distribution of Energy", "Energy, eV"),
        (k_values, 50, f"Distribution of Threshold Material Gain for {title} dataset", "k"),
    )

    plt.figure(figsize=(10, 5))

    for position, (values, bin_count, heading, x_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.hist(values, bins=bin_count, alpha=0.7)
        plt.title(heading)
        plt.xlabel(x_label)
        plt.ylabel("Frequency")

    # Optional zoom on the k panel, currently disabled: plt.xlim(0, 0.015)
    plt.tight_layout()
    plt.show()


### Distribution for datasets

In [None]:
# Plot the energy and k histograms for each split, labelled by split name.
for split_name, split_dataset in (
    ("train", train_dataset),
    ("validate", validate_dataset),
    ("test", evaluate_dataset),
):
    plot_distribution(split_dataset, split_name)

### Distribution for datasets for gain in cm^-1

In [None]:
import matplotlib.pyplot as plt
from src.data.utilities import convert_k_to_gain
%matplotlib inline


def plot_distribution_gain_case(dataset, title):
    """Draw histograms of the energy token and of the gain derived from k.

    The left panel shows token 0 multiplied by 10 (axis labelled "Energy, eV").
    The right panel shows ``-1 * convert_k_to_gain(energy, k)`` for every
    sample, where ``k`` is token 1 (axis labelled "Gain, $cm^{-1}$").

    Args:
        dataset: Iterable of samples; each sample is indexed with
            ``"output_tokens"`` and its first two entries must support ``.item()``.
        title: Dataset name interpolated into the right-hand panel title.
    """
    energies = []
    gains = []
    for sample in dataset:
        tokens = sample["output_tokens"]
        energy_value = tokens[0].item() * 10
        k_value = tokens[1].item()
        energies.append(energy_value)
        # The sign of the converted value is flipped before plotting.
        gains.append(-1 * convert_k_to_gain(energy_value, k_value))

    # (values, number of bins, panel title, x-axis label) for each panel, left to right.
    panels = (
        (energies, 20, "Distribution of Energy", "Energy, eV"),
        (gains, 50, f"Distribution of Threshold Material Gain for {title} dataset", "Gain, $cm^{-1}$"),
    )

    plt.figure(figsize=(10, 5))
    for position, (values, n_bins, panel_title, x_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.hist(values, bins=n_bins, alpha=0.7)
        plt.title(panel_title)
        plt.xlabel(x_label)
        plt.ylabel("Frequency")
    plt.tight_layout()
    plt.show()



### Distribution for datasets

In [None]:
# Plot the gain distributions for the train, validation and test splits in turn.
for split_name, split_dataset in (
    ("train", train_dataset),
    ("validate", validate_dataset),
    ("test", evaluate_dataset),
):
    plot_distribution_gain_case(split_dataset, split_name)

# ALL

## Plot distribution of output energies

In [None]:
# Read the YAML configuration for this section.
with open("config.yaml", 'r') as file:
    config = yaml.safe_load(file)

# Alias each configuration section once so the Namespace below reads flat.
runtime_cfg = config['runtime']
data_cfg = config['data']
model_cfg = config['model']
predictions_cfg = config['predictions']
training_cfg = config['training']
scheduler_cfg = training_cfg['scheduler_params']

# model_type is fixed for this section instead of being read from the config.
args = Namespace(
    train=runtime_cfg['train'],
    predict=runtime_cfg['predict'],
    load_weights=runtime_cfg['load_weights'],
    data_path=data_cfg['path'],
    model_path=model_cfg['params_path'],
    weight_path=model_cfg['weight_path'],
    predictions_dir=predictions_cfg['directory'],
    samples_file=predictions_cfg['samples_file'],
    batch_size=training_cfg['batch_size'],
    num_epochs=training_cfg['num_epochs'],
    learning_rate=training_cfg['learning_rate'],
    use_scheduler=training_cfg['use_scheduler'],
    scheduler_factor=scheduler_cfg['factor'],
    scheduler_patience=scheduler_cfg['patience'],
    model_type="eigen_energy",
)

In [None]:
# Load the records for the model type selected in args.
data = load_data(args.data_path, args.model_type)
# Hold out 10% of the data for evaluation, then 10% of the remainder for
# validation (about 81% / 9% / 10% overall); the fixed random_state keeps
# both splits reproducible.
train_data, evaluate_data = train_test_split(data, test_size=0.1, random_state=42)
train_data, validate_data = train_test_split(train_data, test_size=0.1, random_state=42)

# NOTE(review): the training split is processed first with is_train=True;
# presumably process_data stores preprocessing parameters under
# args.model_path that the validate/test calls reuse. Confirm before
# reordering these calls.
train_data = process_data(train_data, args.model_path, args.model_type, is_train=True)
train_dataset = VCSELDataset(train_data)

validate_data = process_data(validate_data, args.model_path, args.model_type, is_validate=True)
validate_dataset = VCSELDataset(validate_data)

evaluate_data = process_data(evaluate_data, args.model_path, args.model_type, is_test=True)
evaluate_dataset = VCSELDataset(evaluate_data)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

def plot_distribution(dataset):
    """Draw side-by-side histograms of the two eigen-energy output tokens.

    Both tokens are multiplied by 10 before plotting; token 0 goes to the left
    panel ("EIGEN_ENERGY_1") and token 1 to the right panel ("EIGEN_ENERGY_2").

    Args:
        dataset: Iterable of samples; each sample is indexed with
            ``"output_tokens"`` and its first two entries must support ``.item()``.
    """
    first_energies = []
    second_energies = []
    for sample in dataset:
        tokens = sample["output_tokens"]
        first_energies.append(tokens[0].item() * 10)
        second_energies.append(tokens[1].item() * 10)

    # (values, panel title) for each panel, left to right.
    panels = (
        (first_energies, "Distribution of EIGEN_ENERGY_1"),
        (second_energies, "Distribution of EIGEN_ENERGY_2"),
    )

    plt.figure(figsize=(10, 5))
    for position, (values, panel_title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.hist(values, bins=20, alpha=0.7)
        plt.title(panel_title)
        plt.xlabel("Energy, eV")
        plt.ylabel("Frequency")

    plt.tight_layout()
    plt.show()

### Distribution for Datasets

In [None]:
# Plot the train, validation and test splits in turn, one figure per split.
for split_dataset in (train_dataset, validate_dataset, evaluate_dataset):
    plot_distribution(split_dataset)

## Plot distribution of output Quality Factor

In [None]:
# Read the YAML configuration for this section.
with open("config.yaml", 'r') as file:
    config = yaml.safe_load(file)

# Alias each configuration section once so the Namespace below reads flat.
runtime_cfg = config['runtime']
data_cfg = config['data']
model_cfg = config['model']
predictions_cfg = config['predictions']
training_cfg = config['training']
scheduler_cfg = training_cfg['scheduler_params']

# model_type is fixed for this section instead of being read from the config.
args = Namespace(
    train=runtime_cfg['train'],
    predict=runtime_cfg['predict'],
    load_weights=runtime_cfg['load_weights'],
    data_path=data_cfg['path'],
    model_path=model_cfg['params_path'],
    weight_path=model_cfg['weight_path'],
    predictions_dir=predictions_cfg['directory'],
    samples_file=predictions_cfg['samples_file'],
    batch_size=training_cfg['batch_size'],
    num_epochs=training_cfg['num_epochs'],
    learning_rate=training_cfg['learning_rate'],
    use_scheduler=training_cfg['use_scheduler'],
    scheduler_factor=scheduler_cfg['factor'],
    scheduler_patience=scheduler_cfg['patience'],
    model_type="quality_factor",
)

In [None]:
# Load the records for the model type selected in args.
data = load_data(args.data_path, args.model_type)
# Hold out 10% of the data for evaluation, then 10% of the remainder for
# validation (about 81% / 9% / 10% overall); the fixed random_state keeps
# both splits reproducible.
train_data, evaluate_data = train_test_split(data, test_size=0.1, random_state=42)
train_data, validate_data = train_test_split(train_data, test_size=0.1, random_state=42)

# NOTE(review): the training split is processed first with is_train=True;
# presumably process_data stores preprocessing parameters under
# args.model_path that the validate/test calls reuse. Confirm before
# reordering these calls.
train_data = process_data(train_data, args.model_path, args.model_type, is_train=True)
train_dataset = VCSELDataset(train_data)

validate_data = process_data(validate_data, args.model_path, args.model_type, is_validate=True)
validate_dataset = VCSELDataset(validate_data)

evaluate_data = process_data(evaluate_data, args.model_path, args.model_type, is_test=True)
evaluate_dataset = VCSELDataset(evaluate_data)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from src.data.utilities import log_denormalize

def plot_distribution(dataset, title):
    """Plot a histogram of the denormalized quality factor for ``dataset``.

    Args:
        dataset: Iterable of samples; each sample's ``"output_tokens"`` entry
            must support ``.item()`` and is passed through ``log_denormalize``.
        title: Dataset name interpolated into the figure title.
    """
    quality_factor = [log_denormalize(sample["output_tokens"].item()) for sample in dataset]

    plt.figure(figsize=(10, 5))
    plt.hist(quality_factor, bins=35, alpha=0.7)
    plt.title(f"Distribution of Quality Factor for {title} dataset")
    plt.xlabel("Q")
    plt.ylabel("Frequency")
    # Finish the figure explicitly, like the other plotting helpers in this
    # notebook, so each call renders its own histogram instead of relying on
    # the implicit flush at the end of the cell.
    plt.tight_layout()
    plt.show()


### Distribution for datasets

In [None]:
# Plot the train, validation and test splits in turn, one figure per split.
for split_name, split_dataset in (
    ("train", train_dataset),
    ("validate", validate_dataset),
    ("test", evaluate_dataset),
):
    plot_distribution(split_dataset, split_name)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from src.data.utilities import log_denormalize

%matplotlib inline

def plot_log_distribution(dataset, title):
    """Plot a histogram of log10 of the denormalized quality factor.

    The smallest and largest log10 values are printed before the histogram is
    drawn.

    Args:
        dataset: Iterable of samples; each sample's ``"output_tokens"`` entry
            must support ``.item()`` and is passed through ``log_denormalize``.
        title: Dataset name interpolated into the figure title.
    """
    quality_factor = [log_denormalize(sample["output_tokens"].item()) for sample in dataset]
    log_quality_factor = np.log10(quality_factor)

    # Report the range of the plotted values.
    print(min(log_quality_factor))
    print(max(log_quality_factor))

    plt.figure(figsize=(10, 5))
    plt.hist(log_quality_factor, bins=35, alpha=0.7)
    plt.title(f"Distribution of log of Quality Factor for {title} dataset")
    # The plotted values are log10(Q), not Q itself, so label the axis accordingly.
    plt.xlabel(r"$\log_{10}(Q)$")
    plt.ylabel("Frequency")
    # Finish the figure explicitly, like the other plotting helpers in this
    # notebook, so each call renders its own histogram.
    plt.tight_layout()
    plt.show()

### Distribution of log10 of Quality Factor for datasets

In [None]:
# Plot the log-scaled distributions for the train, validation and test splits in turn.
for split_name, split_dataset in (
    ("train", train_dataset),
    ("validate", validate_dataset),
    ("test", evaluate_dataset),
):
    plot_log_distribution(split_dataset, split_name)

## Plot distribution of output Threshold Material Gain

In [None]:
# Read the YAML configuration for this section.
with open("config_ALL.yaml", 'r') as file:
    config = yaml.safe_load(file)

# Alias each configuration section once so the Namespace below reads flat.
runtime_cfg = config['runtime']
data_cfg = config['data']
model_cfg = config['model']
predictions_cfg = config['predictions']
training_cfg = config['training']
scheduler_cfg = training_cfg['scheduler_params']

# model_type is fixed for this section instead of being read from the config.
args = Namespace(
    train=runtime_cfg['train'],
    predict=runtime_cfg['predict'],
    load_weights=runtime_cfg['load_weights'],
    data_path=data_cfg['path'],
    model_path=model_cfg['params_path'],
    weight_path=model_cfg['weight_path'],
    predictions_dir=predictions_cfg['directory'],
    samples_file=predictions_cfg['samples_file'],
    batch_size=training_cfg['batch_size'],
    num_epochs=training_cfg['num_epochs'],
    learning_rate=training_cfg['learning_rate'],
    use_scheduler=training_cfg['use_scheduler'],
    scheduler_factor=scheduler_cfg['factor'],
    scheduler_patience=scheduler_cfg['patience'],
    model_type="threshold_material_gain",
)

In [None]:
# Load the records for the model type selected in args.
data = load_data(args.data_path, args.model_type)
# Hold out 10% of the data for evaluation, then 10% of the remainder for
# validation (about 81% / 9% / 10% overall); the fixed random_state keeps
# both splits reproducible.
train_data, evaluate_data = train_test_split(data, test_size=0.1, random_state=42)
train_data, validate_data = train_test_split(train_data, test_size=0.1, random_state=42)

# NOTE(review): the training split is processed first with is_train=True;
# presumably process_data stores preprocessing parameters under
# args.model_path that the validate/test calls reuse. Confirm before
# reordering these calls.
train_data = process_data(train_data, args.model_path, args.model_type, is_train=True)
train_dataset = VCSELDataset(train_data)

validate_data = process_data(validate_data, args.model_path, args.model_type, is_validate=True)
validate_dataset = VCSELDataset(validate_data)

evaluate_data = process_data(evaluate_data, args.model_path, args.model_type, is_test=True)
evaluate_dataset = VCSELDataset(evaluate_data)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

def plot_distribution(dataset, title):
    """Draw side-by-side histograms of the first two output tokens of ``dataset``.

    The left panel shows token 0 multiplied by 10 (axis labelled "Energy, eV");
    the right panel shows token 1 unchanged (axis labelled "k").

    Args:
        dataset: Iterable of samples; each sample is indexed with
            ``"output_tokens"`` and its first two entries must support ``.item()``.
        title: Dataset name interpolated into the right-hand panel title.
    """
    energies = []
    k_values = []
    for sample in dataset:
        tokens = sample["output_tokens"]
        energies.append(tokens[0].item() * 10)
        k_values.append(tokens[1].item())

    # (values, panel title, x-axis label) for each panel, left to right.
    panels = (
        (energies, "Distribution of Energy", "Energy, eV"),
        (k_values, f"Distribution of Threshold Material Gain for {title} dataset", "k"),
    )

    plt.figure(figsize=(10, 5))
    for position, (values, panel_title, x_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.hist(values, bins=20, alpha=0.7)
        plt.title(panel_title)
        plt.xlabel(x_label)
        plt.ylabel("Frequency")
    plt.tight_layout()
    plt.show()


### Distribution for datasets

In [None]:
# Plot the train, validation and test splits in turn, one figure per split.
for split_name, split_dataset in (
    ("train", train_dataset),
    ("validate", validate_dataset),
    ("test", evaluate_dataset),
):
    plot_distribution(split_dataset, split_name)

### Distribution for datasets for gain in cm^-1

In [None]:
import matplotlib.pyplot as plt
from src.data.utilities import convert_k_to_gain
%matplotlib inline


def plot_distribution_gain_case(dataset, title):
    """Draw histograms of the energy token and of the gain derived from k.

    The left panel shows token 0 multiplied by 10 (axis labelled "Energy, eV").
    The right panel shows ``-1 * convert_k_to_gain(energy, k)`` for every
    sample, where ``k`` is token 1 (axis labelled "Gain, $cm^{-1}$").

    Args:
        dataset: Iterable of samples; each sample is indexed with
            ``"output_tokens"`` and its first two entries must support ``.item()``.
        title: Dataset name interpolated into the right-hand panel title.
    """
    energies = []
    gains = []
    for sample in dataset:
        tokens = sample["output_tokens"]
        energy_value = tokens[0].item() * 10
        k_value = tokens[1].item()
        energies.append(energy_value)
        # The sign of the converted value is flipped before plotting.
        gains.append(-1 * convert_k_to_gain(energy_value, k_value))

    # (values, panel title, x-axis label) for each panel, left to right.
    panels = (
        (energies, "Distribution of Energy", "Energy, eV"),
        (gains, f"Distribution of Threshold Material Gain for {title} dataset", "Gain, $cm^{-1}$"),
    )

    plt.figure(figsize=(10, 5))
    for position, (values, panel_title, x_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.hist(values, bins=20, alpha=0.7)
        plt.title(panel_title)
        plt.xlabel(x_label)
        plt.ylabel("Frequency")
    plt.tight_layout()
    plt.show()



### Distribution for datasets

In [None]:
# Plot the gain distributions for the train, validation and test splits in turn.
for split_name, split_dataset in (
    ("train", train_dataset),
    ("validate", validate_dataset),
    ("test", evaluate_dataset),
):
    plot_distribution_gain_case(split_dataset, split_name)