
# Hyperspectral Data Synthesis using CTGAN and TVAE
This notebook demonstrates the process of using CTGAN and TVAE synthesizers to generate synthetic hyperspectral data.
The data includes spectral features along with the turbidity target column.


In [None]:

import pandas as pd

# Load the real dataset into `df_turb` before running the cells below.
# `df_turb` is a DataFrame holding the spectral feature columns plus the turbidity target column.
# df_turb = ...


# Model Initialisation & Generation



In [None]:
from sdv.single_table import TVAESynthesizer, CTGANSynthesizer
from sdv.metadata import SingleTableMetadata

# Detect the single-table metadata directly from the real data.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df_turb)

# Plain-dict copy of the detected metadata, kept for inspection.
python_dict = metadata.to_dict()

# Write a summarized diagram of the detected metadata to disk.
metadata.visualize(
    show_table_details='summarized',
    output_filepath='my_metadata.png'
)


# TVAESynthesizer Initialization
t_vae_synthesizer = TVAESynthesizer(
    metadata=metadata,
    epochs=1200,
    enforce_min_max_values=True,
    enforce_rounding=False,
)

# CTGANSynthesizer Initialization
# CTGAN requires batch_size to be even AND divisible by `pac` (default 10).
# batch_size=12 with the default pac fails inside the discriminator during
# fit, so pac is set explicitly to a divisor of the batch size.
ctgan_synthesizer = CTGANSynthesizer(
    metadata=metadata,
    epochs=900,
    enforce_rounding=False,
    verbose=True,
    embedding_dim=100,
    batch_size=12,
    pac=4,
)


In [None]:

# Fit both synthesizers on the same real dataset: TVAE first, then CTGAN.
for synthesizer in (t_vae_synthesizer, ctgan_synthesizer):
    synthesizer.fit(df_turb)


In [None]:

# Generating synthetic data using TVAE Synthesizer
synthetic_data_tvae = t_vae_synthesizer.sample(num_rows=3000)

# Generating synthetic data using CTGAN Synthesizer
synthetic_data_ctgan = ctgan_synthesizer.sample(num_rows=3000)

# Saving the generated data.
# The sampled tables are written with DataFrame.to_csv (DataFrames have no
# `save_csv` method). index=False keeps the row index out of the file so it
# is not read back later as an extra "Unnamed: 0" feature column.
synthetic_data_tvae.to_csv('synthetic_data_tvae.csv', index=False)
synthetic_data_ctgan.to_csv('synthetic_data_ctgan.csv', index=False)

# Visualising Synthetic Data

In [None]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the real data (df_turb) and the synthetic data (e.g. synthetic_data_tvae)
# from their CSV exports.
# NOTE(review): nothing visible in this notebook writes these two files —
# confirm they are exported to /content beforehand.
real_df = pd.read_csv('/content/real_df.csv')
synthetic_df = pd.read_csv('/content/synthetic_df.csv')

# Mean of every spectral feature across all samples; the 'tur' target column
# is excluded so only spectral channels remain.
merged_mean_spectrum = real_df.drop(columns='tur').mean()
# Align the synthetic means to the real spectrum's channel labels so that each
# value is paired with the correct wavelength even if the two files list their
# columns in a different order.
synthetic_mean_spectrum = (
    synthetic_df.drop(columns='tur').mean().reindex(merged_mean_spectrum.index)
)

# Create a DataFrame for the mean spectrum (one row per spectral channel)
mean_spectra_df = pd.DataFrame({
    'Wavelength': merged_mean_spectrum.index,
    'Real_Mean_Pixel': merged_mean_spectrum.values,
    'Synthetic_Mean_Pixel': synthetic_mean_spectrum.values
})

# Melt the DataFrame to long format for Seaborn
long_df = mean_spectra_df.melt(
    id_vars='Wavelength',
    var_name='Data_Type',
    value_name='Mean_Pixel_Value'
)

# Replace the internal column names with short legend labels
long_df['Data_Type'] = long_df['Data_Type'].replace({
    'Real_Mean_Pixel': 'Real',
    'Synthetic_Mean_Pixel': 'Synthetic'
})

# Set the aesthetic style of the plots
sns.set_style('white')
sns.set_context('talk', font_scale=1.2)  # This increases the font size

# Wide figure (32 x 8 inches) so every spectral channel label fits on the x-axis
fig, ax = plt.subplots(figsize=(32, 8))
sns.lineplot(
    data=long_df,
    x='Wavelength',
    y='Mean_Pixel_Value',
    hue='Data_Type',
    palette='tab10',
    ax=ax
)

# Titles, axis labels and legend
ax.set_title('Comparison of Continuous Spectra: Real vs. Synthetic Data', fontsize=20)
ax.set_xlabel('Wavelength / Spectral Channel', fontsize=18)
ax.set_ylabel('Mean Pixel Intensity', fontsize=18)
ax.legend(title='Data Type', fontsize=16)

# Rotate the x-axis labels by 90 degrees to accommodate all labels
ax.tick_params(axis='x', labelrotation=90, labelsize=12)

# Remove the top and right axes spines, which are not needed
sns.despine(ax=ax)

# Adjust the layout to fit everything, then show the plot
fig.tight_layout()
plt.show()


In [None]:
from sdv.evaluation.single_table import get_column_plot

# Compare the real and synthetic distributions of one spectral column.
# Uses the names defined earlier in this notebook (df_turb, synthetic_data_tvae);
# pass synthetic_data_ctgan instead to inspect the CTGAN output.
fig = get_column_plot(
    real_data=df_turb,
    synthetic_data=synthetic_data_tvae,
    column_name='490',  # provide the feature or column name of dataset
    metadata=metadata
)

fig.show()

# Evaluating Quality Of Spectral Data

In [None]:

from sdv.evaluation.single_table import evaluate_quality

# Evaluate quality of the synthetic data.
# evaluate_quality needs the table metadata in addition to the two datasets.
quality_report_tvae = evaluate_quality(
    real_data=df_turb,
    synthetic_data=synthetic_data_tvae,
    metadata=metadata
)
quality_report_ctgan = evaluate_quality(
    real_data=df_turb,
    synthetic_data=synthetic_data_ctgan,
    metadata=metadata
)

# Save the fitted synthesizers so they can be reloaded without retraining
t_vae_synthesizer.save(filepath='t_vae_synthesizer.pkl')
ctgan_synthesizer.save(filepath='ctgan_synthesizer.pkl')



# Conclusion
This notebook illustrated the steps for synthesizing hyperspectral data using CTGAN and TVAE models.
The generated synthetic data and quality reports can be used for further analysis and model development.
