# Flue gas data generation for the AsPyCC implementation #

## 0. Importing libraries ##

In [None]:
# Importing required libraries
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from SALib.sample import saltelli

# Mute Deprecation warnings (optional)
#import warnings
#warnings.filterwarnings('ignore', category=DeprecationWarning)

## 1. Applying Sobol Sequence to generate samples ##

In [None]:
# Load the flue gas database.
# The excel file is provided in the repository so the structure can be checked.
# For more information read the README file.
df = pd.read_excel('')

# Species whose '<name>_min' / '<name>_max' columns are looked up in every row.
# H2O is listed last, so when present it is always the last collected component.
species_columns = ['N2', 'O2', 'H2', 'CO', 'CO2', 'CH4', 'H2O']

# One entry per database row: [industry, component names, (min, max) ranges]
data_for_sampling = []

for _, record in df.iterrows():
    # Keep only the species whose range is not identically (0, 0)
    active = [
        (name, (record[f'{name}_min'], record[f'{name}_max']))
        for name in species_columns
        if record[f'{name}_min'] != 0 or record[f'{name}_max'] != 0
    ]
    components = [name for name, _ in active]
    ranges = [bounds for _, bounds in active]

    data_for_sampling.append([record['Industry'], components, ranges])
    
# Define Sobol sampling in a function that ensures the flue gas composition sums to 100%
def Sobol_sampling(problem_data):
    """Generate Saltelli/Sobol composition samples whose rows close to 100.

    Parameters
    ----------
    problem_data : sequence
        Only indices 1 and 2 are read: ``problem_data[1]`` holds the
        component names and ``problem_data[2]`` the matching ``(min, max)``
        bounds. When ``'H2O'`` is among the names, the LAST name/bound pair
        is treated as the H2O entry.

    Returns
    -------
    numpy.ndarray
        Transposed sample matrix: one row per component, one column per
        sample.

    Notes
    -----
    * Without H2O, every sampled row is rescaled so that it sums to 100.
    * With H2O, all other components are sampled, H2O is computed as
      ``100 - sum(others)`` and rows whose H2O falls outside its own bounds
      are discarded, so the number of returned samples can vary.
    """
    names = problem_data[1]
    bounds = problem_data[2]
    base_samples = 1024  # Default and recommended value from the SALib library

    if 'H2O' not in names:
        # All components are sampled directly
        problem = {
            'num_vars': len(names),
            'names': names,
            'bounds': bounds,
        }
        samples = saltelli.sample(problem, base_samples)

        # Normalise each sample (row) so that the composition sums to 100%
        row_totals = samples.sum(axis=1, keepdims=True)
        return (samples / row_totals * 100).T

    # H2O present: sample every component except the last one (H2O)
    dry_names = names[:-1]
    dry_problem = {
        'num_vars': len(dry_names),
        'names': dry_names,
        'bounds': bounds[:-1],
    }
    dry_samples = saltelli.sample(dry_problem, base_samples)

    # H2O closes the balance to 100%
    water = 100 - dry_samples.sum(axis=1)

    # Keep only the samples whose H2O content lies inside the H2O range
    water_min = bounds[-1][0]
    water_max = bounds[-1][-1]
    keep = (water >= water_min) & (water <= water_max)

    # Append H2O as the last column, then transpose to (components, samples)
    kept_water = water[keep][:, np.newaxis]
    return np.hstack([dry_samples[keep], kept_water]).T

# For every industry keep its tag, its component names and the generated
# samples, stored as [industry, components, samples]
final_sampling = [
    [data[0], data[1], Sobol_sampling(data)]
    for data in data_for_sampling
]

## 2. Allocate the respective flowrates ##

In [None]:
# Prepare one DataFrame per industry with (at most) 1000 samples and their
# flow rates. Each industry is appended to final_df_list exactly once.
final_df_list = []

for data in final_sampling:
    industry, components, samples = data

    # samples is laid out as (components, samples): columns are individual samples
    num_samples = samples.shape[1]

    if num_samples < 1000:
        # Fewer than 1000 samples available: keep all of them
        selected = samples
    else:
        # Randomly pick 1000 distinct samples (unseeded, so the selection
        # differs between runs)
        random_indices = np.random.choice(num_samples, size=1000, replace=False)
        selected = samples[:, random_indices]

    # Evenly spaced flow rates between 100 and 500, one per selected sample
    flow_rates = np.linspace(100, 500, selected.shape[1])

    # Create DataFrame: one row per sample, one column per component
    df = pd.DataFrame(selected.T, columns=components)
    df['Flowrate (t/h)'] = flow_rates
    df.insert(0, 'Industry', industry)  # Add industry as the first column

    # Append to the final list (once per industry)
    final_df_list.append(df)

# Stack all industry DataFrames, force a common column order and replace the
# gaps left by species an industry does not contain with 0
output_columns = ['Industry', 'Flowrate (t/h)', 'N2', 'O2', 'CO2', 'H2O', 'H2', 'CO', 'CH4']
final_df = (
    pd.concat(final_df_list, ignore_index=True)
    .reindex(columns=output_columns)
    .fillna(0)
)

# Save to a file
final_df.to_csv('flue_gas_composition_with_flowrates.csv', index=False)