<a href="https://colab.research.google.com/github/AnastasiiaVoll/-Geopolitics-of-Renewable-Energy-time-varying-interactions-between-geopolitical-risk-and-renewable/blob/main/Project1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#...............................................................................
#1 Preprocessing
# libraries
!pip install pandas tensorflow scikit-learn

# General imports
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.animation import FuncAnimation
import imageio

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Keras imports for the model
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Dropout, LeakyReLU, BatchNormalization, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import RandomNormal

# Uploading the data: read the twelve monthly files (pub0120.csv ... pub1220.csv)
# and stack them into one frame with a fresh 0..n-1 index.
file_names = ['pub{:02}20.csv'.format(month) for month in range(1, 13)]
dataframes = [pd.read_csv(path) for path in file_names]
data = pd.concat(dataframes, ignore_index=True)

# First look at the raw data: leading rows, column dtypes, numeric summary.
for overview in (data.head(), data.dtypes, data.describe()):
    print(overview)

# Frequency table for every object-dtype (text) column.
text_columns = data.select_dtypes(include=['object']).columns
for column in text_columns:
    print(data[column].value_counts())

# Columns that are not needed for the rest of the analysis.
data = data.drop(columns=['FINALWT', 'REC_NUM'])

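# Code-to-label mappings for each categorical variable. The 'blank' keys only document the "Not applicable" category: blank cells are read as NaN, so they never match a key and are labelled by the fillna() call where the mappings are applied.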
mappings = {'LFSSTAT': {1: 'Employed, at work', 2: 'Employed, absent from work', 3: 'Unemployed', 4: 'Not in labour force'},
    'PROV': {10: 'Newfoundland and Labrador', 11: 'Prince Edward Island', 12: 'Nova Scotia', 13: 'New Brunswick',
             24: 'Quebec', 35: 'Ontario', 46: 'Manitoba', 47: 'Saskatchewan', 48: 'Alberta', 59: 'British Columbia'},
    'CMA': {1: 'Quebec', 2: 'Montreal', 3: 'Ottawa-Gatineau (Ontario part)', 4: 'Toronto', 5: 'Hamilton',
            6: 'Winnipeg', 7: 'Calgary', 8: 'Edmonton', 9: 'Vancouver', 0: 'Other CMA or non-CMA'},
    'AGE_12': {1: '15 to 19 years', 2: '20 to 24 years', 3: '25 to 29 years', 4: '30 to 34 years', 5: '35 to 39 years',
               6: '40 to 44 years', 7: '45 to 49 years', 8: '50 to 54 years', 9: '55 to 59 years', 10: '60 to 64 years',
               11: '65 to 69 years', 12: '70 and over'},
    'AGE_6': {1: '15 to 16 years', 2: '17 to 19 years', 3: '20 to 21 years', 4: '22 to 24 years', 5: '25 to 26 years',
              6: '27 to 29 years', 'blank': 'Not applicable'},
    'SEX': {1: 'Male', 2: 'Female'},
    'MARSTAT': {1: 'Married', 2: 'Living in common-law', 3: 'Widowed', 4: 'Separated', 5: 'Divorced', 6: 'Single, never married'},
    'EDUC': {0: '0 to 8 years', 1: 'Some high school', 2: 'High school graduate', 3: 'Some postsecondary',
             4: 'Postsecondary certificate or diploma', 5: "Bachelor's degree", 6: "Above bachelor's degree"},
    'MJH': {1: 'Single jobholder, including job changers', 2: 'Multiple jobholder', 'blank': 'Not applicable'},
    'EVERWORK': {1: 'Yes, within last year', 2: 'Yes, more than 1 year ago', 3: 'No, never worked', 'blank': 'Not applicable'},
    'FTPTLAST': {1: 'Full-time (30 hours or more)', 2: 'Part-time (1 to 29 hours)', 'blank': 'Not applicable'},
    'COWMAIN': {1: 'Public sector employees', 2: 'Private sector employees', 3: 'Self-employed incorporated, with paid help',
                4: 'Self-employed incorporated, no paid help', 5: 'Self-employed unincorporated, with paid help',
                6: 'Self-employed unincorporated, no paid help', 7: 'Unpaid family worker', 'blank': 'Not applicable'},
    'IMMIG': {1: 'Immigrant, landed 10 or less years earlier', 2: 'Immigrant, landed more than 10 years earlier', 3: 'Non-immigrant'},
    'NAICS_21': {1: 'Agriculture', 2: 'Forestry and logging and support activities for forestry', 3: 'Fishing, hunting and trapping',
                 4: 'Mining, quarrying, and oil and gas extraction', 5: 'Utilities', 6: 'Construction', 7: 'Manufacturing - durable goods',
                 8: 'Manufacturing - non-durable goods', 9: 'Wholesale trade', 10: 'Retail trade', 11: 'Transportation and warehousing',
                 12: 'Finance and insurance', 13: 'Real estate and rental and leasing', 14: 'Professional, scientific and technical services',
                 15: 'Business, building and other support services', 16: 'Educational services', 17: 'Health care and social assistance',
                 18: 'Information, culture and recreation', 19: 'Accommodation and food services',
                 20: 'Other services (except public administration)', 21: 'Public administration', 'blank': 'Not applicable'},
    'NOC_10': { 1: 'Management occupations',
        2: 'Business, finance and administration occupations, except management',
        3: 'Natural and applied sciences and related occupations, except management',
        4: 'Health occupations, except management',
        5: 'Occupations in education, law and social, community and government services, except management',
        6: 'Occupations in art, culture, recreation and sport, except management',
        7: 'Sales and service occupations, except management',
        8: 'Trades, transport and equipment operators and related occupations, except management',
        9: 'Natural resources, agriculture and related production occupations, except management',
        10: 'Occupations in manufacturing and utilities, except management',
        'blank': 'Not applicable'},
    'NOC_43': {
        1: 'Legislative and senior management occupations',
        2: 'Specialized middle management occupations',
        3: 'Middle management occupations in retail and wholesale trade and customer services',
        4: 'Middle management occupations in trades, transportation, production and utilities',
        5: 'Professional occupations in finance',
        6: 'Professional occupations in business',
        7: 'Administrative and financial supervisors and specialized administrative occupations',
        8: 'Administrative occupations and transportation logistics occupations',
        9: 'Administrative and financial support and supply chain logistics occupations',
        10: 'Professional occupations in natural sciences',
        11: 'Professional occupations in applied sciences (except engineering)',
        12: 'Professional occupations in engineering',
        13: 'Technical occupations related to natural and applied sciences',
        14: 'Health treating and consultation services professionals',
        15: 'Therapy and assessment professionals',
        16: 'Nursing and allied health professionals',
        17: 'Technical occupations in health',
        18: 'Assisting occupations in support of health services',
        19: 'Professional occupations in law',
        20: 'Professional occupations in education services',
        21: 'Professional occupations in social and community services',
        22: 'Professional occupations in government services',
        23: 'Occupations in front-line public protection services',
        24: 'Paraprofessional occupations in legal, social, community and education services',
        25: 'Assisting occupations in education and in legal and public protection',
        26: 'Care providers and public protection support occupations and student monitors, crossing guards and related occupations',
        27: 'Professional occupations in art and culture',
        28: 'Technical occupations in art, culture and sport',
        29: 'Occupations in art, culture and sport',
        30: 'Support occupations in art, culture and sport',
        31: 'Retail sales and service supervisors and specialized occupations in sales and services',
        32: 'Occupations in sales and services',
        33: 'Sales and service representatives and other customer and personal services occupations',
        34: 'Sales and service support occupations',
        35: 'Technical trades and transportation officers and controllers',
        36: 'General trades',
        37: 'Mail and message distribution, other transport equipment operators and related maintenance workers',
        38: 'Helpers and labourers and other transport drivers, operators and labourers',
        39: 'Supervisors and occupations in natural resources, agriculture and related production',
        40: 'Workers and labourers in natural resources, agriculture and related production',
        41: 'Supervisors, central control and process operators in processing, manufacturing and utilities and aircraft assemblers and inspectors',
        42: 'Machine operators, assemblers and inspectors in processing, manufacturing and printing',
        43: 'Labourers in processing, manufacturing and utilities',
        'blank': 'Not applicable'},
     'YABSENT': {
        0: 'Other reasons',
        1: 'Own illness or disability',
        2: 'Personal or family responsibilities',
        3: 'Vacation',
        'blank': 'Not applicable'
    },
    'PAYAWAY': {
        1: 'Yes',
        2: 'No',
        'blank': 'Not applicable'
    },
    'FTPTMAIN': {
        1: 'Full-time',
        2: 'Part-time',
        'blank': 'Not applicable'
    },
    'YAWAY': {
        0: 'Other reasons',
        1: 'Own illness or disability',
        2: 'Personal or family responsibilities',
        3: 'Vacation or civic holiday',
        4: 'Working short-time',
        'blank': 'Not applicable'
    },
    'WHYPT': {
        0: 'Other reasons',
        1: 'Own illness or disability',
        2: 'Caring for children',
        3: 'Other personal or family responsibilities',
        4: 'Going to school',
        5: 'Personal preference',
        6: 'Business conditions or could not find full-time work, looked for full-time work in the last month',
        7: 'Business conditions or could not find full-time work, did not look for full-time work in the last month',
        'blank': 'Not applicable'
    },
    'UNION': {
        1: 'Union member',
        2: 'Not a member but covered by a union contract or collective agreement',
        3: 'Non-unionized',
        'blank': 'Not applicable'
    },


    'PERMTEMP': {
        1: 'Permanent',
        2: 'Temporary, seasonal job',
        3: 'Temporary, term or contract job',
        4: 'Temporary, casual or other temporary jobs',
        'blank': 'Not applicable'
    },
    'ESTSIZE': {
        1: 'Less than 20 employees',
        2: '20 to 99 employees',
        3: '100 to 500 employees',
        4: 'More than 500 employees',
        'blank': 'Not applicable'
    },
    'FIRMSIZE': {
        1: 'Less than 20 employees',
        2: '20 to 99 employees',
        3: '100 to 500 employees',
        4: 'More than 500 employees',
        'blank': 'Not applicable'
    },
    'FLOWUNEM': {
        1: 'Job losers, temporary layoff',
        2: 'Job losers, permanent layoff',
        3: 'Job leavers',
        4: 'Job leavers/losers (status unknown), worked more than 1 year ago',
        5: 'New entrants',
        6: 'Re-entrants, worked 1 year ago or less',
        7: 'Re-entrants, worked more than 1 year ago',
        8: 'Future starts',
        'blank': 'Not applicable'
    },
    'UNEMFTPT': {
        1: 'Full-time',
        2: 'Part-time',
        3: 'Future starts',
        'blank': 'Not applicable'
    },
    'WHYLEFTO': {
        0: 'Job leavers, other reasons',
        1: 'Job leavers, own illness or disability',
        2: 'Job leavers, personal or family responsibilities',
        3: 'Job leavers, going to school',
        4: 'Job losers, laid off',
        5: 'Job leavers, retired',
        'blank': 'Not applicable'
    },
    'WHYLEFTN': {
        0: 'Job leavers, other reasons',
        1: 'Job leavers, own illness or disability',
        2: 'Job leavers, caring for children',
        3: 'Job leavers, pregnancy',
        4: 'Job leavers, personal or family responsibilities',
        5: 'Job leavers, going to school',
        6: 'Job leavers, dissatisfied',
        7: 'Job leavers, retired',
        8: 'Job leavers, business sold or closed down (self-employed)',
        9: 'Job losers, end of seasonal job (employee)',
        10: 'Job losers, end of temporary or casual (employee)',
        11: 'Job losers, company moved or out of business (employee)',
        12: 'Job losers, business conditions (employee)',
        13: 'Job losers, dismissal or other reasons',
        'blank': 'Not applicable'
    },
    'AVAILABL': {
        1: 'Not available',
        2: 'Yes, available',
        'blank': 'Not applicable'
    },
    'LKPUBAG': {
        1: 'Yes',
        'blank': 'Not applicable'
    },
    'LKEMPLOY': {
        1: 'Yes',
        'blank': 'Not applicable'
    },
    'LKRELS': {
        1: 'Yes',
        'blank': 'Not applicable'
    },
    'LKATADS': {
        1: 'Yes',
        'blank': 'Not applicable'
    },
    'LKANSADS': {
        1: 'Yes',
        'blank': 'Not applicable'
    },
    'LKOTHERN': {
        1: 'Yes',
        'blank': 'Not applicable'
    },
    'PRIORACT': {
        0: 'Other',
        1: 'Working',
        2: 'Managing a home',
        3: 'Going to school',
        'blank': 'Not applicable'
    },
    'YNOLOOK': {
        0: 'Wanted work, reason - other',
        1: 'Wanted work, reason - own illness or disability',
        2: 'Wanted work, reason - caring for children',
        3: 'Wanted work, reason - other personal or family responsibilities',
        4: 'Wanted work, reason - school',
        5: 'Wanted work, reason - awaiting recall or reply',
        6: 'Wanted work, reason - discouraged',
        'blank': 'Not applicable'
    },
    'TLOLOOK': {
        1: 'Yes',
        2: 'No',
        'blank': 'Not applicable'
    },
    'SCHOOLN': {
        1: 'Non-student',
        2: 'Full-time student',
        3: 'Part-time student',
        'blank': 'Not applicable'
    },
    'EFAMTYPE': {
        1: 'Person not in an economic family',
        2: 'Dual-earner couple, no children or none under 25',
        3: 'Dual-earner couple, youngest child 0 to 17',
        4: 'Dual-earner couple, youngest child 18 to 24',
        5: 'Single-earner couple, male employed, no children or none under 25',
        6: 'Single-earner couple, male employed, youngest child 0 to 17',
        7: 'Single-earner couple, male employed, youngest child 18 to 24',
        8: 'Single-earner couple, female employed, no children or none under 25',
        9: 'Single-earner couple, female employed, youngest child 0 to 17',
        10: 'Single-earner couple, female employed, youngest child 18 to 24',
        11: 'Non-earner couple, no children or none under 25',
        12: 'Non-earner couple, youngest child 0 to 17',
        13: 'Non-earner couple, youngest child 18 to 24',
        14: 'Lone-parent family, parent employed, youngest child 0 to 17',
        15: 'Lone-parent family, parent employed, youngest child 18 to 24',
        16: 'Lone-parent family, parent not employed, youngest child 0 to 17',
        17: 'Lone-parent family, parent not employed, youngest child 18 to 24',
        18: 'Other families',
        'blank': 'Not applicable'
    },
    'AGYOWNK': {
        1: 'Youngest child less than 6 years',
        2: 'Youngest child 6 to 12 years',
        3: 'Youngest child 13 to 17 years',
        4: 'Youngest child 18 to 24 years',
        'blank': 'Not applicable'
    }}

# Applying the mappings
# Series.map() turns every value that has no key in the mapping (including
# missing cells) into NaN; fillna() then labels all of them 'Not applicable'.
for variable, mapping in mappings.items():
    data[variable] = data[variable].map(mapping).fillna('Not applicable')

# Continuous variables
continuous_vars = [
    'WKSAWAY', 'UHRSMAIN', 'AHRSMAIN', 'UTOTHRS', 'ATOTHRS', 'HRSAWAY',
    'PAIDOT', 'UNPAIDOT', 'XTRAHRS', 'TENURE', 'PREVTEN', 'HRLYEARN',
    'DURUNEMP', 'DURJLESS'
]

# Every other column is treated as categorical, in the column order of `data`.
# (This replaces the former hand-written list, which was overwritten by this
# same expression before it was ever read.)
categorical_vars = [col for col in data.columns if col not in continuous_vars]

# Missing continuous values become 0. Filling them numerically keeps the
# columns numeric; going through the 'Not applicable' string first (as before)
# left them object-typed unless pandas silently downcast them again.
data[continuous_vars] = data[continuous_vars].fillna(0)

# Missing categorical values become the 'Not applicable' category, then every
# categorical column is converted to the pandas 'category' dtype.
data[categorical_vars] = (
    data[categorical_vars]
    .replace({np.nan: 'Not applicable'})
    .astype('category')
)
print(data.head())

# Observed (min, max) range of each continuous variable, taken before scaling.
# These bounds are used later to clip the generated values.
min_max_values = {}
for var_name in continuous_vars:
    column = data[var_name]
    min_max_values[var_name] = (column.min(), column.max())

# Data types of each variable (printed to verify the conversions above)
data_types = data.dtypes
print(data_types)

# Summary statistics, minimum and maximum for the continuous variables
continuous_frame = data[continuous_vars]
continuous_summary = continuous_frame.describe()
min_values = continuous_frame.min()
max_values = continuous_frame.max()

# Value counts for the categorical variables
categorical_summary = {
    var_name: data[var_name].value_counts() for var_name in categorical_vars
}

# One table with dtype, min and max per variable; Min/Max are only available
# for the continuous variables.
data_summary = pd.DataFrame({
    'Data Type': data_types,
    'Min': min_values,
    'Max': max_values
})

print("Data Summary:")
print(data_summary)
print("\nContinuous Variables Summary:")
print(continuous_summary)
print("\nCategorical Variables Summary:")
for var_name, counts in categorical_summary.items():
    print(f"\n{var_name}:\n{counts}")

# Normalizing Continuous Vars
# The fitted scaler is kept so generated values can be mapped back later.
scaler = StandardScaler()
data[continuous_vars] = scaler.fit_transform(data[continuous_vars])


def _dense_one_hot_encoder():
    """Return a OneHotEncoder that produces dense arrays.

    scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and
    later removed the old name, so the new spelling is tried first and the
    old one is only used when the installed version rejects it.
    """
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


# One-Hot Encoding Categorical Variables
# One encoder per variable, kept in `encoders` for the inverse transform later.
encoders = {var: _dense_one_hot_encoder() for var in categorical_vars}
ohe_frames = []
for var in categorical_vars:
    # Categories are encoded by their string form, so the generated column
    # names are "<variable>_<label>".
    transformed = encoders[var].fit_transform(data[[var]].astype(str))
    ohe_frames.append(pd.DataFrame(
        transformed,
        columns=[f"{var}_{category}" for category in encoders[var].categories_[0]],
        index=data.index,
    ))

# Single concat instead of one per variable: same resulting column order
# (remaining columns first, then the one-hot groups in categorical_vars order)
# without re-copying the growing frame on every iteration.
data = pd.concat([data.drop(columns=categorical_vars)] + ohe_frames, axis=1)



#...............................................................................
#2 ML section - building up the cGAN + training
# Splitting Data into Features (X) and Conditions (y)
# A column is a condition when its name contains the one-hot prefix of the
# IMMIG or EDUC variable; every other column is a feature.
def _is_condition_column(col):
    return 'IMMIG_' in col or 'EDUC_' in col


condition_columns = list(filter(_is_condition_column, data.columns))

# Feature column names, saved for later use when rebuilding data frames and plots
column_names = [col for col in data.columns if col not in condition_columns]

y = data[condition_columns].to_numpy()
X = data.drop(columns=condition_columns).to_numpy()
print(condition_columns)

# The cGAN architecture
# Length of the random noise vector fed to the generator.
latent_dim = 100

def build_generator(latent_dim, condition_shape, data_shape):
    """Build the conditional generator network.

    The model takes two inputs, a noise vector of length `latent_dim` and a
    condition vector of length `condition_shape`, concatenates them and passes
    them through Dense layers of 256, 512 and 1024 units (LeakyReLU after
    each, batch normalisation after the first two). The output layer has
    `np.prod(data_shape)` units with a tanh activation.

    Returns the uncompiled Keras Model.
    """
    noise = Input(shape=(latent_dim,))
    condition = Input(shape=(condition_shape,))
    hidden = Concatenate()([noise, condition])

    # (units, followed by batch normalisation?) for each hidden stage
    for units, normalise in ((256, True), (512, True), (1024, False)):
        hidden = Dense(units)(hidden)
        hidden = LeakyReLU(alpha=0.2)(hidden)
        if normalise:
            hidden = BatchNormalization(momentum=0.8)(hidden)

    output = Dense(np.prod(data_shape), activation='tanh')(hidden)
    return Model([noise, condition], output)

def build_discriminator(data_shape, condition_shape):
    """Build the conditional discriminator network.

    The model takes a data row of length `data_shape` and a condition vector
    of length `condition_shape`, concatenates them and passes them through
    Dense layers of 512 and 256 units (LeakyReLU after each), a Dropout of
    0.4 and a single sigmoid output unit.

    Returns the uncompiled Keras Model.
    """
    data_input = Input(shape=(data_shape,))
    condition = Input(shape=(condition_shape,))
    hidden = Concatenate()([data_input, condition])

    for units in (512, 256):
        hidden = Dense(units)(hidden)
        hidden = LeakyReLU(alpha=0.2)(hidden)
    hidden = Dropout(0.4)(hidden)

    validity = Dense(1, activation='sigmoid')(hidden)
    return Model([data_input, condition], validity)

# Build both networks from the widths of the feature and condition matrices.
n_conditions = y.shape[1]
n_features = X.shape[1]
generator = build_generator(latent_dim, n_conditions, n_features)
discriminator = build_discriminator(n_features, n_conditions)

# Compiling the discriminator
discriminator.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.0002, beta_1=0.5),
    metrics=['accuracy'],
)

# Combined model: noise + condition -> generator -> discriminator. The
# discriminator is marked non-trainable before the combined model is built.
discriminator.trainable = False
noise = Input(shape=(latent_dim,))
condition = Input(shape=(n_conditions,))
generated_data = generator([noise, condition])
validity = discriminator([generated_data, condition])
combined = Model([noise, condition], validity)
combined.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.0002, beta_1=0.5),
)

# Sanity checks: the second dimension of every model input must equal the
# size it was built for.
generator_input_shapes = list(generator.input_shape)
discriminator_input_shapes = list(discriminator.input_shape)

shape_checks = (
    (generator_input_shapes[0][1], latent_dim,
     "Generator's noise input shape does not match the expected latent dimension."),
    (generator_input_shapes[1][1], n_conditions,
     "Generator's condition input shape does not match the expected condition shape."),
    (discriminator_input_shapes[0][1], n_features,
     "Discriminator's data input shape does not match the expected data shape."),
    (discriminator_input_shapes[1][1], n_conditions,
     "Discriminator's condition input shape does not match the expected condition shape."),
)
for actual, expected, message in shape_checks:
    if actual != expected:
        raise ValueError(message)

# Validation during training
def train(epochs, batch_size=128):
    """Alternate discriminator and generator updates on random batches.

    Each "epoch" here is a single batch update, not a full pass over X:
    one random batch of real rows is drawn, the generator produces a fake
    batch for the same conditions, the discriminator is updated on both, and
    the generator is then updated through the combined model.

    Args:
        epochs: number of batch updates to run.
        batch_size: number of rows sampled (with replacement) per update.

    Raises:
        ValueError: if a discriminator or generator loss becomes NaN.
    """
    # The target labels never change, so build them once.
    real_labels = np.ones((batch_size, 1))
    fake_labels = np.zeros((batch_size, 1))

    for epoch in range(epochs):
        # Randomly selecting a batch of real data
        idx = np.random.randint(0, X.shape[0], batch_size)
        real_data = X[idx]
        real_conditions = y[idx]

        # Generating a batch of fake data; verbose=0 stops predict() from
        # printing a progress line on every iteration.
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        gen_data = generator.predict([noise, real_conditions], verbose=0)

        # Training the discriminator
        d_loss_real = discriminator.train_on_batch([real_data, real_conditions], real_labels)
        d_loss_fake = discriminator.train_on_batch([gen_data, real_conditions], fake_labels)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # Training the generator: it is rewarded when fakes are labelled real
        g_loss = combined.train_on_batch([noise, real_conditions], real_labels)

        # Checking for NaN values in loss (.any() works for scalars and arrays)
        if np.isnan(d_loss).any() or np.isnan(g_loss).any():
            raise ValueError(f"NaN value detected in loss at epoch {epoch}")

        # Print progress
        if epoch % 100 == 0:
            print(f"Epoch {epoch} / {epochs}, [D loss: {d_loss}], [G loss: {g_loss}]")

# Training the Model
# Runs 200 iterations of train() with batches of 32 rows; progress is printed
# every 100th iteration.
train(epochs=200, batch_size=32)

# Model summary, checking input sizes
# Prints the generator's layers so the input and output widths can be checked.
generator.summary()


#...............................................................................
#3 ML section - applying the cGAN -> generating synthetic data for scenarios 1 to 7
# Conditions for the scenarios
# One scenario per education label, numbered from 1 in the order listed here;
# each scenario maps to a one-element list holding its label.
_EDUCATION_LEVELS = (
    '0 to 8 years',
    'Some high school',
    'High school graduate',
    'Some postsecondary',
    'Postsecondary certificate or diploma',
    "Bachelor's degree",
    "Above bachelor's degree",
)
scenario_conditions = {
    f'scenario_{number}': [level]
    for number, level in enumerate(_EDUCATION_LEVELS, start=1)
}

def encode_conditions(immig_status, educ_status, num_samples):
    """Build the condition matrix for one immigration / education combination.

    The returned matrix uses exactly the column layout of the training
    conditions `y`, i.e. the order of `condition_columns`. Each requested
    label is located by its one-hot column name ("IMMIG_<label>" or
    "EDUC_<label>") instead of assuming that the IMMIG block comes first:
    `condition_columns` follows the column order of `data`, so writing
    IMMIG first and EDUC second would put the ones in the wrong columns
    whenever the EDUC columns precede the IMMIG columns.

    Args:
        immig_status: label of the IMMIG category to switch on.
        educ_status: label of the EDUC category to switch on.
        num_samples: number of identical rows to return.

    Returns:
        Array of shape (num_samples, y.shape[1]) with a 1 in the columns of
        the two requested categories and 0 elsewhere. A label with no
        matching column is left as all zeros, mirroring the encoders'
        handle_unknown='ignore' setting.
    """
    conditions = np.zeros((num_samples, y.shape[1]))

    for var, status in (('IMMIG', immig_status), ('EDUC', educ_status)):
        column = f"{var}_{status}"
        if column in condition_columns:
            conditions[:, condition_columns.index(column)] = 1.0

    return conditions

def generate_synthetic_data(generator, num_samples, condition):
    """Return the generator's prediction for `num_samples` random noise vectors.

    One standard-normal noise vector of length `latent_dim` is drawn per
    sample and passed to `generator.predict` together with `condition`.
    """
    latent_batch = np.random.normal(0, 1, (num_samples, latent_dim))
    return generator.predict([latent_batch, condition])

def postprocess_synthetic_data(synthetic_data, encoders, scaler, continuous_vars, min_max_values, categorical_vars, column_names):
    synthetic_df = pd.DataFrame(synthetic_data, columns=column_names)

    # Reversing normalization for continuous variables
    synthetic_continuous = pd.DataFrame(scaler.inverse_transform(synthetic_df[continuous_vars]), columns=continuous_vars)

    # Enforcing natural min and max boundaries for each continuous variable
    for var in continuous_vars:
        min_val, max_val = min_max_values[var]
        synthetic_continuous[var] = np.clip(synthetic_continuous[var], min_val, max_val)

    synthetic_df[continuous_vars] = synthetic_continuous

    # Applying inverse transformation for categorical variables (excluding 'EDUC' and 'IMMIG')
    for var in categorical_vars:
        if var not in ['EDUC', 'IMMIG']:
            encoded_cols = [col for col in column_names if col.startswith(var + "_")]
            if encoded_cols:
                synthetic_df[var] = encoders[var].inverse_transform(synthetic_df[encoded_cols])

    # Not needed, drop the one-hot encoded columns as they are no longer needed
    for var in categorical_vars:
        encoded_cols = [col for col in column_names if col.startswith(var + "_")]
        synthetic_df.drop(encoded_cols, axis=1, inplace=True)

    return synthetic_df

# Generating Synthetic Profiles
# Every scenario uses the same immigration status and sample size; only the
# education label changes.
num_samples = 1000
immig_status = 'Immigrant, landed 10 or less years earlier'


def _profile_file_name(scenario, educ_status):
    """Return the CSV file name for one scenario / education label.

    Used for both writing and reading so the two can never drift apart
    (the labels contain spaces and apostrophes).
    """
    return f'synthetic_profiles_{scenario}_{educ_status.replace(" ", "_")}.csv'


for scenario, educ_statuses in scenario_conditions.items():
    for educ_status in educ_statuses:
        condition = encode_conditions(immig_status, educ_status, num_samples)

        synthetic_data = generate_synthetic_data(generator, num_samples, condition)

        synthetic_profiles = postprocess_synthetic_data(synthetic_data, encoders, scaler, continuous_vars, min_max_values, categorical_vars, column_names)

        file_name = _profile_file_name(scenario, educ_status)
        synthetic_profiles.to_csv(file_name, index=False)
        print(f'Saved: {file_name}')
        # The generator summary is no longer printed here: it is identical
        # for every scenario and is already printed once after training.
        print(synthetic_profiles.head())

# Loading the synthetic profiles for the specified scenarios
synthetic_profiles_scenario_1 = pd.read_csv(
    _profile_file_name('scenario_1', scenario_conditions['scenario_1'][0]))
synthetic_profiles_scenario_5 = pd.read_csv(
    _profile_file_name('scenario_5', scenario_conditions['scenario_5'][0]))
synthetic_profiles_scenario_7 = pd.read_csv(
    _profile_file_name('scenario_7', scenario_conditions['scenario_7'][0]))

# Listing of the loaded profiles for convenience: (data frame, description)
profiles_list = [
    (synthetic_profiles_scenario_1, 'Immigrants with 0 to 8 years of education'),
    (synthetic_profiles_scenario_5, 'Immigrants with a postsecondary certificate or diploma'),
    (synthetic_profiles_scenario_7, 'Immigrants with education above a bachelor\'s degree')
]


#...............................................................................
#4 Results - visualising the results
# Sorted list of the distinct NAICS_21 labels found in the scenario 1 profiles.
# NOTE(review): industry_order is not referenced again in the visible part of
# this file - confirm whether it is still needed.
industry_order = sorted(synthetic_profiles_scenario_1['NAICS_21'].unique())

def creating_plot(ax, title, xlabel=None, ylabel='Count', rotation=45, legend_title=None):
    """Apply the shared look to an existing axes.

    Sets a bold title and bold axis labels, rotates the x tick labels by
    `rotation` degrees (right-aligned), removes the top/right spines and
    draws a dashed horizontal grid. A legend is added only when
    `legend_title` is given.
    """
    label_style = {'fontsize': 10, 'weight': 'bold'}

    ax.set_title(title, fontsize=16, weight='bold')
    ax.set_xlabel(xlabel, **label_style)
    ax.set_ylabel(ylabel, **label_style)

    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(rotation)
        tick_label.set_horizontalalignment("right")

    sns.despine(ax=ax)
    if legend_title:
        ax.legend(title=legend_title)
    ax.grid(axis='y', linestyle='--', alpha=0.6)

# Function to plot the distribution for a given variable, separated by sex, excluding 'Not applicable'
def plot_distribution_by_sex(data, variable, title_prefix, file_prefix, scenario_name):
    """Save one distribution plot of `variable` per sex.

    For 'HRLYEARN' a histogram with a density curve is drawn; for any other
    variable a bar chart of the percentage share of each category. Rows whose
    value is 'Not applicable' are left out. Each figure is written to
    '<file_prefix>_<sex>.png' and then closed.
    """
    applicable = data[variable] != 'Not applicable'

    # A fixed category order (taken from both sexes together) keeps the bars
    # in the same position in the male and the female plot.
    order = None
    if variable in ['NAICS_21', 'COWMAIN', 'CMA', 'AGE_12']:
        order = sorted(data[applicable][variable].unique())

    for sex in ['Male', 'Female']:
        plt.figure(figsize=(12, 7))
        subset = data[(data['SEX'] == sex) & applicable]

        if variable in ['HRLYEARN']:
            sns.histplot(subset, x=variable, kde=True, bins=30, color='skyblue')
        else:
            shares = subset[variable].value_counts(normalize=True) * 100
            # Categories missing for this sex get a bar of height 0.
            shares = shares.reindex(order).fillna(0)
            sns.barplot(x=shares.index, y=shares.values, order=order)

        plot_title = f'{title_prefix} - {sex} immigrants {scenario_name}'
        creating_plot(plt.gca(), plot_title, variable, ylabel='Percentage', rotation=45)

        # Save
        plt.tight_layout()
        plt.savefig(f'{file_prefix}_{sex}.png')
        plt.close()

# Variables to plot
variables_to_plot = ['HRLYEARN', 'NAICS_21', 'COWMAIN', 'CMA', 'AGE_12']

# Plot title per variable. CMA holds metropolitan areas (Montreal, Toronto,
# ...), not provinces, so it is titled accordingly; provinces are in PROV.
variable_titles = {
    'HRLYEARN': 'Hourly Earnings',
    'NAICS_21': 'Employment Sectors',
    'COWMAIN': 'Type of Employment',
    'CMA': 'Census Metropolitan Area',
    'AGE_12': 'Age'
}
scenario_titles = {
    'scenario_1': 'with 0 to 8 years of education',
    'scenario_5': 'with a postsecondary certificate or diploma',
    'scenario_7': 'with education above a bachelor\'s degree'
}

# Profiles to plot, keyed by scenario so the title and file prefix are looked
# up from the same key.
scenario_profiles = {
    'scenario_1': synthetic_profiles_scenario_1,
    'scenario_5': synthetic_profiles_scenario_5,
    'scenario_7': synthetic_profiles_scenario_7,
}

# Generating plots for each variable and scenario
for variable in variables_to_plot:
    for scenario, profiles in scenario_profiles.items():
        plot_distribution_by_sex(
            profiles,
            variable,
            variable_titles[variable],
            f'{scenario}_{variable}',
            scenario_titles[scenario],
        )


   REC_NUM  SURVYEAR  SURVMNTH  LFSSTAT  PROV  CMA  AGE_12  AGE_6  SEX  \
0        1      2020         1        1    35    4       7    NaN    2   
1        2      2020         1        3    35    4       2    4.0    2   
2        3      2020         1        1    35    4       4    NaN    1   
3        4      2020         1        1    48    8       8    NaN    2   
4        5      2020         1        1    24    2       8    NaN    2   

   MARSTAT  ...  LKATADS  LKANSADS  LKOTHERN  PRIORACT  YNOLOOK  TLOLOOK  \
0        1  ...      NaN       NaN       NaN       NaN      NaN      NaN   
1        6  ...      1.0       1.0       NaN       3.0      NaN      NaN   
2        2  ...      NaN       NaN       NaN       NaN      NaN      NaN   
3        2  ...      NaN       NaN       NaN       NaN      NaN      NaN   
4        6  ...      NaN       NaN       NaN       NaN      NaN      NaN   

   SCHOOLN  EFAMTYPE  AGYOWNK  FINALWT  
0      1.0         3      3.0      732  
1      1.0      

