In [None]:
# Install library

!pip install sdv > /dev/null 2>&1

In [None]:
from google.colab import drive
import os
import pandas as pd
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from sklearn.utils import shuffle

# Make Google Drive visible to the Colab runtime
drive.mount('/content/drive')

# Location of the project data on Drive and the raw input file
dir = '/content/drive/MyDrive/HIV-Exp-Project/Data'
file_path = os.path.join(dir, "clients_data.csv")

# Read the raw client records
df = pd.read_csv(file_path)

# Keep only rows from the three listed states; State stays a plain string
# column (not category dtype)
valid_states = ["Bauchi", "Jigawa", "Kano"]
in_valid_state = df["State"].isin(valid_states)
df = df.loc[in_valid_state].copy()
df["State"] = df["State"].astype(str)

# Discard rows that have no satisfaction label
df.dropna(subset=["Visit_Satisfaction"], inplace=True)

# Modelling frame: the same rows without the EnumID identifier (if present)
df_model = df.drop(columns=["EnumID"], errors="ignore")

# Number of synthetic rows to generate for each Visit_Satisfaction label
# that is being topped up.
minority_targets = {
    "Satisfied": 270,
    "Neutral": 370,
    "Very dissatisfied": 370
}

synthetic_samples = []
# Next unused synthetic ID number. It is 1-based and shared across all
# classes, so the numeric suffix is unique over the whole synthetic set.
synthetic_id_counter = 1

# For every target label: fit a CTGAN on the real rows of that label only,
# sample the requested number of rows, and tag them with synthetic IDs.
for target_value, num_samples in minority_targets.items():
    print(f"\nGenerating samples for: {target_value}")

    subset = df_model[df_model["Visit_Satisfaction"] == target_value].copy()

    # ✅ Ensure state values are valid and string-typed
    subset = subset[subset["State"].isin(valid_states)].copy()
    subset["State"] = subset["State"].astype(str)

    # Fail early with a readable message (e.g. on a misspelled label) instead
    # of letting the synthesizer fail on an empty training frame.
    if subset.empty:
        raise ValueError(
            f"No rows with Visit_Satisfaction == {target_value!r}; "
            "cannot train a synthesizer for this class."
        )

    # Build metadata describing the columns of this class subset
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(subset)

    # Explicitly mark 'State' as categorical if it's not already
    if metadata.columns["State"]["sdtype"] != "categorical":
        metadata.update_column(column_name="State", sdtype="categorical")

    # Train and sample
    model = CTGANSynthesizer(metadata)
    model.fit(subset)

    synth = model.sample(num_rows=num_samples)

    # Add synthetic IDs. The counter is already 1-based, so it is used as-is;
    # the previous `i + 1` shifted every ID by one and the sequence started
    # at 2. The list is sized from the rows actually returned so that it
    # always matches the frame length.
    id_prefix = f"SYNTH-{target_value[:3].upper()}"
    first_id = synthetic_id_counter
    synth["EnumID"] = [
        f"{id_prefix}-{n}"
        for n in range(first_id, first_id + len(synth))
    ]
    synthetic_id_counter += len(synth)

    synthetic_samples.append(synth)
    print(f"✅ Generated {num_samples} samples for: {target_value}")

# Tag every row with its provenance, then stack the per-class synthetic frames
df["Source"] = "Original"
synthetic_df = pd.concat(synthetic_samples, ignore_index=True)
synthetic_df["Source"] = "Synthetic"

# Append the synthetic rows to the originals and randomise the row order
# with a fixed seed
combined_frames = [df, synthetic_df]
balanced_df = shuffle(
    pd.concat(combined_frames, ignore_index=True),
    random_state=42,
)

# Write the combined dataset next to the input file on Drive
output_path = os.path.join(dir, "balanced.csv")
balanced_df.to_csv(output_path, index=False)
print(f"\n✅ Saved balanced dataset to: {output_path}")

Mounted at /content/drive

Generating samples for: Satisfied




✅ Generated 270 samples for: Satisfied

Generating samples for: Neutral




✅ Generated 370 samples for: Neutral

Generating samples for: Very dissatisfied




✅ Generated 370 samples for: Very dissatisfied

✅ Saved balanced dataset to: /content/drive/MyDrive/HIV-Exp-Project/Data/balanced.csv


In [None]:
from sdv.evaluation.single_table import run_diagnostic

# Run the SDV diagnostic report on the synthetic rows against the real rows.
#
# The metadata is built here from the frame being evaluated rather than
# reusing the `metadata` variable left over from the last iteration of the
# generation loop, which was detected from a single satisfaction class only.
# EnumID and Source are bookkeeping columns that metadata never described,
# so they are dropped from both frames before the comparison.
diagnostic_real = df.drop(columns=["EnumID", "Source"], errors="ignore")
diagnostic_synth = synthetic_df.drop(columns=["EnumID", "Source"], errors="ignore")

diagnostic_metadata = SingleTableMetadata()
diagnostic_metadata.detect_from_dataframe(diagnostic_real)

# Same guard as in the generation loop: treat 'State' as categorical
if diagnostic_metadata.columns["State"]["sdtype"] != "categorical":
    diagnostic_metadata.update_column(column_name="State", sdtype="categorical")

diagnostic = run_diagnostic(
    diagnostic_real,
    diagnostic_synth,
    diagnostic_metadata
)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 33/33 [00:00<00:00, 992.46it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 77.45it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



In [None]:
from sdv.evaluation.single_table import evaluate_quality
from sdv.metadata import SingleTableMetadata

# Compare only the modelled columns: EnumID and Source are bookkeeping fields
bookkeeping_cols = ["EnumID", "Source"]
real_eval = df.drop(columns=bookkeeping_cols, errors="ignore")
synth_eval = synthetic_df.drop(columns=bookkeeping_cols, errors="ignore")

# Fresh metadata detected from the real frame, so it describes exactly the
# columns present in both evaluated frames
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=real_eval)

quality_report = evaluate_quality(real_eval, synth_eval, metadata)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 33/33 [00:00<00:00, 1113.73it/s]|
Column Shapes Score: 84.67%

(2/2) Evaluating Column Pair Trends: |██████████| 528/528 [00:02<00:00, 180.60it/s]|
Column Pair Trends Score: 71.9%

Overall Score (Average): 78.28%



In [None]:
# Per-column scores behind the "Column Shapes" property of the quality report
column_shape_details = quality_report.get_details('Column Shapes')
column_shape_details

Unnamed: 0,Column,Metric,Score
0,Age,TVComplement,0.757779
1,Gender,TVComplement,0.843159
2,Marital Status,TVComplement,0.794699
3,Family Setting,TVComplement,0.850146
4,Num of Children,TVComplement,0.763225
5,Educational Status,TVComplement,0.822427
6,Employment Status,TVComplement,0.827668
7,Monthly Income,TVComplement,0.88704
8,Treatment Regimen,TVComplement,0.884731
9,HIV_Duration_Years,TVComplement,0.850402


In [None]:
from sdv.evaluation.single_table import get_column_plot

# Plot the real vs. synthetic distribution of the satisfaction label
plot_options = dict(column_name='Visit_Satisfaction', metadata=metadata)
fig = get_column_plot(df, synthetic_df, **plot_options)

fig.show()

In [None]:
# Preview the first rows of the shuffled, combined dataset
balanced_df.head(n=5)

Unnamed: 0,EnumID,State,Age,Gender,Marital Status,Family Setting,Num of Children,Educational Status,Employment Status,Monthly Income,...,Meds_Explained_SideFX,Encourage_Questions,Respond_Q_Concerns,Showed_Personal_Concern,Involved_In_Decisions,Discuss_NextSteps,Checked_Understanding,Time_Spent_Adequate,Visit_Satisfaction,Source
1117,SYNTH-NEU-628,Bauchi,25-34,Female,Single,Monogamy,3-4,Primary education,Self-employed,"101,000–200,000 Naira",...,Agree,Agree,Agree,Neither Agree or Disagree,Agree,Neither Agree or Disagree,Strongly Agree,Agree,Neutral,Synthetic
643,SYNTH-SAT-154,Kano,55 years and above,Female,Single,Monogamy,Greater than 4,No formal education,Self-employed,"51,000–100,000 Naira",...,Strongly Agree,Agree,Agree,Disagree,Strongly Agree,Agree,Agree,Agree,Satisfied,Synthetic
422,BC023,Bauchi,35–44,Female,Married,Monogamy,1-2,Secondary education,Unemployed,Prefer not to say,...,Agree,Agree,Disagree,Disagree,Strongly Agree,Strongly Agree,Strongly Agree,Agree,Very satisfied,Original
413,KN27,Kano,55 years and above,Female,Divorced,Monogamy,Greater than 4,No formal education,Unemployed,Prefer not to say,...,Strongly Agree,Disagree,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,Very satisfied,Original
451,BC023,Bauchi,35–44,Male,Married,Monogamy,3-4,Secondary education,Other (please specify),"51,000–100,000 Naira",...,Agree,Neither Agree or Disagree,Agree,Agree,Agree,Strongly Agree,Agree,Agree,Very satisfied,Original


In [None]:
# Rows per satisfaction label in the combined dataset
satisfaction_labels = balanced_df['Visit_Satisfaction']
satisfaction_labels.value_counts()

Unnamed: 0_level_0,count
Visit_Satisfaction,Unnamed: 1_level_1
Neutral,376
Satisfied,376
Very satisfied,375
Very dissatisfied,374
