In [1]:
!uv pip install sdv

[2mUsing Python 3.10.19 environment at: C:\Users\ardav\.conda\envs\compe[0m
[2mResolved [1m40 packages[0m [2min 2.33s[0m[0m
[36m[1mDownloading[0m[39m faker [2m(1.9MiB)[0m
 [32m[1mDownloading[0m[39m faker
[2mPrepared [1m7 packages[0m [2min 4.53s[0m[0m
[2mInstalled [1m7 packages[0m [2min 1.08s[0m[0m
 [32m+[39m [1mcopulas[0m[2m==0.12.3[0m
 [32m+[39m [1mctgan[0m[2m==0.11.1[0m
 [32m+[39m [1mdeepecho[0m[2m==0.7.0[0m
 [32m+[39m [1mfaker[0m[2m==38.2.0[0m
 [32m+[39m [1mrdt[0m[2m==1.18.2[0m
 [32m+[39m [1msdmetrics[0m[2m==0.24.0[0m
 [32m+[39m [1msdv[0m[2m==1.29.1[0m


# Implementasi Generasi Data Sintetis (Week 4)

Notebook ini mengimplementasikan pipeline generasi data sintetis menggunakan algoritma **CTGAN (Conditional Tabular GAN)** dari library `SDV`.

**Tujuan:**
- Menghasilkan dataset sintetis minimal 1.500 baris.
- Mempertahankan karakteristik statistik data asli.
- Menjaga privasi dengan tidak memuat informasi pengenal langsung (Direct Identifiers).

**Alur Kerja:**
1.  **Preprocessing:** Memuat data, menghapus kolom sensitif (`customerID`), serta mengonversi `TotalCharges` ke numerik dan mengisi nilai yang kosong dengan rata-rata.
2.  **Model Training:** Melatih model CTGAN pada data asli.
3.  **Sampling:** Membangkitkan data sintetis.
4.  **Saving:** Menyimpan hasil ke format CSV.

In [2]:
import warnings
from pathlib import Path

import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

# Silence all warnings for the rest of the session to keep cell outputs tidy.
# NOTE: this also hides library deprecation notices, so the code below avoids
# patterns that only "work" with a warning.
warnings.filterwarnings('ignore')

# 1. Load Data & Preprocessing
print("--- 1. Loading Data ---")
try:
    df = pd.read_csv('data.csv')
except FileNotFoundError:
    # Report the problem, then re-raise so execution stops here instead of
    # failing later with a NameError on `df_train`.
    print("Error: File 'data.csv' tidak ditemukan.")
    raise

print(f"Data asli dimuat: {df.shape}")

# Drop direct identifiers (customerID) so they never reach the model.
if 'customerID' in df.columns:
    df_train = df.drop(columns=['customerID'])
    print("Kolom 'customerID' dihapus untuk pelatihan.")
else:
    df_train = df.copy()

# Convert TotalCharges to numeric; values that cannot be parsed become NaN.
df_train['TotalCharges'] = pd.to_numeric(df_train['TotalCharges'], errors='coerce')
# Impute the NaNs with the column mean. Assign the result back rather than
# calling fillna(inplace=True) on the selected column: the chained in-place
# form operates on a temporary under pandas Copy-on-Write and would leave the
# NaNs in df_train.
df_train['TotalCharges'] = df_train['TotalCharges'].fillna(df_train['TotalCharges'].mean())

display(df_train.head())

--- 1. Loading Data ---
Data asli dimuat: (7043, 21)
Kolom 'customerID' dihapus untuk pelatihan.


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# 2. Model Training (CTGAN)
print("\n--- 2. Training CTGAN Model ---")

# Let SDV infer the column types from the training frame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df_train)

# Show the detected metadata so the inferred sdtypes can be checked.
print("Metadata terdeteksi:", metadata.to_dict(), sep="\n")

# Build the synthesizer. 100 epochs keeps training reasonably quick while
# giving usable results; raise it for better quality.
synthesizer = CTGANSynthesizer(metadata, verbose=True, epochs=100)

# Fit the model on the training data.
synthesizer.fit(df_train)
print("Pelatihan model selesai.")


--- 2. Training CTGAN Model ---
Metadata terdeteksi:
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1', 'columns': {'gender': {'sdtype': 'categorical'}, 'SeniorCitizen': {'sdtype': 'categorical'}, 'Partner': {'sdtype': 'categorical'}, 'Dependents': {'sdtype': 'categorical'}, 'tenure': {'sdtype': 'numerical'}, 'PhoneService': {'sdtype': 'categorical'}, 'MultipleLines': {'sdtype': 'categorical'}, 'InternetService': {'sdtype': 'categorical'}, 'OnlineSecurity': {'sdtype': 'categorical'}, 'OnlineBackup': {'sdtype': 'categorical'}, 'DeviceProtection': {'sdtype': 'categorical'}, 'TechSupport': {'sdtype': 'categorical'}, 'StreamingTV': {'sdtype': 'categorical'}, 'StreamingMovies': {'sdtype': 'categorical'}, 'Contract': {'sdtype': 'categorical'}, 'PaperlessBilling': {'sdtype': 'categorical'}, 'PaymentMethod': {'sdtype': 'categorical'}, 'MonthlyCharges': {'sdtype': 'numerical'}, 'TotalCharges': {'sdtype': 'numerical'}, 'Churn': {'sdtype': 'categorical'}}}


Gen. (-1.69) | Discrim. (-0.02): 100%|██████████| 100/100 [01:22<00:00,  1.22it/s]

Pelatihan model selesai.





In [5]:
# 3. Sampling (synthetic data generation)
print("\n--- 3. Generating Synthetic Data ---")

# Target: at least 1,500 samples (as required by the assignment).
n_samples = 2000
synthetic_data = synthesizer.sample(num_rows=n_samples)

print(f"Berhasil membangkitkan {len(synthetic_data)} baris data sintetis.")
display(synthetic_data.head())

# 4. Saving Result
output_file = 'output/synthetic_telco_churn.csv'
# to_csv does not create missing directories, so make sure the target folder
# exists first; otherwise the save fails on a fresh checkout.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
synthetic_data.to_csv(output_file, index=False)
print(f"\n--- 4. Data disimpan ke '{output_file}' ---")


--- 3. Generating Synthetic Data ---
Berhasil membangkitkan 2000 baris data sintetis.


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Male,1,Yes,No,72,Yes,No,DSL,No internet service,Yes,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Mailed check,24.68,776.780986,No
1,Female,0,Yes,No,72,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),101.58,8684.8,No
2,Male,1,Yes,No,39,Yes,No,Fiber optic,Yes,Yes,Yes,Yes,No,Yes,One year,No,Electronic check,64.0,2141.807391,No
3,Male,0,Yes,Yes,8,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,No,Mailed check,28.67,18.8,No
4,Female,0,No,No,6,Yes,No,DSL,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,39.53,1792.365286,Yes



--- 4. Data disimpan ke 'output/synthetic_telco_churn.csv' ---


## Pratinjau Dataset Hasil Generasi CTGAN

In [None]:
# Reload the CSV that was just written so the preview reflects the file on
# disk rather than the in-memory sample.
synthetic_data = pd.read_csv(output_file)

# Report the row count, then leave the frame as the cell's last expression so
# the notebook renders it.
print(f'Total data sintesis yang di generate CT-GAN: {len(synthetic_data)}')
synthetic_data

Total data sintesis yang di generate CT-GAN: 2000


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Male,1,Yes,No,72,Yes,No,DSL,No internet service,Yes,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Mailed check,24.68,776.780986,No
1,Female,0,Yes,No,72,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),101.58,8684.800000,No
2,Male,1,Yes,No,39,Yes,No,Fiber optic,Yes,Yes,Yes,Yes,No,Yes,One year,No,Electronic check,64.00,2141.807391,No
3,Male,0,Yes,Yes,8,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,No,Mailed check,28.67,18.800000,No
4,Female,0,No,No,6,Yes,No,DSL,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,39.53,1792.365286,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,Male,1,Yes,No,6,Yes,No,Fiber optic,Yes,Yes,No internet service,No,No,No,One year,Yes,Bank transfer (automatic),81.96,751.286327,No
1996,Male,0,Yes,No,11,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Bank transfer (automatic),26.31,18.800000,No
1997,Female,0,Yes,Yes,16,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,One year,No,Bank transfer (automatic),106.94,1792.505991,No
1998,Male,0,No,No,29,Yes,No,Fiber optic,No,No internet service,No,No,Yes,Yes,Month-to-month,Yes,Electronic check,118.75,1741.354453,No
