In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
import kagglehub

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()

dataset_path = kagglehub.dataset_download("luishcaldernb/morosidad")
csv_path = os.path.join(dataset_path, "data.csv")
df = pd.read_csv(csv_path)

In [7]:
df['exp_sf'].fillna(df['exp_sf'].mean(), inplace=True) #Reeplaza poniendo inplace=True
df.dropna(subset=['linea_sf'], inplace=True)
df.dropna(subset=['deuda_sf'], inplace=True)
original_df = pd.get_dummies(df, columns=['vivienda','nivel_educ'], drop_first=False)
original_df = original_df.drop('zona', axis=1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['exp_sf'].fillna(df['exp_sf'].mean(), inplace=True) #Reeplaza poniendo inplace=True


In [8]:
original_df

Unnamed: 0,mora,atraso,edad,dias_lab,exp_sf,nivel_ahorro,ingreso,linea_sf,deuda_sf,score,clasif_sbs,vivienda_ALQUILADA,vivienda_FAMILIAR,vivienda_PROPIA,nivel_educ_SECUNDARIA,nivel_educ_SIN EDUCACION,nivel_educ_TECNICA,nivel_educ_UNIVERSITARIA
1,0,18,32,4598,9.000000,12,900.00,1824.67,1933.75,175,1,False,True,False,False,False,True,False
2,0,0,26,5148,8.000000,2,2400.00,2797.38,188.29,187,0,False,True,False,False,False,False,True
4,0,0,46,3960,32.401583,1,3100.00,2000.00,11010.65,189,0,False,True,False,False,False,True,False
5,0,22,25,4874,9.000000,12,2200.00,449.92,496.58,220,0,False,True,False,False,False,False,True
6,0,9,30,3930,12.000000,8,2100.00,4827.64,850.21,193,0,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8394,0,0,43,7917,106.000000,12,13660.00,121543.40,8338.46,224,1,False,False,True,False,False,False,True
8395,1,0,29,3352,25.000000,0,813.00,2200.00,0.00,197,1,False,True,False,False,False,False,True
8396,0,0,32,6821,19.000000,12,3000.00,10087.00,933.68,200,1,False,True,False,False,False,False,True
8397,1,0,38,3882,7.000000,12,3069.23,23994.00,13954.65,221,1,False,False,True,False,False,False,True


In [9]:
import numpy as np
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import SingleTableMetadata
original_df['data_type'] = 'original'

print("Original data shape:", original_df.shape)
print("\nOriginal columns:")
print(original_df.columns.tolist())

Original data shape: (6942, 19)

Original columns:
['mora', 'atraso', 'edad', 'dias_lab', 'exp_sf', 'nivel_ahorro', 'ingreso', 'linea_sf', 'deuda_sf', 'score', 'clasif_sbs', 'vivienda_ALQUILADA', 'vivienda_FAMILIAR', 'vivienda_PROPIA', 'nivel_educ_SECUNDARIA', 'nivel_educ_SIN EDUCACION', 'nivel_educ_TECNICA', 'nivel_educ_UNIVERSITARIA', 'data_type']


In [12]:
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=original_df.drop('data_type', axis=1))

synthetizer = GaussianCopulaSynthesizer(metadata)
synthetizer.fit(original_df.drop('data_type', axis=1))

synthetic_df = synthetizer.sample(num_rows=len(original_df))

synthetic_df['data_type'] = 'synthetic'

print("Synthetic data shape:", synthetic_df.shape)

# Compare distributions
print("\nOriginal data - target (mora) distribution:")
print(original_df['mora'].value_counts(normalize=True) * 100, "%")
print("\nSynthetic data - target (mora) distribution:")
print(synthetic_df['mora'].value_counts(normalize=True) * 100, "%")



Synthetic data shape: (6942, 19)

Original data - target (mora) distribution:
mora
1    69.648516
0    30.351484
Name: proportion, dtype: float64 %

Synthetic data - target (mora) distribution:
mora
1    68.726592
0    31.273408
Name: proportion, dtype: float64 %


In [13]:
original_df.to_csv('original_processed.csv', index=False)
synthetic_df.to_csv('synthetic_data.csv', index=False)

In [14]:
def prepare_for_model(df):
    # Separate features and target
    y = df['mora']
    X = df.drop(['mora', 'data_type'], axis=1)
    return X, y

# Prepare both datasets
X_orig, y_orig = prepare_for_model(original_df)
X_synth, y_synth = prepare_for_model(synthetic_df)

print("Original data shapes:")
print(f"X: {X_orig.shape}, y: {y_orig.shape}")
print("\nSynthetic data shapes:")
print(f"X: {X_synth.shape}, y: {y_synth.shape}")

# Show a few samples from each
print("\nFirst few rows of original features:")
print(X_orig.head())
print("\nFirst few rows of synthetic features:")
print(X_synth.head())

Original data shapes:
X: (6942, 17), y: (6942,)

Synthetic data shapes:
X: (6942, 17), y: (6942,)

First few rows of original features:
   atraso  edad  dias_lab     exp_sf  nivel_ahorro  ingreso  linea_sf  \
1      18    32      4598   9.000000            12    900.0   1824.67   
2       0    26      5148   8.000000             2   2400.0   2797.38   
4       0    46      3960  32.401583             1   3100.0   2000.00   
5      22    25      4874   9.000000            12   2200.0    449.92   
6       9    30      3930  12.000000             8   2100.0   4827.64   

   deuda_sf  score  clasif_sbs  vivienda_ALQUILADA  vivienda_FAMILIAR  \
1   1933.75    175           1               False               True   
2    188.29    187           0               False               True   
4  11010.65    189           0               False               True   
5    496.58    220           0               False               True   
6    850.21    193           0               False          

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def train_and_evaluate(X, y, data_type):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_scaled, y_train)
    
    # Evaluate
    y_pred = model.predict(X_test_scaled)
    
    print(f"\nResults for {data_type} data:")
    print(classification_report(y_test, y_pred))
    
    return model, scaler

# Train and evaluate on original data
print("Training on original data...")
original_model, original_scaler = train_and_evaluate(X_orig, y_orig, "original")

# Train and evaluate on synthetic data
print("\nTraining on synthetic data...")
synthetic_model, synthetic_scaler = train_and_evaluate(X_synth, y_synth, "synthetic")

Training on original data...

Results for original data:
              precision    recall  f1-score   support

           0       0.85      0.69      0.76       416
           1       0.88      0.95      0.91       973

    accuracy                           0.87      1389
   macro avg       0.87      0.82      0.84      1389
weighted avg       0.87      0.87      0.87      1389


Training on synthetic data...

Results for synthetic data:
              precision    recall  f1-score   support

           0       0.42      0.07      0.12       456
           1       0.68      0.95      0.79       933

    accuracy                           0.66      1389
   macro avg       0.55      0.51      0.46      1389
weighted avg       0.59      0.66      0.57      1389



In [17]:
# Compare feature importance between models
def plot_feature_importance(model, features, title):
    importance = pd.DataFrame({
        'feature': features,
        'importance': model.feature_importances_
    })
    importance = importance.sort_values('importance', ascending=False)
    
    print(f"\nTop 10 important features - {title}:")
    print(importance.head(10))
    return importance

# Compare feature importance
orig_importance = plot_feature_importance(original_model, X_orig.columns, "Original Data")
synth_importance = plot_feature_importance(synthetic_model, X_synth.columns, "Synthetic Data")

# Compare top features between original and synthetic
print("\nFeature importance correlation:")
merged_importance = orig_importance.merge(
    synth_importance, 
    on='feature', 
    suffixes=('_orig', '_synth')
)
correlation = merged_importance['importance_orig'].corr(merged_importance['importance_synth'])
print(f"Correlation between original and synthetic feature importance: {correlation:.3f}")


Top 10 important features - Original Data:
        feature  importance
6      linea_sf    0.133304
3        exp_sf    0.129917
5       ingreso    0.128818
2      dias_lab    0.119364
7      deuda_sf    0.113507
8         score    0.104375
1          edad    0.097516
0        atraso    0.050934
9    clasif_sbs    0.035574
4  nivel_ahorro    0.035248

Top 10 important features - Synthetic Data:
        feature  importance
3        exp_sf    0.120907
6      linea_sf    0.119050
5       ingreso    0.118960
2      dias_lab    0.118504
7      deuda_sf    0.117225
8         score    0.108447
1          edad    0.089758
0        atraso    0.081823
9    clasif_sbs    0.029447
4  nivel_ahorro    0.027856

Feature importance correlation:
Correlation between original and synthetic feature importance: 0.983


In [18]:
from sdv.single_table import CTGANSynthesizer, TVAESynthesizer

# Function to generate and evaluate synthetic data
def generate_and_evaluate(synthesizer, name):
    print(f"\n=== Using {name} ===")
    
    # Fit and generate
    synthesizer.fit(original_df.drop('data_type', axis=1))
    synthetic_df = synthesizer.sample(num_rows=len(original_df))
    synthetic_df['data_type'] = 'synthetic'
    
    # Prepare data
    X_synth, y_synth = prepare_for_model(synthetic_df)
    
    # Compare distributions
    print("\nTarget Distribution Comparison:")
    print("Original:")
    print(y_orig.value_counts(normalize=True) * 100, "%")
    print("\nSynthetic:")
    print(y_synth.value_counts(normalize=True) * 100, "%")
    
    # Train and evaluate
    print(f"\nTraining model on {name} synthetic data...")
    synthetic_model, _ = train_and_evaluate(X_synth, y_synth, f"synthetic ({name})")
    
    return synthetic_df, synthetic_model

In [19]:

# Try CTGAN
ctgan = CTGANSynthesizer(
    metadata,
    epochs=100,  # Increase for better results
    batch_size=500,
    discriminator_steps=1,
    generator_steps=1
)
ctgan_df, ctgan_model = generate_and_evaluate(ctgan, "CTGAN")

TypeError: CTGANSynthesizer.__init__() got an unexpected keyword argument 'generator_steps'