In [2]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import numpy as np
import pandas as pd

# Load the imputed dataset from CSV.
df = pd.read_csv('imputed_plataformas_otc.csv')

# Split the column labels by dtype: numeric columns on one side, every other
# dtype (categorical/binary) on the other.
tipo_numerico = [np.number]
colunas_numericas = df.select_dtypes(include=tipo_numerico).columns
colunas_categoricas = df.select_dtypes(exclude=tipo_numerico).columns

def gerar_dados_sinteticos_rf(df, num_novos_dados):
    """Generate synthetic rows by predicting every column with a random forest.

    For each column of ``df`` one model is trained to predict that column from
    all the remaining columns: a ``RandomForestRegressor`` for numeric columns
    and a ``RandomForestClassifier`` for every other dtype. Each synthetic
    value is the model's prediction for a single row sampled at random from
    ``df``. A fresh row is sampled for every column, so the values inside one
    synthetic row are predicted from different source rows.

    Args:
        df: Source ``pandas.DataFrame``. Every column is used both as a target
            (once) and as a feature for the other columns.
        num_novos_dados: Number of synthetic rows to produce. Zero or a
            negative number yields an empty frame.

    Returns:
        A ``pandas.DataFrame`` with ``num_novos_dados`` rows and the same
        columns, in the same order, as ``df``.

    Notes:
        Row sampling is driven by the global ``np.random`` state, so the
        output is only reproducible if the caller seeds it beforehand.
        NOTE(review): feature columns are passed to the estimators unencoded;
        a frame with non-numeric feature columns is presumably rejected by the
        estimators - confirm before using this on mixed-dtype data.
    """
    if num_novos_dados <= 0:
        # Nothing to generate: skip the model fitting entirely.
        return pd.DataFrame(columns=df.columns)

    # Derive the column groups from the frame that was actually passed in,
    # rather than from module-level variables computed on some other frame.
    colunas_num = list(df.select_dtypes(include=[np.number]).columns)
    colunas_cat = list(df.select_dtypes(exclude=[np.number]).columns)
    alvos = [(coluna, RandomForestRegressor) for coluna in colunas_num]
    alvos += [(coluna, RandomForestClassifier) for coluna in colunas_cat]

    # Fit each column's model exactly once. With a fixed random_state and the
    # same training data, refitting for every synthetic row would rebuild the
    # same forest each time.
    modelos = []
    for coluna_alvo, classe_modelo in alvos:
        X = df.drop(columns=[coluna_alvo])
        rf_model = classe_modelo(n_estimators=100, random_state=42)
        rf_model.fit(X, df[coluna_alvo])
        modelos.append((coluna_alvo, X, rf_model))

    registros = []
    for _ in range(num_novos_dados):
        # Key each prediction by its column name so that it lands under the
        # right header even when numeric and non-numeric columns are
        # interleaved in ``df``.
        registro = {}
        for coluna_alvo, X, rf_model in modelos:
            amostra_aleatoria = X.sample(n=1, random_state=np.random.randint(10000))
            registro[coluna_alvo] = rf_model.predict(amostra_aleatoria)[0]
        registros.append(registro)

    # Build the frame in one shot instead of growing it row by row.
    return pd.DataFrame(registros, columns=df.columns)



# Produce 1000 synthetic rows from the loaded frame, then persist them as CSV
# without the row index.
# NOTE(review): the generator draws from the global np.random state and no seed
# is set in the visible cells, so the saved file presumably differs between
# runs - confirm whether that is intended.
df_novos_dados_rf = gerar_dados_sinteticos_rf(df, num_novos_dados=1000)
df_novos_dados_rf.to_csv('dados_sinteticos_rf.csv', index=False)


In [3]:
import pandas as pd
from sdmetrics.reports.single_table import QualityReport
from sdv.metadata import SingleTableMetadata

# Load the original dataset and the generated synthetic dataset, in that order.
df_original, df_sintetico = map(
    pd.read_csv,
    ('imputed_plataformas_otc.csv', 'dados_sinteticos_rf.csv'),
)

# Column metadata passed to the quality report. Every column is declared
# numerical; the only per-column difference is the "pii" flag, listed here
# next to each column name in the order the columns are registered.
# NOTE(review): "pii" is True for measurement-like columns such as
# "weight (t)" - confirm that this is intended.
_colunas_e_pii = [
    ("water_depth (m)", False),
    ("weight (t)", True),
    ("installation_date", False),
    ("type_of_production (1 oil and gas; 2 oil; 3 gas)", False),
    ("number_of_legs", False),
    ("number_of_piles", False),
    ("height_of_jacket_or_sub-structure (m)", False),
    ("distance_to_coast (km)", True),
    ("energy_consumption-complete (GJ)", True),
    ("energy_consumption-partial (GJ)", True),
    ("emissions-complete (t)", True),
    ("emissions-partial (t)", True),
    ("recommended (1 partial; 2 complete)", True),
]

metadata_dict = {
    "columns": {
        nome_coluna: {"type": "numerical", "pii": eh_pii, "sdtype": "numerical"}
        for nome_coluna, eh_pii in _colunas_e_pii
    },
    "primary_key": "water_depth (m)",
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
}

# Build the quality report comparing the real data with the synthetic data.
report = QualityReport()
report.generate(
    real_data=df_original,
    synthetic_data=df_sintetico,
    metadata=metadata_dict,
)

# Print the per-property breakdown for each of the two report properties.
for nome_propriedade in ('Column Shapes', 'Column Pair Trends'):
    print(report.get_details(property_name=nome_propriedade))


Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 13/13 [00:00<?, ?it/s]|
Column Shapes Score: 73.07%

(2/2) Evaluating Column Pair Trends: |██████████| 78/78 [00:00<00:00, 183.22it/s]|
Column Pair Trends Score: 72.94%

Overall Score (Average): 73.0%

                                              Column        Metric   Score
0                                    water_depth (m)  KSComplement  0.8125
1                                         weight (t)  KSComplement  0.7745
2                                  installation_date  KSComplement  0.8125
3   type_of_production (1 oil and gas; 2 oil; 3 gas)  KSComplement  0.4375
4                                     number_of_legs  KSComplement  0.6250
5                                    number_of_piles  KSComplement  0.5000
6              height_of_jacket_or_sub-structure (m)  KSComplement  0.7995
7                             distance_to_coast (km)  KSComplement  0.8125
8                   energy_consumption-complete (GJ)  K