In [48]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine 
import configparser 
import datetime

from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import Metadata
import pickle
import sdv

In [36]:
# Record the installed SDV version next to the results (the output below shows 1.17.1).
print(sdv.__version__)


1.17.1


In [49]:
# Load the database connection URL from the local config file.

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so fail here with a clear message instead of a
# confusing NoSectionError on the next line.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini was not found or could not be read from the working directory")
conn_string = config.get('DATABASE', 'connection_url')

In [None]:
#Чтение из БД

In [50]:
# Source table with the training data, and the query that selects every row of it.
table = 'house_prices_train'
query = f" \nselect * from {table}\n"
def reading_from_db(query,table):
    """Run ``query`` against the configured database and return the rows.

    Parameters
    ----------
    query : str
        SQL text to execute.
    table : str
        Not used by the function (the query already names its table); kept so
        existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The result set of ``query``.

    Notes
    -----
    Reads the module-level ``conn_string``. A new engine is created per call,
    so it is disposed before returning to release its connection pool.
    """
    db = create_engine(conn_string)
    try:
        with db.connect() as conn:
            return pd.read_sql(query, con=conn)
    finally:
        # Release pooled connections held by this short-lived engine.
        db.dispose()


In [51]:
# Load the whole training table (the query is a plain `select *`) into a DataFrame.
df = reading_from_db(query,table)

In [52]:
# Preview the first five rows of the loaded data.
df.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,dt
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,,,0,2,2008,WD,Normal,208500,2024-10-15 11:09:15
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2007,WD,Normal,181500,2024-10-15 11:09:15
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,,,0,9,2008,WD,Normal,223500,2024-10-15 11:09:15
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,,,0,2,2006,WD,Abnorml,140000,2024-10-15 11:09:15
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,,,0,12,2008,WD,Normal,250000,2024-10-15 11:09:15


In [None]:
#Генерация

In [53]:
# Let SDV auto-detect the column types from the loaded frame.
# NOTE(review): in the sample displayed below, Id holds large random integers and
# MSSubClass takes values such as 185 that do not appear in the preview of the
# real data — confirm the detected column types are what you want.
metadata = Metadata.detect_from_dataframe(
    data=df,
    table_name='house_prices')

# Fit a Gaussian copula synthesizer on the real rows.
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(df)

# Draw a small sample to inspect the result.
synthetic_data = synthesizer.sample(num_rows=10)



In [54]:
# Display the generated rows.
synthetic_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,dt
0,982268879,185,RL,69.0,11584,Pave,,Reg,Lvl,AllPub,...,,,,515,4,2007,New,Partial,275890,2024-10-15 11:09:15
1,804069452,108,RL,101.0,8706,Pave,,IR3,Lvl,AllPub,...,,,,575,4,2009,WD,Partial,246788,2024-10-15 11:09:15
2,839063981,48,RL,,11230,Pave,,IR1,Lvl,AllPub,...,,,,2064,8,2009,WD,Normal,225499,2024-10-15 11:09:15
3,82149587,124,RL,71.0,6560,Pave,,Reg,Lvl,AllPub,...,,,,8,9,2006,WD,Partial,164671,2024-10-15 11:09:15
4,204237279,93,RL,91.0,14465,Pave,,Reg,Lvl,AllPub,...,,MnPrv,,7,8,2009,WD,Normal,83705,2024-10-15 11:09:15
5,975382569,184,RL,,1595,Pave,,IR1,Lvl,AllPub,...,,,,3303,4,2007,WD,Normal,91521,2024-10-15 11:09:15
6,952095692,33,RL,,11269,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2006,WD,Normal,116355,2024-10-15 11:09:15
7,152067842,143,RL,44.0,8909,Pave,,Reg,Lvl,AllPub,...,,MnPrv,,52,8,2009,WD,Normal,287552,2024-10-15 11:09:15
8,536692407,64,RL,54.0,8696,Pave,,IR1,Lvl,AllPub,...,,,,6185,7,2008,WD,Abnorml,244868,2024-10-15 11:09:15
9,86284612,98,RL,95.0,11287,Pave,,Reg,Lvl,AllPub,...,,,,3695,6,2008,WD,Normal,158684,2024-10-15 11:09:15


In [55]:
# Save the fitted model used to generate random data.

with open("my_synthesizer.pkl", "wb") as f:
    pickle.dump(synthesizer, f)

In [11]:
# Stamp the generated rows with the current local time, formatted as a string.
# NOTE(review): this mutates synthetic_data in place, and the cell's execution
# count (11) is lower than that of the cells above it — re-check the order on a fresh kernel.
synthetic_data['dt'] = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
# Workaround for a bug with training: force the 3SsnPorch column to 0.
synthetic_data['3SsnPorch'] = 0

In [56]:
# Reload the synthesizer from disk. pickle.load can execute arbitrary code, so
# only load files that come from a trusted source.
with open('my_synthesizer.pkl','rb') as f:
    synthesizer = pickle.load(f)

In [64]:
# Sample ten rows from the reloaded synthesizer and display them.
synthesizer.sample(num_rows=10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,dt
0,738315232,72,RL,35.0,8376,Pave,,IR1,Lvl,AllPub,...,,,,1524,9,2007,WD,Normal,61173,2024-10-15 11:09:15
1,616422007,131,RL,86.0,12979,Pave,,Reg,Lvl,AllPub,...,,,,16,4,2007,WD,Normal,94036,2024-10-15 11:09:15
2,801989404,172,RL,110.0,12015,Pave,,Reg,Lvl,AllPub,...,,GdPrv,,6123,8,2009,WD,Partial,250879,2024-10-15 11:09:15
3,89341431,99,RL,127.0,27976,Pave,,Reg,Lvl,AllPub,...,,,,4406,6,2007,WD,Normal,363726,2024-10-15 11:09:15
4,126663797,36,FV,61.0,7756,Pave,,Reg,Lvl,AllPub,...,,,,2811,8,2007,WD,Normal,277360,2024-10-15 11:09:15
5,864301360,188,RL,83.0,10903,Pave,,IR1,Lvl,AllPub,...,,,,3907,7,2006,WD,Normal,110215,2024-10-15 11:09:15
6,225627220,69,RL,65.0,4438,Pave,,Reg,Lvl,AllPub,...,,,Shed,8348,11,2008,New,Abnorml,61256,2024-10-15 11:09:15
7,298057445,61,RL,107.0,16571,Pave,,IR1,Lvl,AllPub,...,,,,4748,11,2006,WD,Normal,383649,2024-10-15 11:09:15
8,962873768,181,RL,58.0,10459,Pave,,Reg,Lvl,AllPub,...,,,,412,3,2009,WD,Normal,258279,2024-10-15 11:09:15
9,873711460,53,RL,76.0,26696,Pave,,Reg,Lvl,AllPub,...,,,,2572,7,2006,WD,Normal,81644,2024-10-15 11:09:15


In [None]:
#Запись в БД

In [14]:
# Load the database connection URL from the local config file
# (repeats the config cell near the top of the notebook).

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so fail here with a clear message instead of a
# confusing NoSectionError on the next line.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini was not found or could not be read from the working directory")
conn_string = config.get('DATABASE', 'connection_url')

In [10]:
# Helper that writes a DataFrame to the database.

def upload_to_db(df_in,db_table_out):
    """Append the rows of ``df_in`` to the table ``db_table_out``.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Rows to write; the DataFrame index is not written.
    db_table_out : str
        Destination table name. Rows are appended if the table already exists.

    Notes
    -----
    Reads the module-level ``conn_string``. Prints ``Uploaded`` and then ``OK``
    on success. If the write fails the exception propagates and ``OK`` is not
    printed, so the output can no longer report success for a failed upload.
    """
    db = create_engine(conn_string)
    try:
        # begin() wraps the write in a transaction: committed when the block
        # exits normally, rolled back if to_sql raises.
        with db.begin() as conn:
            df_in.to_sql(db_table_out, con=conn, if_exists='append',index=False)
        print('Uploaded')
    finally:
        # Release pooled connections held by this short-lived engine.
        db.dispose()
    # Only reached when the upload succeeded.
    print('OK')


In [13]:
# Append the generated rows to the generator table.
df_in = synthetic_data
db_table_out = 'house_prices_generator'

upload_to_db(df_in,db_table_out)

Uploaded
OK


In [94]:
import pandas as pd
import numpy as np
import configparser
from sqlalchemy import create_engine  
import datetime
import pickle
from sdv.single_table import GaussianCopulaSynthesizer

config = configparser.ConfigParser()
config_path = '/home/aleksey/Notebooks_Projects/House-Prices-Airflow-Superset/Training Models And Test/config.ini'
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so fail here with a clear message instead of a
# confusing NoSectionError on the next line.
if not config.read(config_path):
    raise FileNotFoundError(f'Config file was not found or could not be read: {config_path}')
conn_string = config.get('DATABASE', 'connection_url')


# Table that receives the newly generated data

new_data_from_generator = 'house_prices_generator'

filepath = '/home/aleksey/Notebooks_Projects/House-Prices-Airflow-Superset/Training Models And Test/my_synthesizer.pkl'

# Load the pickled synthesizer. pickle.load can execute arbitrary code, so only
# point `filepath` at a file from a trusted source.
with open(filepath,'rb') as f:
    synthesizer = pickle.load(f)

def upload_generator_data(new_data_from_generator, num_rows=10):
    """Sample synthetic rows, patch them, and return the resulting DataFrame.

    Despite its name, this function does not write to the database: the
    ``to_sql`` call was already disabled in the previous version, so the
    generated frame is only returned to the caller.

    Parameters
    ----------
    new_data_from_generator : str
        Name of the destination table. Currently unused because the upload is
        disabled; kept so existing calls keep working.
    num_rows : int, default 10
        Number of synthetic rows to sample.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with ``3SsnPorch`` set to 0 and ``dt`` set to the
        current local time as a ``'%Y-%m-%d %H:%M:%S'`` string.

    Notes
    -----
    Uses the module-level ``synthesizer``.
    """
    # Generate random data.
    synthetic_data = synthesizer.sample(num_rows=num_rows)

    # Workaround for a bug with training: force the 3SsnPorch column to 0.
    synthetic_data['3SsnPorch'] = 0

    # Replace the sampled timestamp with the generation time.
    synthetic_data['dt'] = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')

    # TODO(review): the upload step is disabled. The previous version kept it as
    # an unevaluated string literal that would have written `synthetic_data` to
    # `new_data_from_generator` with if_exists='replace'. Re-enable it
    # deliberately once the intended write mode is confirmed.
    return synthetic_data

In [101]:
# Generate a batch and display it (the function returns the DataFrame).
upload_generator_data(new_data_from_generator)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,dt
0,144684934,179,RL,31.0,2567,Pave,,Reg,HLS,AllPub,...,,,,4500,5,2007,New,Partial,67012,2024-10-15 18:21:31
1,822714017,110,RM,47.0,2641,Pave,,Reg,Lvl,AllPub,...,,,,11,9,2008,WD,Normal,99639,2024-10-15 18:21:31
2,387499165,104,RL,66.0,13283,Pave,,Reg,Lvl,AllPub,...,,GdPrv,,8376,8,2009,WD,Normal,224411,2024-10-15 18:21:31
3,819763345,153,RL,74.0,7898,Pave,,Reg,Lvl,AllPub,...,,,,2193,8,2008,WD,Normal,305406,2024-10-15 18:21:31
4,183641747,48,RL,87.0,9172,Pave,,Reg,Lvl,AllPub,...,,GdPrv,,67,4,2007,New,Normal,126326,2024-10-15 18:21:31
5,406675973,94,RL,48.0,8424,Pave,,Reg,Lvl,AllPub,...,,MnPrv,,456,7,2009,WD,Normal,109182,2024-10-15 18:21:31
6,482909667,92,RL,97.0,10513,Pave,,Reg,Lvl,AllPub,...,,,,1615,2,2007,New,Normal,281263,2024-10-15 18:21:31
7,940509559,51,RL,112.0,20655,Pave,,Reg,Lvl,AllPub,...,,,,1,4,2007,WD,Normal,197157,2024-10-15 18:21:31
8,121427097,151,FV,,6577,Pave,,IR1,Lvl,AllPub,...,,,,2009,9,2006,WD,Normal,82311,2024-10-15 18:21:31
9,143302168,138,RL,79.0,8908,Pave,,Reg,Lvl,AllPub,...,,,Shed,1361,8,2007,WD,Normal,185632,2024-10-15 18:21:31
