In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate

from tqdm import tqdm

# Remove pandas' display truncation for columns, rows and cell width so
# frames are rendered in full when inspected.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [2]:
# Number of rows read per chunk while streaming the CSV; adjust as needed.
chunk_size = 100000

# Stream the file in pieces; the literal marker \N is read as a missing value.
chunks = pd.read_csv(
    'E:/Phat/data_electricity.csv',
    na_values='\\N',
    low_memory=False,
    chunksize=chunk_size,
)

# Stitch all streamed chunks back together into a single DataFrame.
data = pd.concat(chunks)

In [3]:
# Discard rows that are exact duplicates across every column, keeping the
# first occurrence of each.
data = data.drop_duplicates(subset=None, keep='first')

In [4]:
# Halve the memory footprint of numeric columns by moving to 32-bit dtypes.

# NOTE(review): float32 carries roughly 7 significant decimal digits; confirm
# that no float column holds large identifier-like values that need more.
for col in data.select_dtypes(include=['float64']).columns:
    data[col] = data[col].astype(np.float32)

# An int64 -> int32 cast silently wraps values that do not fit, so only
# downcast columns whose values all lie inside the int32 range; the others
# are left as int64.
int32_limits = np.iinfo(np.int32)
for col in data.select_dtypes(include=['int64']).columns:
    if data[col].between(int32_limits.min, int32_limits.max).all():
        data[col] = data[col].astype(np.int32)

In [5]:
# Columns whose missing entries are replaced by the explicit "Missing" label.
missing_value_cols = ['ma_ttcto', 'so_cot', 'so_hop', 'ngay_hhluc_vitri_ddo',
                      'ngay_hhluc_khang', 'ngay_hhluc_diemdo', 'ngay_hhluc']

# Assign the filled column back instead of calling fillna(inplace=True) on
# data[col]: the in-place form acts on a temporary selection, which warns on
# recent pandas and leaves `data` unchanged under Copy-on-Write.
for col in missing_value_cols:
    data[col] = data[col].fillna("Missing")

In [5]:
# Columns that are standardized (zero mean, unit variance) one at a time.
# NOTE(review): several names look like identifiers (id_*, ma_*) and the
# scalers are fitted on the full dataset before any train/test split --
# confirm both are intended.
numerical_cols = ['id_chiso','id_bcs','hs_nhan','so_cto_chiso','ky','thang','nam',
                  'chiso_cu','chiso_moi','san_luong','sluong_ttiep','sluong_trphu',
                   'thd_le','sluong_1','sluong_2','sluong_3','ma_cto','so_cto_hso_cto',
                    'ma_cloai','so_pha','id_khang','id_ddo','kimua_cspk','csuat','so_cto_csuat_ddo',
                     'thang_csuat_ddo','gia_tri' ]

# Keep one fitted scaler per column. Re-fitting a single shared scaler in the
# loop would overwrite its mean/scale each time, leaving only the last
# column's parameters available for inverse_transform or for new data.
scalers = {}
for col in numerical_cols:
    scaler = StandardScaler()
    # fit_transform returns an (n_rows, 1) array; flatten it to a 1-D column.
    data[col] = scaler.fit_transform(data[[col]]).ravel()
    scalers[col] = scaler


In [6]:
# Name of the label column (not used in this cell).
target = 'fraud'

# Columns that are replaced by integer category codes.
cat_cols = ['ma_dviqly','ma_dvictren','ma_ddo','bcs_chiso','loai_chiso',
            'ma_ttcto','ngay_dky','ngay_cky','ngay_tao_chiso','nguoi_tao_chiso',
            'ngay_sua_chiso','nguoi_sua_chiso','ma_cnang_chiso','so_cot','so_hop',
            'ma_tram','ngay_hluc_vitri_ddo','ngay_hhluc_vitri_ddo','ngay_bdong',
            'ngay_kdinh','ma_bdong','dong_dien','dien_ap','vh_cong','ten_khang',
            'ma_khang','ngay_hluc_khang','ngay_hhluc_khang','dia_chi','ngay_hluc_diem_do',
            'ngay_hhluc_diemdo','loai_giatri','ngay_hluc','ngay_hhluc']

# NOTE(review): this encoder is instantiated but never fitted or used in this
# cell; the encoding below relies on pandas category codes instead.
oe = OrdinalEncoder()

# Remember each column's categories so the integer codes can be decoded or
# re-applied later: category_maps[col][code] gives back the original value.
# Missing values receive the code -1.
category_maps = {}
for col in cat_cols:
    as_category = data[col].astype('category')
    category_maps[col] = as_category.cat.categories
    data[col] = as_category.cat.codes

In [7]:
# Convert the values of the remaining object columns to strings before the
# export (presumably so mixed-type columns serialize cleanly -- confirm).
# Only non-null cells are converted: a plain astype(str) would turn missing
# values into the literal text 'nan', making them indistinguishable from data.
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype(str).where(data[col].notna())

# Write the frame to Parquet without the pandas index.
data.to_parquet('data_nonprocess.parquet', index=False)
