In [1]:
import pandas as pd
import numpy as np
import os
import warnings

In [2]:
# Inject `size` synthetic data-quality errors (missing / incompatible /
# out-of-range / incorrect values) into randomly chosen cells of a DataFrame.

# (low, high) bounds passed to np.random.uniform for the 'out_of_range' error,
# keyed by column name. Columns not listed here are left untouched by that
# error type.
# NOTE(review): presumably these bounds fall outside each feature's normal
# range in the wine-quality data -- confirm against the dataset.
_OUT_OF_RANGE_BOUNDS = {
    'alcohol': (0, 8),
    'fixed acidity': (16, 20),
    'pH': (8, 10),
    'volatile acidity': (0, 2),
    'residual sugar': (16, 30),
    'chlorides': (2, 4),
}


def _set_cell(data: pd.DataFrame, row, column, value) -> None:
    """Write ``value`` into one cell, widening the column dtype first if needed.

    Casting explicitly avoids relying on pandas' implicit upcasting during
    item assignment, which is deprecated (PDEP-6).
    """
    dtype = data[column].dtype
    if isinstance(value, str):
        # A string cannot live in a numeric column; store the column as object.
        if pd.api.types.is_numeric_dtype(dtype):
            data[column] = data[column].astype(object)
    elif pd.api.types.is_integer_dtype(dtype):
        # NaN and non-integral floats need a float column.
        data[column] = data[column].astype(float)
    data.loc[row, column] = value


def add_errors(data: pd.DataFrame, size: int) -> pd.DataFrame:
    """Corrupt ``size`` randomly chosen cells of ``data`` and return it.

    For each of ``size`` random rows (drawn with replacement, so a row may be
    hit more than once) a random column and a random error type are picked:

    * ``'missing'``      -- the cell is set to NaN.
    * ``'incompatible'`` -- the cell is set to the string ``'IncompatibleType'``.
    * ``'out_of_range'`` -- for the columns listed in ``_OUT_OF_RANGE_BOUNDS``
      the cell is set to a uniform draw from that column's bounds; other
      columns are left unchanged.
    * ``'incorrect'``    -- the cell is set to a draw from a normal
      distribution with standard deviation 1 and a mean drawn uniformly
      from [-2, 0).

    Randomness comes from NumPy's global ``np.random`` state, so call
    ``np.random.seed`` beforehand for reproducible output.

    Args:
        data: Frame to corrupt. It is modified in place; pass a copy to keep
            the original intact.
        size: Number of corruptions to attempt.

    Returns:
        The same ``data`` object, after modification.

    Note:
        Rows are chosen by position and then addressed through their index
        label, so the frame's index labels are assumed to be unique.
    """
    if data.empty:
        # No rows or no columns to corrupt.
        print(data.isnull().sum().sum(), 'null values created in data')
        return data

    # randint's upper bound is exclusive, so len(data) makes every row eligible.
    row_positions = np.random.randint(0, len(data), size)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        for position in row_positions:
            # Translate the position into the frame's own label so frames
            # without a 0..n-1 index are corrupted rather than enlarged.
            row = data.index[position]
            column = np.random.choice(data.columns)
            error_type = np.random.choice(['missing', 'incompatible', 'out_of_range', 'incorrect'])

            if error_type == 'missing':
                _set_cell(data, row, column, np.nan)

            elif error_type == 'incompatible':
                _set_cell(data, row, column, 'IncompatibleType')

            elif error_type == 'out_of_range':
                bounds = _OUT_OF_RANGE_BOUNDS.get(column)
                if bounds is not None:
                    _set_cell(data, row, column, np.random.uniform(*bounds))

            elif error_type == 'incorrect':
                mean_value = np.random.uniform(-2, 0)
                _set_cell(data, row, column, np.random.normal(mean_value, 1))

    # Counts every null in the frame, including any present before the call.
    print(data.isnull().sum().sum(), 'null values created in data')
    return data

In [3]:
# Randomly partition a DataFrame into `num_files` CSV chunks and save them.
def split_and_save_data(data: pd.DataFrame, num_files: int,
                        output_dir: str = '../raw_data') -> None:
    """Shuffle ``data`` and write it out as up to ``num_files`` CSV files.

    Every row is written to exactly one file. Chunk sizes differ by at most
    one row: the first ``len(data) % num_files`` chunks get the extra row.
    When ``num_files`` exceeds the number of rows, only ``len(data)`` files
    (one row each) are written.

    Files are named ``data_chunk_<k>.csv``. Numbering starts at the number of
    entries already in ``output_dir`` plus one, and skips any number whose
    file already exists so that existing chunks are never overwritten.

    Args:
        data: Frame to split. It is not modified.
        num_files: Maximum number of files to create; must be at least 1.
        output_dir: Directory to write into; created if it does not exist.

    Raises:
        ValueError: If ``num_files`` is smaller than 1.
    """
    if num_files < 1:
        raise ValueError(f'num_files must be at least 1, got {num_files}')

    os.makedirs(output_dir, exist_ok=True)

    total_rows = len(data)
    rows_per_file, remaining_rows = divmod(total_rows, num_files)

    # Shuffle once and slice by position. Removing sampled rows by index
    # label would drop extra rows whenever labels are duplicated.
    shuffled = data.sample(frac=1)

    next_number = len(os.listdir(output_dir)) + 1
    start = 0
    for file_no in range(num_files):
        chunk_size = rows_per_file + (1 if file_no < remaining_rows else 0)
        if chunk_size == 0:
            # All rows have been written (or there were none to begin with).
            break

        chunk = shuffled.iloc[start:start + chunk_size]
        start += chunk_size

        # Skip numbers that are already taken instead of overwriting them.
        path = os.path.join(output_dir, f'data_chunk_{next_number}.csv')
        while os.path.exists(path):
            next_number += 1
            path = os.path.join(output_dir, f'data_chunk_{next_number}.csv')

        chunk.to_csv(path, index=False)
        next_number += 1

    print("DataFrame is now empty. Exiting...")

In [4]:
# Number of CSV chunk files to create.
num_files = 10

# Load the source dataset.
df = pd.read_csv('../data/winequality-red.csv')

# Inject 200 synthetic errors into a copy so `df` itself stays untouched,
# then write the corrupted rows out as `num_files` chunk files.
df_with_errors = add_errors(df.copy(), size=200)
split_and_save_data(df_with_errors, num_files=num_files)

38 null values created in data
DataFrame is now empty. Exiting...
