In [1]:
import pandas as pd
import numpy as np
import os
import warnings
import random

In [2]:
# Function to select [size] random rows and set a random feature on them to null
def add_errors(data: pd.DataFrame, size: int) -> pd.DataFrame:
    indexes = np.random.randint(0, len(data) - 1, size)
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)    
        columns = ['alcohol', 'pH', 'chlorides'] 
        for i in indexes:
            column = np.random.choice(columns)
            error_type = np.random.choice(['missing', 'out_of_range'])  
            
            if error_type == 'missing':
                data.loc[i, column] = np.NaN
            
            elif error_type == 'out_of_range': 
                if column == 'alcohol':
                    data.loc[i, column] = np.random.uniform(15, 40)
                elif column == 'pH':
                    data.loc[i, column] = np.random.uniform(8, 10)
                elif column == 'chlorides':
                    data.loc[i, column] = np.random.uniform(2, 4)      

    print(data.isnull().sum().sum(), 'null values created in data')
    return data


In [3]:
# Function to introduce missing columns in randomly selected files
def introduce_missing_columns(num_files: int) -> None:
    raw_data_files = os.listdir("../raw_data")
    files_to_modify = np.random.choice(raw_data_files, num_files, replace=False)
    
    columns = ['fixed acidity', 'volatile acidity']
    
    for file in files_to_modify:
        df = pd.read_csv(f'../raw_data/{file}')
        columns_to_drop = random.sample(columns, random.choice([1,2]))
        print(f"Dropping columns {columns_to_drop} from file {file}")
        df.drop(columns_to_drop, axis=1, inplace=True)
        df.to_csv(f'../raw_data/{file}', index=False)


In [4]:
# Function to split one file into three smaller files
def split_one_file_into_four() -> None:
    raw_data_files = os.listdir("../raw_data")
    files = np.random.choice(raw_data_files, 2)
    files_count = len(raw_data_files)
    for file_to_split in files:
        df = pd.read_csv(f'../raw_data/{file_to_split}')
        chunks = np.array_split(df, 4)
        print(f"Splitting file {file_to_split} into 4 files.")
        for _, chunk in enumerate(chunks):
            chunk.to_csv(f'../raw_data/data_chunk_{files_count + 1}.csv', index=False)
            files_count+=1

In [5]:
def mix_columns(num_files:int) -> None:
    raw_files = os.listdir("../raw_data")
    files_to_modify = np.random.choice(raw_files, num_files, replace=False)
    column = 'fixed acidity'
    for file in files_to_modify:
        df = pd.read_csv(f'../raw_data/{file}')
        colum_switch = df[column]
        df.drop(column, axis=1, inplace=True)
        df[column] = colum_switch
        df.to_csv(f'../raw_data/{file}', index=False)

In [6]:
# Function to split and save data
def split_and_save_data(data: pd.DataFrame, num_files: int) -> None:
    total_rows = len(data)
    rows_per_file = total_rows // num_files
    remaining_rows = total_rows % num_files
    
    for _ in range(1, num_files + 1):
        if remaining_rows > 0:
            chunk_size = rows_per_file + 1
            remaining_rows -= 1
        else:
            chunk_size = rows_per_file
        
        chunk = data.sample(n=min(chunk_size, len(data)))
        filename = f'data_chunk_{len(os.listdir("../raw_data")) + 1}.csv'
        
        chunk.to_csv(f'../raw_data/{filename}', index=False)
        data = data.drop(chunk.index)
        
        if data.empty:
            print("DataFrame is now empty. Exiting...")
            break


In [8]:
df = pd.read_csv('../data/winequality-red.csv')
df = df.reset_index()

# Introduce NaN and out of range errors
df_with_errors = add_errors(df.copy(), 400)

num_files = 200  # Specify the number of files to create
split_and_save_data(df_with_errors, num_files)

introduce_missing_columns(20)  # Introduce missing columns in 5 random files
split_one_file_into_four()

201 null values created in data
DataFrame is now empty. Exiting...
Dropping columns ['volatile acidity'] from file data_chunk_190.csv
Dropping columns ['volatile acidity', 'fixed acidity'] from file data_chunk_33.csv
Dropping columns ['volatile acidity', 'fixed acidity'] from file data_chunk_49.csv
Dropping columns ['fixed acidity', 'volatile acidity'] from file data_chunk_102.csv
Dropping columns ['volatile acidity'] from file data_chunk_148.csv
Dropping columns ['volatile acidity', 'fixed acidity'] from file data_chunk_85.csv
Dropping columns ['fixed acidity'] from file data_chunk_3.csv
Dropping columns ['fixed acidity', 'volatile acidity'] from file data_chunk_175.csv
Dropping columns ['fixed acidity', 'volatile acidity'] from file data_chunk_70.csv
Dropping columns ['volatile acidity'] from file data_chunk_174.csv
Dropping columns ['volatile acidity'] from file data_chunk_131.csv
Dropping columns ['fixed acidity'] from file data_chunk_188.csv
Dropping columns ['fixed acidity', 'vol

  return bound(*args, **kwds)
  return bound(*args, **kwds)
