In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

In [2]:
# Collect the CSV files in the dataset directory. os.listdir() returns entries
# in arbitrary order, so the names are sorted to keep every later loop, dict
# and printed report deterministic from one run (or machine) to the next.
csv_files = sorted(f for f in os.listdir('class_imbalance') if f.endswith('.csv'))

# Total number of missing cells per file, keyed by file name.
missing_values_counts = {}

for csv_file in csv_files:
    file_path = os.path.join('class_imbalance', csv_file)
    df = pd.read_csv(file_path)
    # The first sum() counts NaNs per column, the second totals them for the
    # file. Cast to a plain int so the dict displays as ordinary numbers
    # rather than NumPy scalar reprs.
    missing_values_counts[csv_file] = int(df.isnull().sum().sum())

# Report the count for every file, including files with nothing missing.
for file, count in missing_values_counts.items():
    print(f"{file}: {count} missing values")


In [3]:
# Keep only the files whose total missing-value count is above zero.
files_with_missing_values = dict(
    (name, total)
    for name, total in missing_values_counts.items()
    if total > 0
)
files_with_missing_values

{}

In [4]:
def _is_whole_number_float(column):
    """Return a truthy value when a float64 column holds only whole numbers.

    Non-float64 columns yield False. Missing values are ignored; the remaining
    values are compared against their integer-cast counterparts.
    """
    if column.dtype != 'float64':
        return False
    present = column.dropna()
    return (present == present.astype(int)).all()


# Per-file list of columns treated as categorical: object columns, int64
# columns, and float64 columns whose non-null values are all whole numbers.
categorical_columns = {}

for name in csv_files:
    table = pd.read_csv(os.path.join('class_imbalance', name))

    # Columns with no data at all are discarded before inspecting dtypes.
    table = table.dropna(axis=1, how='all')

    selected = []
    for label in table.columns:
        series = table[label]
        is_object_or_int = series.dtype == 'object' or series.dtype == 'int64'
        if is_object_or_int or _is_whole_number_float(series):
            selected.append(label)

    categorical_columns[name] = selected

categorical_columns

{}

In [5]:
def encode_strings(df):
    """Label-encode every object-dtype column of ``df`` as integer codes.

    Missing values are kept as NaN. They are no longer stringified to
    ``'nan'`` and encoded as a category of their own, so a subsequent imputer
    still sees them as missing and can fill them in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place and also returned.

    Returns
    -------
    tuple
        ``(df, label_encoders)`` where ``label_encoders`` maps each encoded
        column name to the ``LabelEncoder`` fitted on its non-null values.
        A column with no non-null values gets no encoder.
    """
    label_encoders = {}
    for col in df.columns:
        if df[col].dtype != 'object':
            continue

        values = df[col]
        present = values.notna().to_numpy()

        if not present.any():
            # Nothing to fit an encoder on; keep the column numeric and empty.
            df[col] = np.full(len(values), np.nan)
            continue

        le = LabelEncoder()
        codes = le.fit_transform(values[present].astype(str))

        if present.all():
            # No gaps: keep the integer codes exactly as produced.
            df[col] = codes
        else:
            # Gaps present: a float column is needed so NaN can mark them.
            encoded = np.full(len(values), np.nan)
            encoded[present] = codes
            df[col] = encoded

        label_encoders[col] = le
    return df, label_encoders

# Apply KNN imputation to each file
imputed_data = {}

# Fitted encoders per file, then per column. They are kept instead of being
# overwritten on every iteration, so the integer codes in the imputed frames
# can still be mapped back to the original strings later on.
label_encoders_by_file = {}

for csv_file in csv_files:
    file_path = os.path.join('class_imbalance', csv_file)
    df = pd.read_csv(file_path)

    # Drop columns that are entirely NaN
    df = df.dropna(axis=1, how='all')

    # Encode string columns to integers and remember how they were encoded
    df, label_encoders = encode_strings(df)
    label_encoders_by_file[csv_file] = label_encoders

    # Apply KNN imputation; rebuild a frame with the original column names
    # and row index around the returned array
    imputer = KNNImputer(n_neighbors=5)
    df_imputed = pd.DataFrame(
        imputer.fit_transform(df),
        columns=df.columns,
        index=df.index,
    )

    # Save the imputed dataframe
    imputed_data[csv_file] = df_imputed

# Display the imputed data for verification
imputed_data

{}

In [6]:
# Round every column listed in categorical_columns to the nearest whole number
# and cast it to int, updating the frames stored in imputed_data in place.
for name, flagged_cols in categorical_columns.items():
    frame = imputed_data[name]
    for label in flagged_cols:
        rounded = frame[label].round()
        frame[label] = rounded.astype(int)

imputed_data

{}

In [7]:
# Destination directory for the processed files; created only when absent.
output_folder = 'fixed_datasets'
os.makedirs(output_folder, exist_ok=True)

# Write each imputed frame under its original file name, without the index.
for file_name in imputed_data:
    destination = os.path.join(output_folder, file_name)
    imputed_data[file_name].to_csv(destination, index=False)

print(f"All files have been saved to the '{output_folder}' folder.")

All files have been saved to the 'fixed_datasets' folder.
