In [8]:
import os
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

In [9]:
# List all CSV files in the directory.
# os.listdir() returns entries in arbitrary, platform-dependent order, so the
# names are sorted to keep this report (and every later loop over csv_files)
# reproducible from run to run.
csv_files = sorted(f for f in os.listdir('class_imbalance') if f.endswith('.csv'))

# Total number of missing cells per file, keyed by file name
missing_values_counts = {}

# Iterate through each CSV file
for csv_file in csv_files:
    file_path = os.path.join('class_imbalance', csv_file)
    df = pd.read_csv(file_path)
    # The first .sum() counts nulls per column, the second adds the columns up.
    # int() turns the NumPy scalar into a plain Python int so the dictionary
    # displays as ordinary numbers regardless of the NumPy version.
    missing_values_counts[csv_file] = int(df.isnull().sum().sum())

# Display the missing values count for each file
for file, count in missing_values_counts.items():
    print(f"{file}: {count} missing values")


dataset_1000_hypothyroid.csv: 6064 missing values
dataset_1002_ipums_la_98-small.csv: 32427 missing values
dataset_1004_synthetic_control.csv: 0 missing values
dataset_1013_analcatdata_challenger.csv: 0 missing values
dataset_1014_analcatdata_dmft.csv: 0 missing values
dataset_1016_vowel.csv: 0 missing values
dataset_1018_ipums_la_99-small.csv: 34843 missing values
dataset_1020_mfeat-karhunen.csv: 0 missing values
dataset_1021_page-blocks.csv: 0 missing values
dataset_1022_mfeat-pixel.csv: 0 missing values
dataset_1023_soybean.csv: 2337 missing values
dataset_1039_hiva_agnostic.csv: 0 missing values
dataset_1045_kc1-top5.csv: 0 missing values
dataset_1049_pc4.csv: 0 missing values
dataset_1050_pc3.csv: 0 missing values
dataset_1056_mc1.csv: 0 missing values
dataset_1059_ar1.csv: 0 missing values
dataset_1061_ar4.csv: 0 missing values
dataset_1064_ar6.csv: 0 missing values
dataset_1065_kc3.csv: 0 missing values
dataset_311_oil_spill.csv: 0 missing values
dataset_312_scene.csv: 0 missing

In [10]:
# Keep only the files that contain at least one missing cell
files_with_missing_values = {}
for name, n_missing in missing_values_counts.items():
    if n_missing > 0:
        files_with_missing_values[name] = n_missing

files_with_missing_values

{'dataset_1000_hypothyroid.csv': 6064,
 'dataset_1002_ipums_la_98-small.csv': 32427,
 'dataset_1018_ipums_la_99-small.csv': 34843,
 'dataset_1023_soybean.csv': 2337,
 'dataset_38_sick.csv': 6064,
 'dataset_757_meta.csv': 504,
 'dataset_940_water-treatment.csv': 542,
 'dataset_966_analcatdata_halloffame.csv': 20,
 'dataset_968_analcatdata_birthday.csv': 30,
 'dataset_984_analcatdata_draft.csv': 1}

In [11]:
# For every file, record the columns treated as categorical: object columns,
# int64 columns, and float64 columns whose non-missing values are all whole numbers.
categorical_columns = {}

for csv_file in csv_files:
    df = pd.read_csv(os.path.join('class_imbalance', csv_file))

    # Columns with no observed value at all are discarded up front
    df = df.dropna(axis=1, how='all')

    selected = []
    for name in df.columns:
        series = df[name]
        kind = series.dtype
        if kind == 'object' or kind == 'int64':
            selected.append(name)
        elif kind == 'float64':
            # A float column counts as categorical when casting its observed
            # values to int leaves every one of them unchanged
            observed = series.dropna()
            if (observed == observed.astype(int)).all():
                selected.append(name)

    categorical_columns[csv_file] = selected

categorical_columns

{'dataset_1000_hypothyroid.csv': ['age',
  'sex',
  'on thyroxine',
  'query on thyroxine',
  'on antithyroid medication',
  'sick',
  'pregnant',
  'thyroid surgery',
  'I131 treatment',
  'query hypothyroid',
  'query hyperthyroid',
  'lithium',
  'goitre',
  'tumor',
  'hypopituitary',
  'psych',
  'TSH measured',
  'T3 measured',
  'TT4 measured',
  'T4U measured',
  'FTI measured',
  'TBG measured',
  'referral source',
  'binaryClass'],
 'dataset_1002_ipums_la_98-small.csv': ['year',
  'gq',
  'gqtypeg',
  'farm',
  'ownershg',
  'value',
  'rent',
  'ftotinc',
  'nfams',
  'ncouples',
  'nmothers',
  'nfathers',
  'momloc',
  'stepmom',
  'momrule',
  'poploc',
  'steppop',
  'poprule',
  'sploc',
  'sprule',
  'famsize',
  'nchild',
  'nchlt5',
  'famunit',
  'eldch',
  'yngch',
  'nsibs',
  'relateg',
  'age',
  'sex',
  'raceg',
  'marst',
  'chborn',
  'school',
  'educrec',
  'schltype',
  'empstatg',
  'labforce',
  'occscore',
  'sei',
  'classwkg',
  'wkswork2',
  'hrswo

In [12]:
# Function to encode string columns to integers
def encode_strings(df):
    """Label-encode every object-dtype column while preserving missing values.

    Only the non-null entries of each object column are passed to a
    LabelEncoder; null entries stay NaN in the result. Casting the whole
    column to ``str`` first would turn NaN into the literal string ``'nan'``
    and encode it as an ordinary class, which hides the gap from any
    imputation step that runs afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the encoding is applied to a copy.

    Returns
    -------
    tuple
        ``(encoded_df, label_encoders)`` where ``encoded_df`` is a copy of
        ``df`` whose object columns hold float codes (NaN where the input was
        missing) and ``label_encoders`` maps each encoded column name to its
        fitted LabelEncoder. A column with no non-null value gets no encoder
        and becomes an all-NaN float column.
    """
    encoded_df = df.copy()  # leave the caller's frame untouched
    label_encoders = {}
    for col in encoded_df.columns:
        if encoded_df[col].dtype == 'object':
            values = encoded_df[col]
            # Positional boolean mask, so duplicate index labels cannot misalign
            present = values.notna().to_numpy()
            # Float array because NaN cannot be stored in an integer column
            codes = np.full(len(values), np.nan)
            if present.any():
                le = LabelEncoder()
                codes[present] = le.fit_transform(values[present].astype(str))
                label_encoders[col] = le
            encoded_df[col] = codes
    return encoded_df, label_encoders

# Run KNN imputation on every dataset and collect the results by file name
imputed_data = {}
for csv_file in csv_files:
    df = pd.read_csv(os.path.join('class_imbalance', csv_file))

    # Remove columns that contain no observed value at all
    df = df.dropna(axis=1, how='all')

    # Turn string columns into numeric codes so the imputer can process them
    df, label_encoders = encode_strings(df)

    # Fill the gaps using the 5 nearest neighbours, then restore the column
    # labels, since the imputer returns a bare array
    imputer = KNNImputer(n_neighbors=5)
    filled = imputer.fit_transform(df)
    imputed_data[csv_file] = pd.DataFrame(filled, columns=df.columns)

# Display the imputed data for verification
imputed_data

{'dataset_1000_hypothyroid.csv':        age  sex  on thyroxine  query on thyroxine  on antithyroid medication  \
 0     41.0  0.0           0.0                 0.0                        0.0   
 1     23.0  0.0           0.0                 0.0                        0.0   
 2     46.0  1.0           0.0                 0.0                        0.0   
 3     70.0  0.0           1.0                 0.0                        0.0   
 4     70.0  0.0           0.0                 0.0                        0.0   
 ...    ...  ...           ...                 ...                        ...   
 3767  30.0  0.0           0.0                 0.0                        0.0   
 3768  68.0  0.0           0.0                 0.0                        0.0   
 3769  74.0  0.0           0.0                 0.0                        0.0   
 3770  72.0  1.0           0.0                 0.0                        0.0   
 3771  64.0  0.0           0.0                 0.0                        0.0

In [13]:
# Imputed values come back as floats; snap the categorical columns back to
# whole numbers and store them as integers.
for file, cat_cols in categorical_columns.items():
    frame = imputed_data[file]  # same object as the dict entry, updated in place
    for col in cat_cols:
        frame[col] = frame[col].round().astype(int)

imputed_data

{'dataset_1000_hypothyroid.csv':       age  sex  on thyroxine  query on thyroxine  on antithyroid medication  \
 0      41    0             0                   0                          0   
 1      23    0             0                   0                          0   
 2      46    1             0                   0                          0   
 3      70    0             1                   0                          0   
 4      70    0             0                   0                          0   
 ...   ...  ...           ...                 ...                        ...   
 3767   30    0             0                   0                          0   
 3768   68    0             0                   0                          0   
 3769   74    0             0                   0                          0   
 3770   72    1             0                   0                          0   
 3771   64    0             0                   0                          0   
 
      

In [14]:
# Make sure the destination folder is present (no error if it already is)
output_folder = 'fixed_datasets'
os.makedirs(output_folder, exist_ok=True)

# Write every imputed frame under its original file name, without the index column
for file_name in imputed_data:
    destination = os.path.join(output_folder, file_name)
    imputed_data[file_name].to_csv(destination, index=False)

print(f"All files have been saved to the '{output_folder}' folder.")

All files have been saved to the 'fixed_datasets' folder.
