In [1]:
import numpy as np
import pandas as pd
import os
import shutil
from sklearn.impute import KNNImputer

def process_and_save(train_path, val_path, output_dir, n_neighbors=5):
    print(f"Processing {train_path} and {val_path}")
    
    # Đọc dữ liệu train và val
    df_train = pd.read_csv(train_path)
    df_val = pd.read_csv(val_path)
    df = pd.concat([df_train, df_val], ignore_index=True)

    # Làm sạch giá trị AWS không hợp lệ
    df['AWS'] = pd.to_numeric(df['AWS'], errors='coerce')
    df.loc[df['AWS'] < 0, 'AWS'] = np.nan

    # Impute bằng KNNImputer
    impute_features = ['ROW', 'COL', 'AWS']
    imputer = KNNImputer(n_neighbors=n_neighbors)
    df_imputed = df[impute_features].copy()
    df_imputed_values = imputer.fit_transform(df_imputed)
    df['AWS'] = df_imputed_values[:, 2]  # Gán lại AWS đã được điền

    # Lưu lại dữ liệu đã impute
    df_train_processed = df.iloc[:len(df_train)]
    df_val_processed = df.iloc[len(df_train):]
    os.makedirs(output_dir, exist_ok=True)
    df_train_processed.to_csv(os.path.join(output_dir, "processed_train.csv"), index=False)
    df_val_processed.to_csv(os.path.join(output_dir, "processed_val.csv"), index=False)

    print(f"Processed train saved to {os.path.join(output_dir, 'processed_train.csv')}")
    print(f"Processed val saved to {os.path.join(output_dir, 'processed_val.csv')}")

def process_all_datasets(base_root, n_neighbors=5):
    dataset_folders = [f for f in os.listdir(base_root) if os.path.isdir(os.path.join(base_root, f))]
    for dataset in dataset_folders:
        base_path = os.path.join(base_root, dataset)
        output_dir = os.path.join("/kaggle/working", dataset)
        folds = [f for f in os.listdir(base_path) if f.startswith("fold_")]
        for fold in folds:
            train_path = os.path.join(base_path, fold, "merged_train.csv")
            val_path = os.path.join(base_path, fold, "merged_val.csv")
            if os.path.exists(train_path) and os.path.exists(val_path):
                process_and_save(train_path, val_path, os.path.join(output_dir, fold), n_neighbors=n_neighbors)

# === STEP 1: Chạy xử lý train/val ===
base_root = "/kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold"
process_all_datasets(base_root, n_neighbors=5)

# === STEP 2: Copy merged_test.csv vào đúng vị trí ===
src_dataset = '/kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold'
dst_dataset = '/kaggle/working/only-AWS-impute-KNN'
shutil.copytree(src_dataset, dst_dataset, dirs_exist_ok=True)

src_test_root = '/kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold'
for subdir in os.listdir(src_test_root):
    subdir_path = os.path.join(src_test_root, subdir)
    if os.path.isdir(subdir_path):
        for fold in os.listdir(subdir_path):
            fold_path = os.path.join(subdir_path, fold)
            if os.path.isdir(fold_path):
                test_file = os.path.join(fold_path, 'merged_test.csv')
                if os.path.exists(test_file):
                    dst_fold_path = os.path.join(dst_dataset, subdir, fold)
                    os.makedirs(dst_fold_path, exist_ok=True)
                    shutil.copy2(test_file, dst_fold_path)

# === STEP 3: Clip AWS > 10 và thống kê phân phối mưa ===
all_dfs = []
for subdir, dirs, files in os.walk(dst_dataset):
    for file in files:
        if file.endswith('.csv'):
            file_path = os.path.join(subdir, file)
            df = pd.read_csv(file_path)

            df.columns = [col.upper() for col in df.columns]
            if 'AWS' in df.columns:
                df['AWS'] = pd.to_numeric(df['AWS'], errors='coerce')
                df['AWS'] = df['AWS'].clip(upper=10)
                df.to_csv(file_path, index=False)
                all_dfs.append(df)

if all_dfs:
    full_df = pd.concat(all_dfs, ignore_index=True)
    full_df = full_df[full_df['AWS'].notna()]
    conditions = {
        'No Rain': (full_df['AWS'] == 0.0),
        'Slight Rain': (full_df['AWS'] > 0) & (full_df['AWS'] < 2.5),
        'Moderate Rain': (full_df['AWS'] >= 2.5) & (full_df['AWS'] <= 10.0),
        'Heavy Rain': (full_df['AWS'] > 10.0) & (full_df['AWS'] <= 50),
        'Intense Rain': (full_df['AWS'] > 50)
    }

    print("🌧 Rain Intensity Distribution After Clipping:")
    for label, condition in conditions.items():
        print(f"{label}: {condition.sum()} samples")
else:
    print("⚠ Không có dữ liệu nào chứa cột 'AWS'.")


Processing /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2020-10/fold_4/merged_train.csv and /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2020-10/fold_4/merged_val.csv


  return op(a, b)


Processed train saved to /kaggle/working/2020-10/fold_4/processed_train.csv
Processed val saved to /kaggle/working/2020-10/fold_4/processed_val.csv
Processing /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2020-10/fold_1/merged_train.csv and /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2020-10/fold_1/merged_val.csv


  return op(a, b)


Processed train saved to /kaggle/working/2020-10/fold_1/processed_train.csv
Processed val saved to /kaggle/working/2020-10/fold_1/processed_val.csv
Processing /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2020-10/fold_3/merged_train.csv and /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2020-10/fold_3/merged_val.csv


  return op(a, b)


Processed train saved to /kaggle/working/2020-10/fold_3/processed_train.csv
Processed val saved to /kaggle/working/2020-10/fold_3/processed_val.csv
Processing /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2020-10/fold_2/merged_train.csv and /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2020-10/fold_2/merged_val.csv


  return op(a, b)


Processed train saved to /kaggle/working/2020-10/fold_2/processed_train.csv
Processed val saved to /kaggle/working/2020-10/fold_2/processed_val.csv
Processing /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2020-10/fold_5/merged_train.csv and /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2020-10/fold_5/merged_val.csv


  return op(a, b)


Processed train saved to /kaggle/working/2020-10/fold_5/processed_train.csv
Processed val saved to /kaggle/working/2020-10/fold_5/processed_val.csv
Processing /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2020-04/fold_4/merged_train.csv and /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2020-04/fold_4/merged_val.csv
Processed train saved to /kaggle/working/2020-04/fold_4/processed_train.csv
Processed val saved to /kaggle/working/2020-04/fold_4/processed_val.csv
Processing /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2020-04/fold_1/merged_train.csv and /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2020-04/fold_1/merged_val.csv
Processed train saved to /kaggle/working/2020-04/fold_1/processed_train.csv
Processed val saved to /kaggle/working/2020-04/fold_1/processed_val.csv
Processing /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2020-04/fold_3/merged_train.csv and /kaggle/input/ai-ori

  return op(a, b)


Processed train saved to /kaggle/working/2019-10/fold_4/processed_train.csv
Processed val saved to /kaggle/working/2019-10/fold_4/processed_val.csv
Processing /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2019-10/fold_1/merged_train.csv and /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2019-10/fold_1/merged_val.csv


  return op(a, b)


Processed train saved to /kaggle/working/2019-10/fold_1/processed_train.csv
Processed val saved to /kaggle/working/2019-10/fold_1/processed_val.csv
Processing /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2019-10/fold_3/merged_train.csv and /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2019-10/fold_3/merged_val.csv


  return op(a, b)


Processed train saved to /kaggle/working/2019-10/fold_3/processed_train.csv
Processed val saved to /kaggle/working/2019-10/fold_3/processed_val.csv
Processing /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2019-10/fold_2/merged_train.csv and /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2019-10/fold_2/merged_val.csv


  return op(a, b)


Processed train saved to /kaggle/working/2019-10/fold_2/processed_train.csv
Processed val saved to /kaggle/working/2019-10/fold_2/processed_val.csv
Processing /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2019-10/fold_5/merged_train.csv and /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2019-10/fold_5/merged_val.csv


  return op(a, b)


Processed train saved to /kaggle/working/2019-10/fold_5/processed_train.csv
Processed val saved to /kaggle/working/2019-10/fold_5/processed_val.csv
Processing /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2019-04/fold_4/merged_train.csv and /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2019-04/fold_4/merged_val.csv
Processed train saved to /kaggle/working/2019-04/fold_4/processed_train.csv
Processed val saved to /kaggle/working/2019-04/fold_4/processed_val.csv
Processing /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2019-04/fold_1/merged_train.csv and /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2019-04/fold_1/merged_val.csv
Processed train saved to /kaggle/working/2019-04/fold_1/processed_train.csv
Processed val saved to /kaggle/working/2019-04/fold_1/processed_val.csv
Processing /kaggle/input/ai-original-data-only-aws/ai-original-dataset-k-fold/2019-04/fold_3/merged_train.csv and /kaggle/input/ai-ori

  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)


🌧 Rain Intensity Distribution After Clipping:
No Rain: 2105997 samples
Slight Rain: 230842 samples
Moderate Rain: 83343 samples
Heavy Rain: 0 samples
Intense Rain: 0 samples
