In [1]:
import pandas as pd

# Load the raw dataset from CSV.
df = pd.read_csv('/total/data_1.csv')

# Drop every row that contains at least one missing value.
# NOTE(review): df_clean is not referenced by the later cells visible here
# (they start again from `df`) — confirm whether it is still needed.
df_clean = df.dropna()

# Optional: persist the fully observed rows to a new CSV file.
# df_clean.to_csv('//hy-tmp/data_csv', index=False)

In [2]:
# Display the (rows, columns) shape of the raw frame.
df.shape

(42372, 1035)

In [3]:
# Split features from the label by position: every column except the last is
# a feature and the last column is the label. Slicing relative to the end
# (instead of the hard-coded 1034) keeps the split correct if the number of
# feature columns changes; for the 1035-column frame shown above it selects
# exactly the same columns as before.
features = df.iloc[:, :-1]   # all feature columns
label = df.iloc[:, -1:]      # label, kept as a one-column DataFrame

In [4]:
# 1. Drop rows in which more than half of the feature values are missing.
#    The threshold is derived from the frame itself rather than hard-coded.
half_features = features.shape[1] // 2
mask = features.isnull().sum(axis=1) <= half_features  # True for rows to keep
features_cleaned = features[mask].copy()  # rows that pass the filter
label_cleaned = label[mask].copy()        # labels of the same rows

print(f"删除行数: {len(features) - len(features_cleaned)}")

# 2. Fill each remaining gap with the mean of the observed values in its row.
row_means = features_cleaned.mean(axis=1, skipna=True)
# `where(..., axis=0)` aligns row_means on the row index, so each missing cell
# receives the mean of its own row. This avoids transposing the whole frame
# twice, which also forces all columns to a common dtype.
features_filled = features_cleaned.where(features_cleaned.notna(), row_means, axis=0)

# 3. Re-attach the (unchanged) labels to the filled features.
processed_df = pd.concat([features_filled, label_cleaned], axis=1)

# Sanity check: remaining missing values in the first five columns.
print("\n处理后的缺失值统计（前5列示例）:")
print(features_filled.iloc[:, :5].isnull().sum())

删除行数: 11184

处理后的缺失值统计（前5列示例）:
2014/1/1    0
2014/1/2    0
2014/1/3    0
2014/1/4    0
2014/1/5    0
dtype: int64


In [5]:
# Total number of missing feature values left after the row-mean fill.
print("处理后缺失值总数:", features_filled.isnull().sum().sum())

处理后缺失值总数: 0


In [16]:
import pandas as pd

# Start from the frame produced by the earlier cleaning cell (no file is read).

df = processed_df
# Positional split: the first 1034 columns are features, the rest is the label.
label = df.iloc[:, 1034:]
features = df.iloc[:, :1034]

# 1. Per-row mean over the observed (non-missing) values.
row_means = features.mean(axis=1, skipna=True)

# 2. Fill gaps with the row mean: after transposing, every original row is a
#    column, so fillna matches row_means to it by label; then transpose back.
features_by_column = features.transpose()
features_filled = features_by_column.fillna(row_means).transpose()

# 3. Put the filled features and the original label back together.
processed_df = pd.concat([features_filled, label], axis=1)

# Report the missing-value totals before and after filling.
print("处理前缺失值总数:", features.isnull().sum().sum())
print("处理后缺失值总数:", features_filled.isnull().sum().sum())

# Show the first two processed rows as an example.
print("\n示例数据（前2行）：")
print(processed_df.head(2))

# Optional: save the processed data.
# processed_df.to_csv('/hy-tmp/data_1_processed.csv', index=False)

处理前缺失值总数: 32293
处理后缺失值总数: 0

示例数据（前2行）：
    2014/1/1   2014/1/2   2014/1/3   2014/1/4   2014/1/5   2014/1/6  \
0  13.898342  13.898342  13.898342  13.898342  13.898342  13.898342   
4   2.900000   5.640000   6.990000   3.320000   3.610000   5.350000   

    2014/1/7   2014/1/8   2014/1/9  2014/1/10  ...  2016/10/23  2016/10/24  \
0  13.898342  13.898342  13.898342  13.898342  ...        8.07        8.09   
4   4.730000   3.680000   3.530000   3.420000  ...       10.22        8.47   

   2016/10/25  2016/10/26  2016/10/27  2016/10/28  2016/10/29  2016/10/30  \
0        9.53        5.48        8.75        9.30        7.54        9.16   
4        6.11        6.10        6.73        7.52       10.89        9.86   

   2016/10/31  FLAG  
0        6.74     1  
4        8.72     1  

[2 rows x 1035 columns]


In [6]:
# Persist the cleaned, row-mean-filled frame without the index column.
# NOTE(review): this writes to '/hy-tmp/data_2.csv', while a later cell reads
# '/total/data_2.csv' — confirm the file is copied there or align the paths.
processed_df.to_csv('/hy-tmp/data_2.csv', index=False)

In [18]:
# Display the (rows, columns) shape of the processed frame.
processed_df.shape

(31188, 1035)

In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
from imblearn.over_sampling import ADASYN


# Output location for the generated tensors and the description file.
# input_csv = "/TS-TCC/data_preprocessing/epilepsy/data_files/0329data.csv"
output_dir = "/hy-tmp/dataset_rate_0605_realy/"
os.makedirs(output_dir, exist_ok=True)  # make sure the output directory exists

# Load the data: the last column is the label, all other columns are features.
data = pd.read_csv('/total/data_2.csv')
y = data.iloc[:, -1]
x = data.iloc[:, :-1]

# Convert to NumPy. Labels are used exactly as stored in the file; `x` stays
# the unscaled feature matrix (it is only used for its length/shape below).
x = x.to_numpy()
y = y.to_numpy()
# y = (y != 0).astype(int)  # optional: merge every non-zero class into class 1

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# min/max of the held-out rows leak into the training features.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only and reuse its statistics for the
# held-out rows (their scaled values may therefore fall slightly outside [0, 1]).
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# NOTE(review): the validation split is the test split itself, so validation
# metrics are not independent of test metrics — confirm this is intended.
X_val = X_test
y_val = y_test

# Oversample the minority class with ADASYN on the training split only, so the
# held-out rows stay untouched. The reshape is a no-op for the 2-D matrix built
# above; the name X_train_2d is kept for compatibility.
X_train_2d = X_train.reshape(X_train.shape[0], -1)
ada = ADASYN(sampling_strategy='minority', random_state=42)
X_train, y_train = ada.fit_resample(X_train_2d, y_train)
# The resampled matrix is already (samples, features); the former reshape to a
# hard-coded 1034 columns was redundant and failed for any other feature count.
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Serialise one split as a .pt file.
def save_torch_data(X, y, filename):
    """Save one split as a dict of tensors inside the global ``output_dir``.

    ``X`` becomes a float tensor with an extra dimension inserted at position 1
    (the channel dimension); ``y`` becomes a long tensor. Both must be NumPy
    arrays, since they are converted with ``torch.from_numpy``.
    """
    samples = torch.from_numpy(X).unsqueeze(1).float()
    labels = torch.from_numpy(y).long()
    target_path = os.path.join(output_dir, filename)
    torch.save({"samples": samples, "labels": labels}, target_path)

# Write the three splits. val.pt and test.pt hold the same data, because
# X_val / y_val are bound to X_test / y_test earlier in this cell.
save_torch_data(X_train, y_train, "train.pt")
save_torch_data(X_val, y_val, "val.pt")
save_torch_data(X_test, y_test, "test.pt")

# Build the per-split summary used in the description file.
def get_dataset_info(X, y, name):
    """Summarise one data split.

    Args:
        X: array with at least two dimensions; ``X.shape[0]`` is the number of
            samples and ``X.shape[1]`` the number of features.
        y: iterable of labels, one per sample.
        name: name of the split (e.g. "train").

    Returns:
        dict with the keys ``name``, ``samples``, ``features``, ``classes``
        (label -> count) and ``data_shape`` (human-readable shape string).
    """
    # Unwrap NumPy scalars to plain Python values. Counting a NumPy array
    # directly keeps np.int64 keys, which NumPy >= 2 prints as "np.int64(0)"
    # and would make the "Classes:" line of the description file unreadable.
    class_counts = Counter(
        label.item() if hasattr(label, "item") else label for label in y
    )
    return {
        "name": name,
        "samples": len(X),
        "features": X.shape[1],
        "classes": dict(class_counts),
        "data_shape": f"{X.shape[0]} samples × {X.shape[1]} features"
    }

# Collect the metadata for the description file.
# NOTE(review): "Epilepsy Classification" looks like a leftover from another
# project (the input path above is commented out) — confirm the dataset name.
info = {
    "dataset": "Epilepsy Classification",
    # "source": input_csv,
    "preprocessing": "MinMaxScaler normalization, binary classification (0 vs non-0)",
    "train": get_dataset_info(X_train, y_train, "train"),
    "val": get_dataset_info(X_val, y_val, "val"),
    "test": get_dataset_info(X_test, y_test, "test")
}

# Write the description file. The encoding is explicit because non-ASCII text
# ("×" in the shape string) is written and the platform default may not be UTF-8.
with open(os.path.join(output_dir, "dataset_info.txt"), "w", encoding="utf-8") as f:
    f.write("=== Dataset Information ===\n")
    # f.write(f"Source CSV: {info['source']}\n")
    f.write(f"Preprocessing: {info['preprocessing']}\n\n")

    for split in ["train", "val", "test"]:
        # Separate loop name so the DataFrame bound to `data` is not overwritten.
        split_info = info[split]
        f.write(f"--- {split_info['name'].upper()} SET ---\n")
        f.write(f"Samples: {split_info['samples']}\n")
        f.write(f"Shape: {split_info['data_shape']}\n")
        f.write(f"Classes: {split_info['classes']}\n\n")

    f.write(f"Total samples: {len(x)}\n")
    f.write(f"Original features: {x.shape[1]}\n")
    f.write(f"Final tensor shape: [samples, 1, features] (added channel dim)\n")

print(f"Data saved to {output_dir}")
print(f"Dataset info written to {os.path.join(output_dir, 'dataset_info.txt')}")

Data saved to /hy-tmp/dataset_rate_0605_realy/
Dataset info written to /hy-tmp/dataset_rate_0605_realy/dataset_info.txt


In [4]:
import pandas as pd
import numpy as np

# Load the raw data: every column except the last holds usage readings, the
# last column is the label.
df = pd.read_csv("/total/data_1.csv")  # replace with your own path
usage_data = df.iloc[:, :-1]
label_data = df.iloc[:, -1:]

# Step 1: drop samples with more than 30% missing readings. The threshold is
# derived from the actual number of usage columns instead of a hard-coded 1034.
threshold = int(0.3 * usage_data.shape[1])
usage_data = usage_data[usage_data.isnull().sum(axis=1) <= threshold]

# Step 2: fill the remaining gaps with each household's own mean. `where` with
# axis=0 aligns the row means on the index (vectorised, no row-wise apply).
usage_data = usage_data.where(usage_data.notna(), usage_data.mean(axis=1), axis=0)

# Step 3: drop samples whose usage is zero everywhere.
usage_data = usage_data[~(usage_data == 0).all(axis=1)]

# Step 4: z-score each household (row-wise standardisation). A constant row has
# a standard deviation of 0, and dividing by it would turn the whole row into
# NaN; using 1 as the divisor there maps such rows to all zeros instead.
row_mean = usage_data.mean(axis=1)
row_std = usage_data.std(axis=1).replace(0, 1.0)
usage_normalized = usage_data.sub(row_mean, axis=0).div(row_std, axis=0)

# Step 5: re-attach the labels of the surviving rows and save the result.
df_cleaned = pd.concat([usage_normalized, label_data.loc[usage_normalized.index]], axis=1)
df_cleaned.to_csv("/total/cleaned_data.csv", index=False)


In [1]:
# Handle class imbalance with SMOTEENN resampling (an ADASYN object is also created below but is not used)
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTEENN

# Output location for the generated tensors and the description file.
# input_csv = "/TS-TCC/data_preprocessing/epilepsy/data_files/0329data.csv"
output_dir = "/hy-tmp/0701_dataset/"
os.makedirs(output_dir, exist_ok=True)  # make sure the output directory exists

# Load the data: the last column is the label, all other columns are features.
data = pd.read_csv('/total/data_1_knnimpute.csv')
y = data.iloc[:, -1]
x = data.iloc[:, :-1]

# Convert to NumPy. Labels are used exactly as stored in the file; `x` stays
# the unscaled feature matrix (it is only used for its length/shape below).
x = x.to_numpy()
y = y.to_numpy()
# y = (y != 0).astype(int)  # optional: merge every non-zero class into class 1

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# min/max of the held-out rows leak into the training features.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only and reuse its statistics for the
# held-out rows (their scaled values may therefore fall slightly outside [0, 1]).
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# NOTE(review): the validation split is the test split itself, so validation
# metrics are not independent of test metrics — confirm this is intended.
X_val = X_test
y_val = y_test

# No-op for the 2-D matrix built above; the name is kept for compatibility.
X_train_2d = X_train.reshape(X_train.shape[0], -1)

# Resample the training split only, so the held-out rows stay untouched.
smote_enn = SMOTEENN(sampling_strategy='auto', random_state=42)
ada = ADASYN(sampling_strategy='minority', random_state=42)  # created but not used in this cell
X_train, y_train = smote_enn.fit_resample(X_train_2d, y_train)
# The resampled matrix is already (samples, features); the former reshape to a
# hard-coded 1034 columns was redundant and failed for any other feature count.


# Serialise one split as a .pt file.
def save_torch_data(X, y, filename):
    """Write ``{"samples", "labels"}`` tensors for one split to ``output_dir``.

    ``X`` and ``y`` must be NumPy arrays: ``X`` is stored as a float tensor with
    a channel dimension added at position 1, ``y`` as a long tensor.
    """
    sample_tensor = torch.from_numpy(X).unsqueeze(1).float()
    label_tensor = torch.from_numpy(y).long()
    payload = {"samples": sample_tensor, "labels": label_tensor}
    torch.save(payload, os.path.join(output_dir, filename))

# Write the three splits. val.pt and test.pt hold the same data, because
# X_val / y_val are bound to X_test / y_test earlier in this cell.
save_torch_data(X_train, y_train, "train.pt")
save_torch_data(X_val, y_val, "val.pt")
save_torch_data(X_test, y_test, "test.pt")

# Build the per-split summary used in the description file.
def get_dataset_info(X, y, name):
    """Summarise one data split.

    Args:
        X: array with at least two dimensions; ``X.shape[0]`` is the number of
            samples and ``X.shape[1]`` the number of features.
        y: iterable of labels, one per sample.
        name: name of the split (e.g. "train").

    Returns:
        dict with the keys ``name``, ``samples``, ``features``, ``classes``
        (label -> count) and ``data_shape`` (human-readable shape string).
    """
    # Unwrap NumPy scalars to plain Python values. Counting a NumPy array
    # directly keeps np.int64 keys, which NumPy >= 2 prints as "np.int64(0)"
    # and would make the "Classes:" line of the description file unreadable.
    class_counts = Counter(
        label.item() if hasattr(label, "item") else label for label in y
    )
    return {
        "name": name,
        "samples": len(X),
        "features": X.shape[1],
        "classes": dict(class_counts),
        "data_shape": f"{X.shape[0]} samples × {X.shape[1]} features"
    }

# Collect the metadata for the description file.
# NOTE(review): "Epilepsy Classification" looks like a leftover from another
# project (the input path above is commented out) — confirm the dataset name.
info = {
    "dataset": "Epilepsy Classification",
    # "source": input_csv,
    "preprocessing": "MinMaxScaler normalization, binary classification (0 vs non-0)",
    "train": get_dataset_info(X_train, y_train, "train"),
    "val": get_dataset_info(X_val, y_val, "val"),
    "test": get_dataset_info(X_test, y_test, "test")
}

# Write the description file. The encoding is explicit because Chinese text is
# written below and the platform default encoding may not be able to encode it.
with open(os.path.join(output_dir, "dataset_info.txt"), "w", encoding="utf-8") as f:
    f.write("=== Dataset Information ===\n")
    f.write(f"描述: 读取数据_归一化_划分数据集_smote类不平衡处理\n")
    # f.write(f"Source CSV: {info['source']}\n")
    f.write(f"Preprocessing: {info['preprocessing']}\n\n")

    for split in ["train", "val", "test"]:
        # Separate loop name so the DataFrame bound to `data` is not overwritten.
        split_info = info[split]
        f.write(f"--- {split_info['name'].upper()} SET ---\n")
        f.write(f"Samples: {split_info['samples']}\n")
        f.write(f"Shape: {split_info['data_shape']}\n")
        f.write(f"Classes: {split_info['classes']}\n\n")

    f.write(f"Total samples: {len(x)}\n")
    f.write(f"Original features: {x.shape[1]}\n")
    f.write(f"Final tensor shape: [samples, 1, features] (added channel dim)\n")

print(f"Data saved to {output_dir}")
print(f"Dataset info written to {os.path.join(output_dir, 'dataset_info.txt')}")

Data saved to /hy-tmp/0701_dataset/
Dataset info written to /hy-tmp/0701_dataset/dataset_info.txt


In [None]:
# Drop rows with too many missing values, KNN-impute the rest and save the result (no resampling happens in this cell)
import os
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
# from imblearn.combine import SMOTETomek
from tsfresh.utilities.dataframe_functions import impute

from sklearn.preprocessing import MinMaxScaler
from collections import Counter
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTEENN

# Paths (no output directory is needed in this cell).
# input_csv = "/TS-TCC/data_preprocessing/epilepsy/data_files/0329data.csv"
# output_dir = "/hy-tmp/0626_dataset_2/"
# os.makedirs(output_dir, exist_ok=True)  # make sure the output directory exists

# Rows with more than this many missing feature values are discarded.
MAX_MISSING_PER_ROW = 500

# Load the data: the last column is the label, all other columns are features.
data = pd.read_csv('/total/data_1.csv')
y = data.iloc[:, -1].to_numpy()  # labels are used exactly as stored in the file
X = data.iloc[:, :-1]

# ==== Drop samples with too many missing values ====
# The per-row count is kept in its own Series instead of being written into X
# as a temporary column: X is a slice of `data`, and assigning a column to a
# slice triggers pandas' SettingWithCopy warning.
mask = X.isnull().sum(axis=1) <= MAX_MISSING_PER_ROW
X = X[mask]
y = y[mask.to_numpy()]
print(f"[缺失值过滤后剩余多少条数据] shape: {X.shape}")

# ==== Impute the remaining gaps with KNN ====

X = KNNImputer(n_neighbors=5).fit_transform(X)
print("[KNN 插补完成]")

# ==== Save the processed data ====
# The imputer returns a plain array, so the feature columns are written with
# integer headers; the label goes into a final column named 'label'.
output_df = pd.DataFrame(X)
output_df['label'] = y
output_path = '/total/data_1_knnimpute.csv'
output_df.to_csv(output_path, index=False)
print(f"[保存完成] 路径: {output_path}")

[缺失值过滤后剩余多少条数据] shape: (31036, 1034)
[KNN 插补完成]
[保存完成] 路径: /total/data_1_knnimpute.csv


In [3]:
# # 归一化
import os
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
# from imblearn.combine import SMOTETomek
from tsfresh.utilities.dataframe_functions import impute

from sklearn.preprocessing import MinMaxScaler
from collections import Counter
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import MinMaxScaler
from tsfresh import extract_features, select_features
import pandas as pd
# Load the KNN-imputed data: the last column is the label, the rest are features.
data = pd.read_csv('/hy-tmp/data0702/data_1_knnimpute.csv')
y = data.iloc[:, -1].to_numpy()  # labels are used exactly as stored in the file
# `X` stays the unscaled feature matrix; scaling is applied per split below.
X = data.iloc[:, :-1].to_numpy()
name = '数据描述:0712读取knn填充后的数据集,不做类不平衡处理'
# Output location for the generated tensors and the description file.
output_dir = "/hy-tmp/0712_realdata/"
os.makedirs(output_dir, exist_ok=True)  # make sure the output directory exists


# ada = SMOTEENN(sampling_strategy='auto', random_state=42)
# X_2d = X.reshape(X.shape[0], -1)
# X, y = ada.fit_resample(X_2d, y)
# X = X.reshape(X.shape[0], 1034)

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# min/max of the held-out rows leak into the training features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only and reuse its statistics for the
# held-out rows (their scaled values may therefore fall slightly outside [0, 1]).
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# NOTE(review): the validation split is the test split itself, so validation
# metrics are not independent of test metrics — confirm this is intended.
X_val ,y_val = X_test, y_test

# X_train_2d = X_train.reshape(X_train.shape[0], -1)

# # smote_enn = SMOTEENN(sampling_strategy='auto', random_state=42)
# ada = ADASYN(sampling_strategy='minority', random_state=42)
# X_train, y_train = ada.fit_resample(X_train_2d, y_train)
# X_train = X_train.reshape(X_train.shape[0], 1034)
# X_train = X_train
# y_train = y_train
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)


# print("[FRESH] 正在提取时序特征...")
# df_long = pd.DataFrame(X_train)
# df_long['id'] = df_long.index
# df_long = df_long.melt(id_vars='id', var_name='time', value_name='value')

# features_extracted = extract_features(df_long, column_id='id', column_sort='time', n_jobs=3)
# impute(features_extracted, show_warnings=False)
# selected_features = select_features(features_extracted, y_train)

# X_train_final = selected_features.to_numpy()
# print(f"[FRESH完成] 最终特征维度: {X_train_final.shape[1]}")




# Serialise one split as a .pt file.
def save_torch_data(X, y, filename):
    """Store one split under the global ``output_dir`` as a dict of tensors.

    The NumPy array ``X`` is saved as a float tensor with a channel dimension
    inserted at position 1; the NumPy array ``y`` is saved as a long tensor.
    """
    features_tensor = torch.from_numpy(X).unsqueeze(1).float()
    targets_tensor = torch.from_numpy(y).long()
    destination = os.path.join(output_dir, filename)
    torch.save({"samples": features_tensor, "labels": targets_tensor}, destination)

# Write the three splits. val.pt and test.pt hold the same data, because
# X_val / y_val are bound to X_test / y_test earlier in this cell.
save_torch_data(X_train, y_train, "train.pt")
save_torch_data(X_val, y_val, "val.pt")
save_torch_data(X_test, y_test, "test.pt")

# Build the per-split summary used in the description file.
def get_dataset_info(X, y, name):
    """Summarise one data split.

    Args:
        X: array with at least two dimensions; ``X.shape[0]`` is the number of
            samples and ``X.shape[1]`` the number of features.
        y: iterable of labels, one per sample.
        name: name of the split (e.g. "train").

    Returns:
        dict with the keys ``name``, ``samples``, ``features``, ``classes``
        (label -> count) and ``data_shape`` (human-readable shape string).
    """
    # Unwrap NumPy scalars to plain Python values. Counting a NumPy array
    # directly keeps np.int64 keys, which NumPy >= 2 prints as "np.int64(0)"
    # and would make the "Classes:" line of the description file unreadable.
    class_counts = Counter(
        label.item() if hasattr(label, "item") else label for label in y
    )
    return {
        "name": name,
        "samples": len(X),
        "features": X.shape[1],
        "classes": dict(class_counts),
        "data_shape": f"{X.shape[0]} samples X {X.shape[1]} features"
    }

# Collect the metadata for the description file.
# NOTE(review): "Epilepsy Classification" looks like a leftover from another
# project — confirm the dataset name.
info = {
    "dataset": "Epilepsy Classification",
    # "source": input_csv,
    "preprocessing": "MinMaxScaler normalization, binary classification (0 vs non-0)",
    "train": get_dataset_info(X_train, y_train, "train"),
    "val": get_dataset_info(X_val, y_val, "val"),
    "test": get_dataset_info(X_test, y_test, "test")
}


# Write the description file. The encoding is explicit because `name` contains
# Chinese text and the platform default encoding may not be able to encode it.
with open(os.path.join(output_dir, "dataset_info.txt"), "w", encoding="utf-8") as f:
    f.write("=== Dataset Information ===\n")
    f.write(f"{name}\n")
    # f.write(f"Source CSV: {info['source']}\n")
    f.write(f"Preprocessing: {info['preprocessing']}\n\n")

    for split in ["train", "val", "test"]:
        # Separate loop name so the DataFrame bound to `data` is not overwritten.
        split_info = info[split]
        f.write(f"--- {split_info['name'].upper()} SET ---\n")
        f.write(f"Samples: {split_info['samples']}\n")
        f.write(f"Shape: {split_info['data_shape']}\n")
        f.write(f"Classes: {split_info['classes']}\n\n")

    f.write(f"Total samples: {len(X)}\n")
    f.write(f"Original features: {X.shape[1]}\n")
    f.write(f"Final tensor shape: [samples, 1, features] (added channel dim)\n")

print(f"Data saved to {output_dir}")
print(f"Dataset info written to {os.path.join(output_dir, 'dataset_info.txt')}")

Data saved to /hy-tmp/0712_realdata/
Dataset info written to /hy-tmp/0712_realdata/dataset_info.txt


In [None]:


# Choose and create the target directory BEFORE saving. save_torch_data reads
# the global `output_dir` at call time, so assigning it after the calls (as the
# cell did before) wrote the tensors into the previous cell's directory and
# only the description file into this one.
output_dir = "/hy-tmp/0629dataset_featurn/"
os.makedirs(output_dir, exist_ok=True)  # make sure the output directory exists

# NOTE(review): X_train_final is only produced by the tsfresh block that is
# commented out in the previous cell, so this cell raises NameError unless that
# block has been run. X_val / X_test are also not passed through the same
# feature extraction — confirm their feature dimension matches X_train_final.
save_torch_data(X_train_final, y_train, "train.pt")
save_torch_data(X_val, y_val, "val.pt")
save_torch_data(X_test, y_test, "test.pt")


# Collect the metadata for the description file.
info = {
    "dataset": "Epilepsy Classification",
    # "source": input_csv,
    "preprocessing": "MinMaxScaler normalization, binary classification (0 vs non-0)",
    "train": get_dataset_info(X_train_final, y_train, "train"),
    "val": get_dataset_info(X_val, y_val, "val"),
    "test": get_dataset_info(X_test, y_test, "test")
}

# Write the description file with an explicit encoding, so the output does not
# depend on the platform default.
with open(os.path.join(output_dir, "dataset_info.txt"), "w", encoding="utf-8") as f:
    f.write("=== Dataset Information ===\n")
    # f.write(f"Source CSV: {info['source']}\n")
    f.write(f"Preprocessing: {info['preprocessing']}\n\n")

    for split in ["train", "val", "test"]:
        # Separate loop name so the DataFrame bound to `data` is not overwritten.
        split_info = info[split]
        f.write(f"--- {split_info['name'].upper()} SET ---\n")
        f.write(f"Samples: {split_info['samples']}\n")
        f.write(f"Shape: {split_info['data_shape']}\n")
        f.write(f"Classes: {split_info['classes']}\n\n")

    f.write(f"Total samples: {len(X)}\n")
    f.write(f"Original features: {X.shape[1]}\n")
    f.write(f"Final tensor shape: [samples, 1, features] (added channel dim)\n")

print(f"Data saved to {output_dir}")
print(f"Dataset info written to {os.path.join(output_dir, 'dataset_info.txt')}")