In [1]:
import csv
import pandas as pd

# 创建模拟化学实验数据
data = [
    ["id", "temperature", "yield_value"],
    [1, 25.3, 12.5],
    [2, -5.2, 8.2],    # 无效：温度<0
    [3, 30.1, 3.8],     # 无效：产率<5
    [4, 45.6, 15.7],
    [5, 90.2, 4.9],     # 无效：产率<5
    [6, 10.5, 6.2],
    [7, -2.1, 18.9],    # 无效：温度<0
    [8, 65.3, 22.1],
    [9, 28.7, 2.3],     # 无效：产率<5
    [10, 33.4, 9.8]
]

# 保存为CSV（自动创建文件）
with open('mock_data.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows(data)

# 清洗函数
def clean_lab_data(input_file, output_file):
    with open(input_file, 'r') as f_in:
        reader = csv.reader(f_in)
        header = next(reader)
        
        valid_data = []
        for row in reader:
            try:
                temperature = float(row[1])
                yield_value = float(row[2])
                if temperature > 0 and yield_value > 5:
                    valid_data.append(row)
            except:
                continue
        
        # 保存清洗后数据
        with open(output_file, 'w') as f_out:
            writer = csv.writer(f_out)
            writer.writerow(header)
            writer.writerows(valid_data)
        
        # 生成报告
        print(f"原始数据量: {reader.line_num} 条")
        print(f"有效数据量: {len(valid_data)} 条")
        df = pd.DataFrame(valid_data, columns=header)
        return df

# 执行清洗
clean_df = clean_lab_data('mock_data.csv', 'clean_data.csv')
print("\n清洗后数据:")
print(clean_df)

原始数据量: 11 条
有效数据量: 5 条

清洗后数据:
   id temperature yield_value
0   1        25.3        12.5
1   4        45.6        15.7
2   6        10.5         6.2
3   8        65.3        22.1
4  10        33.4         9.8
