In [53]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold

# Locations of the three input CSV files.
tasks_file, cost_file, suppliers_file = (
    'D:/Desktop/tasks.csv',
    'D:/Desktop/cost.csv',
    'D:/Desktop/suppliers.csv',
)

# Load each CSV into its own DataFrame (tasks, then cost, then suppliers).
tasks_df, cost_df, suppliers_df = (
    pd.read_csv(csv_path)
    for csv_path in (tasks_file, cost_file, suppliers_file)
)

# Missing-value imputation helper.
def handle_missing_values(df):
    """Fill missing values in ``df`` column by column and return ``df``.

    Float columns of any width are filled with the column mean. Object,
    string and categorical columns are filled with the most frequent value
    (the first one when several values tie). Columns of any other dtype are
    left untouched, as are columns that have nothing to fill.

    A column with no observed values at all is left as it is, because
    neither a mean nor a mode exists for it.

    The frame is modified in place; it is also returned so that callers can
    keep writing ``df = handle_missing_values(df)``.
    """
    dtype_checks = pd.api.types
    for column in df.columns:
        series = df[column]
        # Nothing to impute: skip early. This also covers plain integer
        # columns, which cannot hold NaN in the first place.
        if not series.isna().any():
            continue

        if dtype_checks.is_float_dtype(series):
            # An all-NaN column has a NaN mean, so filling is a no-op.
            fill_value = series.mean()
        elif (
            dtype_checks.is_object_dtype(series)
            or dtype_checks.is_string_dtype(series)
            or isinstance(series.dtype, pd.CategoricalDtype)
        ):
            modes = series.mode()
            # mode() is empty when every value is missing; indexing it
            # would raise IndexError.
            if modes.empty:
                continue
            fill_value = modes.iloc[0]
        else:
            continue

        df[column] = series.fillna(fill_value)
    return df

# Impute missing values in all three datasets (tasks, cost, suppliers).
tasks_df, cost_df, suppliers_df = (
    handle_missing_values(frame)
    for frame in (tasks_df, cost_df, suppliers_df)
)

# Min-max scale the Tasks dataset, keeping the original index and the task
# identifier.
tasks_id = tasks_df['Task ID']  # task identifier, re-attached after scaling

# Numeric feature columns. 'Task ID' is dropped explicitly so that a numeric
# identifier is never treated as a feature (it would be scaled and then
# collide with the 'Task ID' column inserted below).
tasks_numeric = (
    tasks_df
    .drop(columns=['Task ID'])
    .select_dtypes(include=['float64', 'int64'])
)

# Feature selection: drop low-variance features. The 0.1 threshold is an
# assumed value and is applied to the unscaled features.
selector = VarianceThreshold(threshold=0.1)
selected_features = selector.fit_transform(tasks_numeric)
selected_columns = tasks_numeric.columns[selector.get_support()]

# Scale the selected features. The original index is passed through so the
# label-aligned insert of 'Task ID' below matches rows correctly even when
# tasks_df does not carry a default RangeIndex.
tasks_scaled = pd.DataFrame(
    MinMaxScaler().fit_transform(selected_features),
    columns=selected_columns,
    index=tasks_df.index,
)
tasks_scaled.insert(0, 'Task ID', tasks_id)

# Min-max scale the Suppliers dataset.
# Every column after the first is scaled; the first column ('Features') is a
# label column and is re-attached afterwards.
# NOTE(review): MinMaxScaler works column by column. If the columns are
# suppliers and the rows are features (as the 'Features' label column
# suggests), each supplier is scaled across its features - confirm that
# this is the intended axis.
suppliers_numeric = suppliers_df.iloc[:, 1:]
suppliers_scaled = pd.DataFrame(
    data=MinMaxScaler().fit_transform(suppliers_numeric),
    columns=suppliers_numeric.columns,
)
suppliers_scaled.insert(loc=0, column='Features', value=suppliers_df['Features'])

# Mean cost of every (task, supplier) pair, ordered by task and then by
# ascending cost so the cheapest suppliers come first within each task.
cost_df_grouped = (
    cost_df
    .groupby(['Task ID', 'Supplier ID'], group_keys=False)['Cost']
    .mean()
    .reset_index()
    .sort_values(by=['Task ID', 'Cost'])
)

# For each task, keep only the leading share (80% by default) of suppliers.
def filter_top_suppliers(group, top_percent=0.8):
    """Return the first ``top_percent`` share of the rows of ``group``.

    The rows are taken from the top with ``head``, so the caller decides the
    ranking by sorting ``group`` beforehand (e.g. by ascending cost).

    Args:
        group: DataFrame holding the rows of a single group.
        top_percent: Fraction of rows to keep, e.g. ``0.8`` for 80%.

    Returns:
        The leading ``floor(len(group) * top_percent)`` rows. A non-empty
        group with a positive ``top_percent`` always keeps at least one
        row, so small groups are not wiped out entirely. An empty group or
        a non-positive ``top_percent`` yields an empty frame.
    """
    total = len(group)
    if total == 0 or top_percent <= 0:
        return group.head(0)
    # The small epsilon guards against float error in the product, e.g.
    # 100 * 0.29 == 28.999999999999996, which int() would truncate to 28.
    keep = int(total * top_percent + 1e-9)
    return group.head(max(keep, 1))

# Apply the per-task filter. include_groups=False hides 'Task ID' from the
# frames handed to filter_top_suppliers, so the group key is kept in the
# result index (group_keys=True) and then moved back into a regular column.
# Without this the saved file would contain only supplier and cost, with no
# way to tell which task each row belongs to.
filtered_suppliers = (
    cost_df_grouped
    .groupby('Task ID', group_keys=True)
    .apply(filter_top_suppliers, include_groups=False)
    .reset_index(level='Task ID')
    .reset_index(drop=True)
)
filtered_suppliers.to_csv("D:/Desktop/filtered_task_supplier_performance.csv", index=False)

# Persist the scaled datasets as CSV files (without the index column).
tasks_scaled.to_csv("D:/Desktop/scaled_tasks.csv", index=False)
suppliers_scaled.to_csv("D:/Desktop/scaled_suppliers.csv", index=False)

# Report the written files, one message per line.
print(
    "处理后的数据集已保存为以下文件：",
    "- Scaled Tasks Dataset: scaled_tasks.csv",
    "- Scaled Suppliers Dataset: scaled_suppliers.csv",
    "- Filtered Task-Supplier Performance: filtered_task_supplier_performance.csv",
    sep="\n",
)



处理后的数据集已保存为以下文件：
- Scaled Tasks Dataset: scaled_tasks.csv
- Scaled Suppliers Dataset: scaled_suppliers.csv
- Filtered Task-Supplier Performance: filtered_task_supplier_performance.csv
