In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from copy import deepcopy
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## 数据读取

In [None]:
# Data loading: column groups and the nullable integer dtype each group is cast to.
# Columns cast to pandas nullable Int16 by transfer_data_int.
columns_to_int16 = [
    '日期', '时间', '生产线编号',
    '物料推送气缸状态', '物料推送数', '物料待抓取数', '放置容器数', '容器上传检测数',
    '填装检测数', '填装定位器状态', '物料抓取数', '填装旋转数', '填装下降数', '填装数',
    '加盖检测数', '加盖定位数', '推盖数', '加盖下降数', '加盖数',
    '拧盖检测数', '拧盖定位数', '拧盖下降数', '拧盖旋转数', '拧盖数',
    '合格数', '不合格数', '机器状态',
]
# Columns cast to pandas nullable Int32: '0_Duration' through '10_Duration'.
columns_to_int32 = [f'{state_id}_Duration' for state_id in range(11)]

def transfer_data_int(df):
    """Cast the known integer columns of ``df`` to pandas nullable integer dtypes.

    Columns listed in ``columns_to_int16`` become ``Int16`` and columns listed in
    ``columns_to_int32`` become ``Int32``. The frame is modified in place and the
    same object is returned.
    """
    for column_group, dtype_name in ((columns_to_int16, 'Int16'), (columns_to_int32, 'Int32')):
        df[column_group] = df[column_group].astype(dtype_name)
    return df

def load_and_convert(file_paths):
    """Read every CSV in ``file_paths`` and cast its integer columns.

    Returns a dict that maps each file's stem (file name without extension) to
    the DataFrame produced by ``transfer_data_int``.
    """
    frames_by_stem = {}
    for csv_path in file_paths:
        frames_by_stem[csv_path.stem] = transfer_data_int(pd.read_csv(csv_path))
    return frames_by_stem

# Training files. Path.glob yields entries in arbitrary, filesystem-dependent
# order, so the result is sorted to make the train/dev split below reproducible.
processed_files = sorted(Path('temp_data/train').glob('M*.csv'))
train_data = load_and_convert(processed_files[:8])  # first 8 files -> training set
dev_data = load_and_convert(processed_files[8:])    # remaining files -> dev set

# Files to predict on (sorted for the same reason)
predicted_files = sorted(Path('temp_data/test').glob('M*.csv'))
test_data = load_and_convert(predicted_files)

In [None]:
# Feature selection: the model inputs are the status/count columns below plus the
# eleven '<n>_Duration' columns. '合格数' and '不合格数' are deliberately left out.
_status_and_count_features = [
    '物料推送气缸状态', '物料推送数', '物料待抓取数', '放置容器数', '容器上传检测数',
    '填装检测数', '填装定位器状态', '物料抓取数', '填装旋转数', '填装下降数',
    '加盖检测数', '加盖定位数', '推盖数', '加盖下降数',
    '拧盖检测数', '拧盖定位数', '拧盖下降数', '拧盖旋转数', '拧盖数',
]
feature_columns = _status_and_count_features + [f'{state_id}_Duration' for state_id in range(11)]

In [None]:
# Merge the per-file frames into one training frame and one dev frame.
# train_data / dev_data are rebound here from dict to DataFrame, so the concat is
# guarded: on a second run of this cell `train_data.values` would be the DataFrame's
# ndarray attribute (not callable) and the cell would raise.
if isinstance(train_data, dict):
    train_data = pd.concat(train_data.values(), ignore_index=True)
if isinstance(dev_data, dict):
    dev_data = pd.concat(dev_data.values(), ignore_index=True)

# Features / label ('机器状态') for training and dev
X_train, y_train = train_data[feature_columns], train_data['机器状态']
X_dev, y_dev = dev_data[feature_columns], dev_data['机器状态']

# Features / label for the two test files
X_test_1, y_test_1 = test_data["M201"][feature_columns],test_data["M201"]['机器状态']
X_test_2, y_test_2 = test_data["M202"][feature_columns],test_data["M202"]['机器状态']


## 模型评估指标

In [None]:
from sklearn.metrics import accuracy_score, recall_score
# Fault accuracy
def Fault_Accuracy(y_,y_pred_):
    """Accuracy restricted to the samples whose true label is a fault (non-zero).

    Parameters
    ----------
    y_ : array-like
        True labels; 0 is treated as "no fault".
    y_pred_ : array-like
        Predicted labels. A column vector of shape (n, 1) is accepted and flattened.

    Returns
    -------
    float
        Fraction of fault samples predicted with exactly the right label, or NaN
        when ``y_`` contains no fault sample (the metric is undefined then).

    Raises
    ------
    ValueError
        If ``y_`` and ``y_pred_`` do not hold the same number of labels.
    """
    y_np = np.asarray(y_).astype(int).ravel()
    # ravel() so an (n, 1) prediction array cannot broadcast against the 1-D truth
    y_pred_np = np.asarray(y_pred_).astype(int).ravel()
    if y_np.shape != y_pred_np.shape:
        raise ValueError("y_ and y_pred_ must contain the same number of labels")

    fault_mask = y_np != 0
    if not fault_mask.any():
        # No fault samples: return NaN explicitly instead of scoring an empty selection
        return float('nan')

    return float(np.mean(y_np[fault_mask] == y_pred_np[fault_mask]))

# Alarm accuracy (whether the predicted fault class is the right one does not matter)
def Warning_Accuracy(y_,y_pred_):
    """Accuracy of the binary "alarm / no alarm" decision.

    Both label arrays are cast to int and collapsed to 0 (label 0) or 1 (any
    non-zero label) before ``accuracy_score`` is computed on them.
    """
    true_alarm = np.array(y_).astype(int)
    pred_alarm = np.array(y_pred_).astype(int)

    # Collapse every non-zero label to 1
    true_alarm = np.where(true_alarm != 0, 1, 0)
    pred_alarm = np.where(pred_alarm != 0, 1, 0)

    return accuracy_score(true_alarm, pred_alarm)

def Recall(y_,y_pred_):
    """Print the recall of each class label 0-9, one label per line.

    Labels are cast to int first. Nothing is returned; the per-class recalls from
    ``recall_score(average=None)`` are only printed.
    """
    class_ids = list(range(10))
    true_arr = np.array(y_).astype(int)
    pred_arr = np.array(y_pred_).astype(int)

    per_class_recall = recall_score(true_arr, pred_arr, labels=class_ids, average=None)

    print("单个类别的召回率:")
    for class_id, class_recall in zip(class_ids, per_class_recall):
        print('%d : %.4f '%(int(class_id), float(class_recall)))

## 采样策略（SMOTE 过采样 + 随机欠采样）

In [None]:
from collections import Counter  
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

# SMOTE oversampling
def smote_resample(X_train,y_train,sampling_strategy):
    """Oversample ``(X_train, y_train)`` with SMOTE and return the resampled pair.

    ``sampling_strategy`` is passed straight to ``SMOTE``; the sampler uses
    ``k_neighbors=5`` and ``random_state=42``. The class distribution is printed
    before and after resampling.
    """
    # Class distribution before resampling
    print("Original dataset shape", Counter(y_train))

    sampler = SMOTE(sampling_strategy=sampling_strategy, k_neighbors=5, random_state=42)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

    # Class distribution after resampling
    print("SMOTE resampled dataset shape", Counter(y_resampled))
    return X_resampled, y_resampled

# Random undersampling of the majority class
def undersample(X, y, keep_ratio=0.1, random_state=42):
    """Keep a random fraction of the label-0 rows plus every non-zero-label row.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows.
    y : pandas.Series
        Labels aligned with ``X``; label 0 is treated as the majority class.
    keep_ratio : float, default 0.1
        Fraction of the label-0 rows to keep (the row count is truncated to int).
    random_state : int or None, default 42
        Seed for both the row selection and the final shuffle, so repeated calls
        give the same result. ``None`` makes the call non-deterministic.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The reduced features and labels, shuffled with the same permutation so
        they stay aligned (both carry the same shuffled index).

    Raises
    ------
    ValueError
        If ``keep_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= keep_ratio <= 1:
        raise ValueError("keep_ratio must be between 0 and 1")

    # Class distribution before undersampling
    print("Original dataset shape", Counter(y))

    # Split into majority (label 0) and minority (every other label)
    X_majority = X[y == 0]
    y_majority = y[y == 0]
    X_minority = X[y != 0]
    y_minority = y[y != 0]

    # One seeded generator drives both the selection and the shuffle
    rng = np.random.RandomState(random_state)

    # Positions of the majority rows that are KEPT (not removed)
    n_keep = int(len(X_majority) * keep_ratio)
    keep_positions = rng.choice(len(X_majority), n_keep, replace=False)
    X_majority = X_majority.iloc[keep_positions]
    y_majority = y_majority.iloc[keep_positions]

    # Recombine
    X_undersampled = pd.concat((X_majority, X_minority), ignore_index=True)
    y_undersampled = pd.concat((y_majority, y_minority), ignore_index=True)

    # Shuffle features and labels with one shared permutation
    order = rng.permutation(len(X_undersampled))
    X_undersampled = X_undersampled.iloc[order]
    y_undersampled = y_undersampled.iloc[order]

    # Class distribution after undersampling
    print("undersample dataset shape", Counter(y_undersampled))

    return X_undersampled, y_undersampled

In [None]:
# Fault visualization: bar chart of the number of training samples per fault class
import matplotlib.font_manager as font_manager
import matplotlib.pyplot as plt
import logging

# Count the LABELS (y_train). Calling value_counts() on the feature frame X_train
# tallies duplicate feature rows, not fault classes. Label 0 is dropped explicitly
# (the metrics above treat 0 as "no fault") rather than assuming it is the most
# frequent value.
fault_counts = y_train[y_train != 0].value_counts().sort_index()
values = fault_counts.to_numpy(dtype=int)
categories = fault_counts.index.to_numpy(dtype=int)

# Raise matplotlib's log level to ERROR to hide WARNING-level messages
logging.getLogger('matplotlib').setLevel(logging.ERROR)

# Font for the Chinese title/label; fall back to the default font when the
# environment-specific font file is not present.
font_path = Path('/home/aistudio/external-libraries/Fonts/SIMHEI.TTF')
if font_path.exists():
    myfont = font_manager.FontProperties(fname=str(font_path))
else:
    myfont = font_manager.FontProperties()

# Bar chart
bars = plt.bar(categories, values)

# Titles, axis labels and ticks
plt.title('故障统计',fontproperties=myfont)
plt.xlabel('故障类别',fontproperties=myfont)
plt.ylabel('Values')
plt.xticks(categories)

# Annotate every bar with its count, shown as an integer
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.1, str(int(yval)),
             ha='center', va='bottom')

plt.show()

In [None]:
# Target sample count after SMOTE for classes 6, 8 and 9
sampling_strategy = dict.fromkeys((6, 8, 9), 50000)

# Rebuild the feature/label splits from the full frames
X_train = train_data[feature_columns]
y_train = train_data['机器状态']
X_test_1 = test_data["M201"][feature_columns]
y_test_1 = test_data["M201"]['机器状态']
X_test_2 = test_data["M202"][feature_columns]
y_test_2 = test_data["M202"]['机器状态']

# SMOTE oversampling first, then undersampling of label 0
X_train, y_train = undersample(*smote_resample(X_train, y_train, sampling_strategy))

## 模型构建

In [None]:
import random 
from lightgbm import LGBMClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
import catboost as cat
from sklearn.svm import SVC
import sklearn.svm as svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import make_scorer
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold,train_test_split,cross_val_score as CVS
from sklearn.metrics import accuracy_score,roc_auc_score,confusion_matrix,mean_squared_error,recall_score

### XGBoost

In [None]:
# Scorer wrapping the custom fault-accuracy metric
scorer = make_scorer(Fault_Accuracy)

# Hyper-parameter distributions sampled by the search
param_distributions = dict(
    n_estimators=sp_randint(100,300),
    learning_rate=[0.05, 0.08, 0.1],
    max_depth=range(3,10,2),
    subsample=[0.6, 0.8, 1.0],
)

# Base XGBoost classifier configuration
xgb_base = XGBClassifier(num_class = 10, objective = 'multi:softmax')

# Randomized search: 10 sampled combinations, 5-fold CV, all CPU cores, progress output
random_search = RandomizedSearchCV(
    xgb_base,
    param_distributions,
    scoring = scorer,
    n_iter=10,
    cv=5,
    random_state=1024,
    n_jobs=-1,
    verbose=10,
)

# Run the hyper-parameter search on the resampled training data
random_search.fit(X_train, y_train)

In [None]:
# Best estimator found by the search and the hyper-parameters that produced it
best_xgb = random_search.best_estimator_
xbg_best_params = random_search.best_params_
print("Best Parameters: ", xbg_best_params)

In [None]:
# Fit XGBoost with fixed hyper-parameters and evaluate it on the dev set.
# The model is bound to `xgb_model` rather than `xgb`, so the `import xgboost as xgb`
# module alias is no longer overwritten. `verbosity=0` replaces `verbose=-1`:
# XGBoost's logging control is `verbosity`, and `verbose` is not one of its parameters.
xgb_model = XGBClassifier(
    objective = 'multi:softmax',
    max_depth = 9,
    n_estimators = 170,
    num_class = 10,
    learning_rate=0.08,
    subsample = 0.6,
    verbosity=0,
)

xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_dev)

# Dev-set metrics
print('Accuracy : %.6f' % (accuracy_score(y_dev, y_pred)))
print('Fault_Accuracy : %.6f' % (Fault_Accuracy(y_dev, y_pred)))
print('Warning_Accuracy : %.6f' % (Warning_Accuracy(y_dev, y_pred)))
Recall(y_dev, y_pred)

In [None]:
# Confusion matrix of the dev-set predictions
cm = confusion_matrix(y_dev, y_pred)

# Draw it as a Seaborn heatmap on an explicit figure/axes pair
plt.rcParams['font.sans-serif'] = ['DejaVu Serif']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)

# Title and axis labels
ax.set_title('confusion_matrix')
ax.set_xlabel('y_pred')
ax.set_ylabel('y_dev')

plt.show()

### RF

In [None]:
# Scorer wrapping the custom fault-accuracy metric
scorer = make_scorer(Fault_Accuracy)

# Hyper-parameter distributions sampled by the search
param_dist = dict(
    n_estimators=sp_randint(100, 300),
    max_depth=sp_randint(3, 10),
    max_features=sp_randint(1, 11),
    min_samples_split=sp_randint(2, 11),
)

# Base random-forest classifier
rf_base = RandomForestClassifier(random_state=1024)

# Randomized search: 20 sampled combinations, 5-fold CV, all processors
random_search = RandomizedSearchCV(
    rf_base,
    param_dist,
    scoring = scorer,
    n_iter=20,
    cv=5,
    random_state=1024,
    n_jobs=-1,
    verbose = 10,
)

# Run the hyper-parameter search on the resampled training data
random_search.fit(X_train, y_train)

In [None]:
# Best random-forest estimator and its hyper-parameters
best_rf = random_search.best_estimator_
rf_best_params = random_search.best_params_
print("Best Parameters: ", rf_best_params)

# Dev-set evaluation
y_pred = best_rf.predict(X_dev)

rf_dev_accuracy = accuracy_score(y_dev, y_pred)
print('Accuracy : %.6f' % (rf_dev_accuracy))
rf_dev_fault_accuracy = Fault_Accuracy(y_dev, y_pred)
print('Fault_Accuracy : %.6f' % (rf_dev_fault_accuracy))
rf_dev_warning_accuracy = Warning_Accuracy(y_dev, y_pred)
print('Warning_Accuracy : %.6f' % (rf_dev_warning_accuracy))
Recall(y_dev, y_pred)

### CatBoost

In [None]:
 # 初始化CatBoost分类器  
# Scorer wrapping the custom fault-accuracy metric
scorer = make_scorer(Fault_Accuracy)

# Hyper-parameter search space (discrete candidate values)
param_grid = dict(
    iterations=[500, 800, 1000],
    learning_rate=[0.03, 0.05, 0.08],
    depth=[5, 6, 8],
    l2_leaf_reg=[1, 3, 5],
)

# Base CatBoost classifier
# NOTE(review): early_stopping_rounds is set but no eval_set is passed by the search;
# confirm whether early stopping actually takes effect here.
cat_base = CatBoostClassifier(
    loss_function='MultiClass',
    early_stopping_rounds=50,
    random_state=1024,
    verbose = 300,
)

# Randomized search: 10 sampled combinations, 5-fold CV, all CPU cores
random_search = RandomizedSearchCV(
    cat_base,
    param_grid,
    scoring = scorer,
    n_iter=10,
    cv=5,
    random_state=1024,
    n_jobs=-1,
    verbose = 10,
)

# Run the hyper-parameter search on the resampled training data
random_search.fit(X_train, y_train)

In [None]:
# Best CatBoost estimator and its hyper-parameters
best_cat = random_search.best_estimator_
cat_best_params = random_search.best_params_
print("Best Parameters: ", cat_best_params)

# Dev-set evaluation
y_pred_cat = best_cat.predict(X_dev)

cat_dev_accuracy = accuracy_score(y_dev, y_pred_cat)
print('Accuracy : %.6f' % (cat_dev_accuracy))
cat_dev_fault_accuracy = Fault_Accuracy(y_dev, y_pred_cat)
print('Fault_Accuracy : %.6f' % (cat_dev_fault_accuracy))
cat_dev_warning_accuracy = Warning_Accuracy(y_dev, y_pred_cat)
print('Warning_Accuracy : %.6f' % (cat_dev_warning_accuracy))
Recall(y_dev, y_pred_cat)

## Soft Voting Classifier

### 交叉验证+Soft Voting

In [None]:
from sklearn.model_selection import StratifiedKFold
import gc

def build_model(model_type):
    """Return a new, unfitted classifier for the given short model name.

    Supported names are ``"lgb"``, ``"xgb"``, ``"cat"``, ``"rf"`` and ``"lr"``.
    Any other value raises ``ValueError("Unknown model type")``.
    """
    if model_type not in ("lgb", "xgb", "cat", "rf", "lr"):
        raise ValueError("Unknown model type")

    # Factories are lambdas so only the requested estimator is constructed
    factories = {
        "lgb": lambda: LGBMClassifier(
            objective="multiclass",
            num_class=10,
            n_estimators=300
        ),
        "xgb": lambda: XGBClassifier(
            objective="multi:softprob",
            num_class=10,
            learning_rate=0.08,
            eval_metric="mlogloss"
        ),
        "cat": lambda: CatBoostClassifier(
            iterations=500,
            verbose=0,
            loss_function="MultiClass"
        ),
        "rf": lambda: RandomForestClassifier(
            n_estimators=300,
            random_state=42,
            n_jobs=-1
        ),
        "lr": lambda: LogisticRegression(
            max_iter=1000,
            multi_class="multinomial",
            n_jobs=-1
        ),
    }
    return factories[model_type]()
    
def cv_model(data_, test_, y_, model_type, n_splits=5):
    """Cross-validate one model type and return fold-averaged test probabilities.

    A stratified, shuffled K-fold split (``random_state=42``) is used. In each
    fold a fresh model from ``build_model(model_type)`` is fitted on the training
    part; its ``predict_proba`` output fills the out-of-fold matrix for the
    validation rows and adds ``1 / n_splits`` of its test probabilities to the
    running average. Per-fold and overall out-of-fold metrics are printed.

    Predicted labels are the argmax column index, so the printed metrics assume
    the labels are 0..n_classes-1 in column order.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(test_), n_classes)`` with the averaged probabilities,
        where ``n_classes`` is the number of distinct values in ``y_``.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    class_count = len(np.unique(y_))
    oof_proba = np.zeros((len(data_), class_count))
    test_proba = np.zeros((len(test_), class_count))

    fold_no = 0
    for train_pos, valid_pos in splitter.split(data_, y_):
        fold_no += 1
        print(f"\n===== Fold {fold_no} =====")

        valid_y = y_.iloc[valid_pos]

        fold_model = build_model(model_type)
        fold_model.fit(data_.iloc[train_pos], y_.iloc[train_pos])

        oof_proba[valid_pos] = fold_model.predict_proba(data_.iloc[valid_pos])
        test_proba += fold_model.predict_proba(test_) / n_splits

        fold_labels = np.argmax(oof_proba[valid_pos], axis=1)

        print("Acc:", accuracy_score(valid_y, fold_labels))
        print("Fault Acc:", Fault_Accuracy(valid_y, fold_labels))

        # Release the fold's model before the next one is trained
        del fold_model
        gc.collect()

    print("\n===== Overall =====")
    oof_labels = np.argmax(oof_proba, axis=1)
    print("Acc:", accuracy_score(y_, oof_labels))
    print("Fault Acc:", Fault_Accuracy(y_, oof_labels))
    print("Warning Acc:", Warning_Accuracy(y_, oof_labels))
    Recall(y_, oof_labels)

    return test_proba

def cv_soft_voting(data_, test_, y_):
    """Soft-vote the cross-validated XGBoost, random-forest and CatBoost models.

    Each model type is run through ``cv_model``; the returned test-set probability
    matrices are averaged with equal weight and the argmax column is the prediction.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Training features.
    test_ : pandas.DataFrame
        Features to predict on.
    y_ : pandas.Series
        Training labels.

    Returns
    -------
    numpy.ndarray
        Predicted class index for every row of ``test_``.
    """
    models = ['xgb', 'rf', 'cat']
    # Size the accumulator from the labels, exactly as cv_model sizes its output,
    # instead of hard-coding 10 columns.
    n_classes = len(np.unique(y_))
    voting_result = np.zeros((test_.shape[0], n_classes))

    for m in models:
        print('=======================')
        print(f"Training {m} ...")
        sub_pred = cv_model(data_, test_, y_, model_type=m)
        voting_result += sub_pred / len(models)
        print(f"{m} done.")
        print('=======================')

    pred = np.argmax(voting_result, axis=1)
    return pred

In [None]:
# Cross-validated soft voting: train on the resampled training set, predict the dev set
y_pred_cv_voting = cv_soft_voting(data_=X_train, test_=X_dev, y_=y_train)

In [None]:
# Dev-set metrics of the cross-validated soft-voting ensemble
cv_voting_accuracy = accuracy_score(y_dev, y_pred_cv_voting)
print('Accuracy : %.6f' % (cv_voting_accuracy))
cv_voting_fault_accuracy = Fault_Accuracy(y_dev, y_pred_cv_voting)
print('Fault_Accuracy : %.6f' % (cv_voting_fault_accuracy))
cv_voting_warning_accuracy = Warning_Accuracy(y_dev, y_pred_cv_voting)
print('Warning_Accuracy : %.6f' % (cv_voting_warning_accuracy))
Recall(y_dev, y_pred_cv_voting)

In [None]:
# Final prediction. X_train here is only 80% of the data; the real prediction
# should be trained on all of it.
# cv_soft_voting returns a single array of predicted labels, so it is bound to one
# name; unpacking it into two names raises (or silently splits a two-row result).
final_pred_1 = cv_soft_voting(X_train, X_test_1, y_train)
final_pred_2 = cv_soft_voting(X_train, X_test_2, y_train)

### Soft Voting

In [None]:
def voting_model(data_, test_, y_, model_type):
    """Fit one model on the full training data and predict the test rows.

    ``model_type`` is one of ``'lgb'``, ``'xgb'``, ``'cat'`` or ``'rf'``; any other
    value raises ``ValueError("Unknown model type")``. The class count passed to
    LightGBM and XGBoost is the number of distinct values in ``y_``.

    Returns
    -------
    tuple
        ``(predicted labels, predicted class probabilities)`` for ``test_``.
    """
    class_count = len(np.unique(y_))

    if model_type not in ('lgb', 'xgb', 'cat', 'rf'):
        raise ValueError("Unknown model type")

    # Factories are lambdas so only the requested estimator is constructed
    constructors = {
        'lgb': lambda: LGBMClassifier(objective='multiclass', num_class=class_count, verbose=-1),
        'xgb': lambda: XGBClassifier(objective='multi:softprob', num_class=class_count,
                                     learning_rate=0.08, max_depth=9, n_estimators=170,
                                     subsample=0.6, verbosity=0),
        'cat': lambda: CatBoostClassifier(iterations=1000, learning_rate=0.03, depth=6,
                                          l2_leaf_reg=3, verbose=500, random_state=42,
                                          loss_function='MultiClass'),
        'rf': lambda: RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    }
    estimator = constructors[model_type]()

    estimator.fit(data_, y_)
    hard_labels = estimator.predict(test_)
    class_proba = estimator.predict_proba(test_)

    return hard_labels, class_proba

def soft_voting(data_, test_, y_):
    """Soft-vote XGBoost, random forest and CatBoost, each fitted on all of ``data_``.

    Every model's test probabilities are averaged with equal weight; the final
    prediction is the argmax column of that average.

    Returns
    -------
    tuple
        ``(list of each model's own predicted labels, ensemble predictions)``.
    """
    model_names = ['xgb', 'rf', 'cat']
    class_count = len(np.unique(y_))
    averaged_proba = np.zeros((test_.shape[0], class_count))
    per_model_labels = []

    for model_name in model_names:
        print(f"Training {model_name} ...")
        hard_labels, class_proba = voting_model(data_, test_, y_, model_type=model_name)
        per_model_labels.append(hard_labels)
        averaged_proba += class_proba / len(model_names)

    return per_model_labels, np.argmax(averaged_proba, axis=1)

In [None]:
# Plain soft voting: train on the resampled training set, predict the dev set
y_preds, y_pred_voting = soft_voting(data_=X_train, test_=X_dev, y_=y_train)


In [None]:
# Dev-set metrics of the soft-voting ensemble
voting_accuracy = accuracy_score(y_dev, y_pred_voting)
print('Accuracy : %.6f' % (voting_accuracy))
voting_fault_accuracy = Fault_Accuracy(y_dev, y_pred_voting)
print('Fault_Accuracy : %.6f' % (voting_fault_accuracy))
voting_warning_accuracy = Warning_Accuracy(y_dev, y_pred_voting)
print('Warning_Accuracy : %.6f' % (voting_warning_accuracy))
Recall(y_dev, y_pred_voting)

## stacking Classifier

In [None]:
# Base learners for stacking
lgb_params = {
    'objective': 'multiclass',
    'num_class': 10,
    'verbose':-1,
    'n_estimators':100}

models = [
        XGBClassifier(
            # 'multi:softprob' is the objective that outputs per-class probabilities,
            # which the stacking functions read through predict_proba; it also
            # matches the XGBoost configuration used in the voting cells.
            objective = 'multi:softprob',
            num_class = 10,
            # Was misspelled `earning_rate`, so the intended 0.08 was never applied.
            learning_rate=0.08,
            # XGBoost's logging control is `verbosity`; `verbose` is not a parameter.
            verbosity=0,
            ),

        CatBoostClassifier(
            iterations=1000,  # maximum number of boosting iterations
            learning_rate=0.03,  # learning rate
            depth=6,  # maximum tree depth
            l2_leaf_reg=3,  # L2 regularization coefficient
            loss_function='MultiClass',  # loss function
            thread_count=4,  # number of threads
            random_state=42,  # random seed
            early_stopping_rounds=50,  # early-stopping patience in iterations
            verbose = 300,
            ),

        RandomForestClassifier(
            n_estimators=100,
            random_state=42,
            )
        ]

### 交叉验证+stacking

In [None]:

def cv_stacking_model(data_, test_, y_, models):
    """Two-level stacking with out-of-fold probabilities as meta-features.

    For every base model, a 5-fold stratified, shuffled split (``random_state=42``)
    is run. Each fold trains a ``deepcopy`` of the base model, writes its
    validation-fold probabilities into that model's column block of the training
    meta-matrix, and adds one fifth of its test probabilities to the model's test
    block. A ``LogisticRegression(max_iter=1000)`` is then fitted on the training
    meta-matrix and used to predict the test meta-matrix.

    CatBoost models are fitted with the validation fold as ``eval_set``.
    NOTE(review): that fold is also the one scored out-of-fold, so any early
    stopping is tuned on the rows it is then evaluated on; confirm this is intended.

    Returns
    -------
    numpy.ndarray
        Labels predicted by the second-level logistic regression for ``test_``.
    """
    class_count = len(np.unique(y_))
    test_rows = test_.shape[0]
    meta_train = np.zeros((data_.shape[0], len(models) * class_count))
    meta_test = np.zeros((test_rows, len(models) * class_count))

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for model_pos, prototype in enumerate(models):
        print(f"Training base model {model_pos+1}/{len(models)} ...")
        # Column block of the meta-matrices that belongs to this base model
        block = slice(model_pos * class_count, (model_pos + 1) * class_count)
        test_accum = np.zeros((test_rows, class_count))

        for fold_pos, (train_pos, valid_pos) in enumerate(splitter.split(data_, y_)):
            print(f"Fold {fold_pos+1}")
            fit_x, fit_y = data_.iloc[train_pos], y_.iloc[train_pos]
            hold_x, hold_y = data_.iloc[valid_pos], y_.iloc[valid_pos]

            # Fresh copy per fold so the shared base model is never fitted
            fold_model = deepcopy(prototype)

            # CatBoost gets the held-out fold as eval_set for early stopping
            if isinstance(fold_model, CatBoostClassifier):
                fold_model.fit(fit_x, fit_y, eval_set=(hold_x, hold_y))
            else:
                fold_model.fit(fit_x, fit_y)

            hold_proba = fold_model.predict_proba(hold_x)
            test_fold_proba = fold_model.predict_proba(test_)

            meta_train[valid_pos, block] = hold_proba
            test_accum += test_fold_proba / splitter.n_splits

            hold_labels = np.argmax(hold_proba, axis=1)
            print('accuracy : %.6f    Fault_Accuracy : %.6f' %
                  (accuracy_score(hold_y, hold_labels),
                   Fault_Accuracy(hold_y, hold_labels)))

        # Fold-averaged test probabilities of this base model
        meta_test[:, block] = test_accum

    # Second level
    meta_learner = LogisticRegression(max_iter=1000)
    meta_learner.fit(meta_train, y_)
    return meta_learner.predict(meta_test)

In [None]:
# Cross-validated stacking: train on the resampled training set, predict the dev set
y_pred_cv_stacking = cv_stacking_model(data_=X_train, test_=X_dev, y_=y_train, models=models)

# Dev-set metrics
cv_stacking_accuracy = accuracy_score(y_dev, y_pred_cv_stacking)
print('Accuracy : %.6f' % (cv_stacking_accuracy))
cv_stacking_fault_accuracy = Fault_Accuracy(y_dev, y_pred_cv_stacking)
print('Fault_Accuracy : %.6f' % (cv_stacking_fault_accuracy))
cv_stacking_warning_accuracy = Warning_Accuracy(y_dev, y_pred_cv_stacking)
print('Warning_Accuracy : %.6f' % (cv_stacking_warning_accuracy))
Recall(y_dev, y_pred_cv_stacking)


In [None]:
# Final prediction. X_train here is only 80% of the data; the real prediction
# should be trained on all of it.
# cv_stacking_model returns a single array of predicted labels, so it is bound to
# one name; unpacking it into two names raises (or silently splits a two-row result).
final_pred_1 = cv_stacking_model(X_train, X_test_1, y_train, models)
final_pred_2 = cv_stacking_model(X_train, X_test_2, y_train, models)


### stacking

In [None]:
def stacking_model(data_, test_, y_, models):
    """Two-level stacking without cross-validation.

    Every base model is fitted on the full training data; its predicted class
    probabilities for the training and test rows become one block of meta-features.
    A logistic regression fitted on the training meta-features predicts the test rows.

    The estimators in ``models`` are fitted in place. The training meta-features
    are in-sample predictions of the base models.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Training features.
    test_ : pandas.DataFrame
        Features to predict on.
    y_ : pandas.Series
        Training labels.
    models : list
        Base estimators exposing ``fit`` and ``predict_proba``.

    Returns
    -------
    numpy.ndarray
        Labels predicted by the second-level logistic regression for ``test_``.
    """
    n_classes = len(np.unique(y_))
    X_train_stack = np.zeros((data_.shape[0], len(models)*n_classes))
    X_test_stack = np.zeros((test_.shape[0], len(models)*n_classes))

    for i, model in enumerate(models):
        print(f"Training model {i+1}/{len(models)}")
        model.fit(data_, y_)

        # Meta-features of this base model for the training and test sets
        block = slice(i*n_classes, (i+1)*n_classes)
        X_train_stack[:, block] = model.predict_proba(data_)
        X_test_stack[:, block] = model.predict_proba(test_)

        # Training-set accuracy (debugging aid)
        y_data = np.argmax(X_train_stack[:, block], axis=1)
        print(f"Training accuracy: {accuracy_score(y_, y_data):.6f}, Fault_Accuracy: {Fault_Accuracy(y_, y_data):.6f}")

    # Second level: built once, after the loop. It used to be constructed inside
    # the loop body, i.e. re-created per base model and undefined for an empty list.
    lr = LogisticRegression(
        max_iter=1000,
        solver='lbfgs',
        penalty='l2',
        C=1.0,
    )
    lr.fit(X_train_stack, y_)
    pred = lr.predict(X_test_stack)

    return pred

## 预测结果保存

In [None]:
# Write the predictions into the label column of each test frame
test_data["M201"]['机器状态'] = final_pred_1
test_data["M202"]['机器状态'] = final_pred_2

# to_csv does not create missing directories, so make sure the output folder exists
result_dir = Path('result')
result_dir.mkdir(parents=True, exist_ok=True)

test_data["M201"].to_csv(result_dir / 'M201.csv', index=False)
test_data["M202"].to_csv(result_dir / 'M202.csv', index=False)
