# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>1 |</span></b> <b>Importing Libraries</b></div>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
!pip install catboost
!pip install lightgbm
!pip install xgboost

from datetime import datetime
from IPython.display import HTML as html_print
from termcolor import colored
from IPython.display import display

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="scipy")

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>2 |</span></b> <b>Adjusting Row & Column Settings</b></div>

In [None]:
# Notebook-wide pandas display settings: no cap on the number of printed
# columns, auto-detected output width, floats rendered with three decimals.
pd.options.display.max_columns = None
#pd.set_option('display.max_rows', None)
pd.options.display.width = None
pd.options.display.float_format = '{:.3f}'.format

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>3 |</span></b> <b>Loading The Data Set</b></div>

In [None]:
df = pd.read_csv('D:/code/junma/600000/0424/2.csv')

In [None]:
def print_section_title(title):
    """Print ``title`` in bold, underlined blue text."""
    styled_title = colored(title, 'blue', attrs=['bold', 'underline'])
    print(styled_title)
    
def display_head_and_tail(dataframe, head=5):
    """Show the first and the last ``head`` rows of ``dataframe`` as captioned tables."""
    sections = (
        ("Head", dataframe.head(head)),
        ("Tail", dataframe.tail(head)),
    )
    for caption, rows in sections:
        display(rows.style.set_caption(caption))

def display_na(dataframe):
    """Show a two-column table with the number of null values per column."""
    null_counts = dataframe.isnull().sum()
    # Name the index and the values directly instead of renaming afterwards.
    na_table = null_counts.rename_axis('Column').reset_index(name='Number of NA')
    display(na_table.style.set_caption("Number of NA Values"))

def display_quantiles(dataframe):
    """Show ``describe()`` statistics (transposed) at a fixed set of percentiles."""
    percentiles = [0, 0.05, 0.50, 0.95, 0.99, 1]
    summary = dataframe.describe(percentiles).T
    styled_summary = summary.style.format("{:.2f}").set_caption("Quantiles")
    display(styled_summary)

def check_df(dataframe, head=5):
    """Print/display a quick overview of ``dataframe``.

    Sections, in order: shape, column dtypes, ``info()`` report, head and
    tail rows, null counts per column, and quantile statistics.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    head : int, optional
        Number of rows shown in the head and in the tail tables.
    """
    print_section_title('Shape')
    print(dataframe.shape)

    print_section_title('Types')
    # A Styler has to be rendered with display(); print() would only emit its
    # object repr ("<pandas.io.formats.style.Styler object at ...>").
    display(dataframe.dtypes.to_frame('Data Type').style.set_caption("Data Types"))

    print_section_title('Info')
    # DataFrame.info() writes the report itself and returns None, so wrapping
    # it in print() would add a stray "None" line.
    dataframe.info()

    print_section_title('Head & Tail')
    display_head_and_tail(dataframe, head)

    print_section_title('NA Values')
    display_na(dataframe)

    print_section_title('Quantiles')
    display_quantiles(dataframe)

In [None]:
check_df(df)

In [None]:
df.head()

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>4 |</span></b> <b>Capturing / Detecting Numeric and Categorical Variables</b></div>

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20, force_cat_cols=None, force_num_cols=None):
    """
    Split the columns of a dataframe into categorical, numeric and cardinal groups.

    A column is classified from its dtype (object vs. non-object) and its
    number of unique values. Specific columns can be forced into the
    categorical or the numeric group regardless of that classification.
    A short summary (row/column counts and group sizes) is printed.

    Parameters
    ------
        dataframe: dataframe
                Dataframe whose columns are classified
        cat_th: int, optional
                Non-object columns with fewer than ``cat_th`` unique values
                are treated as categorical
        car_th: int, optional
                Object columns with more than ``car_th`` unique values are
                treated as cardinal
        force_cat_cols: list, optional
                Column names to force into the categorical list
        force_num_cols: list, optional
                Column names to force into the numeric list; applied after
                ``force_cat_cols``, so it wins if a name appears in both

    Returns
    ------
        cat_cols: list
                Categorical columns (object columns that are not cardinal,
                numeric-but-categorical columns, forced categorical columns)
        num_cols: list
                Numeric columns
        cat_but_car: list
                Object columns with high cardinality
        num_but_cat: list
                Non-object columns with few unique values
    """
    force_cat_cols = [] if force_cat_cols is None else force_cat_cols
    force_num_cols = [] if force_num_cols is None else force_num_cols

    # Inspect every column once; the group definitions below reuse these facts
    # instead of recomputing dtype and nunique() for each list.
    columns = list(dataframe.columns)
    is_object = {col: dataframe[col].dtypes == "O" for col in columns}
    n_unique = {col: dataframe[col].nunique() for col in columns}

    # Non-object columns with few distinct values (unless forced numeric).
    num_but_cat = [col for col in columns
                   if n_unique[col] < cat_th
                   and not is_object[col]
                   and col not in force_num_cols]

    # Object columns with too many distinct values to be useful categories.
    cat_but_car = [col for col in columns
                   if n_unique[col] > car_th and is_object[col]]

    # Object columns plus numeric-but-categorical ones, minus cardinal columns.
    cat_cols = [col for col in columns if is_object[col]] + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # Remaining non-object columns, plus anything explicitly forced numeric.
    num_cols = [col for col in columns
                if (not is_object[col] and col not in num_but_cat)
                or col in force_num_cols]

    # Forced categorical columns leave every other group and join cat_cols.
    for col in force_cat_cols:
        if col in num_cols:
            num_cols.remove(col)
        if col in num_but_cat:
            num_but_cat.remove(col)
        if col in cat_but_car:
            cat_but_car.remove(col)
        if col not in cat_cols:
            cat_cols.append(col)

    # Forced numeric columns leave every other group and join num_cols.
    for col in force_num_cols:
        if col in cat_cols:
            cat_cols.remove(col)
        if col in num_but_cat:
            num_but_cat.remove(col)
        if col in cat_but_car:
            cat_but_car.remove(col)
        if col not in num_cols:
            num_cols.append(col)

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car, num_but_cat



# Columns to force-treat as numeric variables
#forced_numerical = ['m_dw1', 'm_dw4', 'm_dw5']

# Columns to force-treat as categorical variables
    # Build the forced numeric column list


In [None]:
cat_cols, num_cols, cat_but_car,  num_but_cat = grab_col_names(df)

In [None]:
cat_cols

In [None]:
num_cols

In [None]:
cat_but_car

In [None]:
num_but_cat

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>5 |</span></b> <b>Analysis of Categorical Variables</b></div>

In [None]:
def cat_summary(dataframe, col_name, plot=False):
    """Print counts and percentage share of every value in ``col_name``.

    When ``plot`` is true, a count plot of the column is shown as well.
    """
    counts = dataframe[col_name].value_counts()
    summary = pd.DataFrame({col_name: counts,
                            'Ratio': 100 * counts / len(dataframe)})
    print(summary)
    print('##########################################')

    if not plot:
        return

    plt.figure(figsize=(20,8))
    sns.countplot(x=dataframe[col_name], data=dataframe)
    # Vertical tick labels keep long category names readable.
    plt.xticks(rotation=90)
    plt.show(block=True)

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd
#
#
# df = pd.read_csv('D:/code/junma/600000/0424/1.csv')
# def plot_device_parameters(dataframe, device_name, subsystem_col='subsystem', time_col='created_at'):
#     """
#     绘制指定设备的所有参数随时间变化的图表
#
#     参数:
#     dataframe -- 包含设备数据的DataFrame
#     device_name -- 要绘制的设备名称
#     subsystem_col -- 包含子系统/设备名称的列名
#     time_col -- 包含时间戳的列名
#     """
#     # 筛选指定设备的数据
#     device_data = dataframe[dataframe[subsystem_col] == device_name]
#
#     if device_data.empty:
#         print(f"未找到设备: {device_name}")
#         return
#
#     # 获取所有数值型参数列（排除非数值列）
#     numeric_cols = device_data.select_dtypes(include=['number']).columns.tolist()
#
#     # 创建图表
#     plt.figure(figsize=(20, 10))
#
#     # 为每个数值参数创建子图
#     for i, col in enumerate(numeric_cols, 1):
#         plt.subplot(len(numeric_cols), 1, i)
#         sns.lineplot(x=time_col, y=col, data=device_data)
#         plt.title(f"{device_name} - {col} 随时间变化")
#         plt.xlabel('时间')
#         plt.ylabel(col)
#         plt.xticks(rotation=45)
#
#     plt.tight_layout()
#     plt.show()
#
# def analyze_devices(dataframe, subsystem_col='subsystem', time_col='created_at', sample_devices=None):
#     """
#     分析所有设备的时间序列数据
#
#     参数:
#     dataframe -- 包含设备数据的DataFrame
#     subsystem_col -- 包含子系统/设备名称的列名
#     time_col -- 包含时间戳的列名
#     sample_devices -- 可选，指定要分析的设备列表（None表示分析所有设备）
#     """
#      # 自动检测列名
#     if subsystem_col is None:
#         possible_names = ['subsystem', 'Subsystem', 'device', 'Device', 'device_name']
#         for name in possible_names:
#             if name in dataframe.columns:
#                 subsystem_col = name
#                 break
#         if subsystem_col is None:
#             raise ValueError("无法识别设备名称列，请手动指定 subsystem_col 参数")
#         print(f"使用的设备名称列: {subsystem_col}")
#     dataframe[time_col] = pd.to_datetime(dataframe[time_col])
#
#     # 确保时间列是datetime类型
#     dataframe[time_col] = pd.to_datetime(dataframe[time_col])
#
#     # 获取所有设备名称
#     all_devices = dataframe[subsystem_col].unique()
#
#     # 如果指定了样本设备，只分析这些设备
#     if sample_devices is not None:
#         devices_to_analyze = [d for d in sample_devices if d in all_devices]
#     else:
#         devices_to_analyze = all_devices
#
#     print(f"将分析 {len(devices_to_analyze)} 台设备的数据...")
#
#     # 为每台设备绘制图表
#     for device in devices_to_analyze:
#         print(f"\n正在分析设备: {device}")
#         plot_device_parameters(dataframe, device, subsystem_col, time_col)
#
#         # 添加设备参数的统计摘要
#         device_data = dataframe[dataframe[subsystem_col] == device]
#         print(f"\n设备 {device} 的参数统计摘要:")
#         print(device_data.describe())
#
# # 使用示例:
# # 1. 分析所有设备
# analyze_devices(df)
# # 选择第一个subsystem列
# # df = df.rename(columns={df.columns[0]: 'subsystem'})
# # df = df.loc[:, ~df.columns.duplicated()]  # 删除重复列
#
# # 2. 分析特定设备（例如NX16-L102）
# analyze_devices(df, sample_devices=['NX16-L102'])
#
# # 3. 分析前5台设备
# analyze_devices(df, sample_devices=df['subsystem'].unique()[:1])

In [None]:
for col in cat_cols:
    cat_summary(df, col, plot=True)

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>6 |</span></b> <b>Analysis of Numerical Variables</b></div>

In [None]:
def num_summary(dataframe, numerical_col, plot=False):
    """Print descriptive statistics of ``numerical_col`` at a fixed percentile grid.

    When ``plot`` is true, a 20-bin histogram of the column is shown as well.
    """
    percentile_grid = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50,
                       0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    stats = dataframe[numerical_col].describe(percentile_grid)
    print(stats.T)

    if not plot:
        return

    dataframe[numerical_col].hist(bins=20)
    plt.xlabel(numerical_col)
    plt.title(numerical_col)
    plt.show(block=True)

In [None]:
for col in num_cols:
    num_summary(df, col, plot=True)

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>7 |</span></b> <b>Analysis of Categorical Variables by Target</b></div>

In [None]:
def target_summary_with_cat(dataframe, target, categorical_col, plot=False):
    """Print the mean of ``target`` for every value of ``categorical_col``.

    When ``plot`` is true, a bar plot of ``target`` by category is shown as well.
    """
    target_means = dataframe.groupby(categorical_col)[target].mean()
    print(pd.DataFrame({'TARGET_MEAN': target_means}), end='\n\n\n')

    if not plot:
        return

    plt.figure(figsize=(20,8))
    sns.barplot(x=categorical_col, y=target, data=dataframe)
    plt.xticks(rotation=90)
    plt.show(block=True)

In [None]:
df = pd.read_csv('D:/code/junma/600000/0424/2.csv')
# df = pd.DataFrame("D:/code/junma/600000/0424/1.csv")
# for index, row in df.iterrows():
#     for spindle in range(1, 101):
#             # 构建目标变量 (y)
#         is_break = 1 if row[f'D_dw{spindle}'] == 3 else 0
#         return DataFrame
# y = df['broken_status']
# 读取数据
# df = pd.read_csv('D:/code/junma/600000/0424/second_final.csv')
#
# # 向量化处理所有 D_dw 列
# for spindle in range(1, 101):
#     col_name = f'D_dw{spindle}'
#     if col_name in df.columns:
#         df[f'break_{spindle}'] = (df[col_name] == 3).astype(int)
#
# # 保存结果（可选）
# df.to_csv('processed_data.csv', index=False)


# # num_cols = [col for col in num_cols if col not in ["M_dw5"]]
# import pandas as pd
#
# # 读取CSV文件
# df = pd.read_csv('D:/code/junma/600000/0424/second_final.csv')
#
# # 为每个锭子(1-100)创建is_break列
# for spindle in range(1, 101):
#     df[f'is_break_dw{spindle}'] = df[f'D_dw{spindle}'].apply(lambda x: 1 if x == 3 else 0)
#
# # 现在df中新增了is_break_dw1到is_break_dw100列
# print(df.head())
#
# # 如果你想要一个汇总的is_break列(所有锭子中是否有断纱)
# df['any_break'] = df[[f'D_dw{i}' for i in range(1, 101)]].apply(lambda row: any(x == 3 for x in row), axis=1)
# Flag column for D_dw1: 1 where D_dw1 equals 3, otherwise 0.
# A vectorised comparison replaces the row-wise apply(lambda ...).
break_col = 'is_break_dw1'
df[break_col] = (df['D_dw1'] == 3).astype(int)

# Filter the flagged rows once and reuse the result for every report below.
break_rows = df[df[break_col] == 1]

if not break_rows.empty:
    print("is_break_dw1 列中存在值为 1 的元素。")
    # Row labels of the flagged rows.
    rows_with_one = break_rows.index.tolist()
    print(f"值为 1 的元素所在的行索引为: {rows_with_one}")
    # Full content of the flagged rows.
    print("值为 1 的元素所在的完整行信息：")
    print(break_rows)
else:
    print("is_break_dw1 列中不存在值为 1 的元素。")

# Number of 0 and 1 entries in the flag column.
count_zeros = (df[break_col] == 0).sum()
count_ones = (df[break_col] == 1).sum()

print(f"is_break_dw1 列中值为 0 的元素个数为: {count_zeros}")
print(f"is_break_dw1 列中值为 1 的元素个数为: {count_ones}")

# Raw values of the flag column (previously printed twice in a row).
print("is_break_dw1 列的值为：")
print(df[break_col].values)

In [None]:
for col in cat_cols:
    target_summary_with_cat(df, 'is_break_dw1', col, plot=True)
# # 数据预处理
# # 1. 检查并预处理数据
# print("broken_status唯一值:", df['broken_status'].unique())
# print("类别列缺失值统计:")
# print(df[cat_cols].isnull().sum())
#
# # 更新映射字典以包含所有可能的值
# value_mapping = {'正常':0, '断纱':1, '完好':0, '断纱':1}  # 根据实际情况添加
# df['broken_status'] = df['broken_status'].map(value_mapping).fillna(-1)  # 未映射的值设为-1
#
# # 2. 处理缺失值
# df = df.dropna(subset=cat_cols + ['broken_status'])  # 删除关键列缺失的行
#
# # 3. 修改后的函数
# def target_summary_with_cat(df, target_col, cat_col, plot=True):
#     # 检查列是否存在
#     if cat_col not in df.columns or target_col not in df.columns:
#         print(f"错误: 列 {cat_col} 或 {target_col} 不存在")
#         return None
#
#     # 执行分组
#     result = df.groupby(cat_col)[target_col].mean()
#
#     # 检查结果是否为空
#     if result.empty:
#         print(f"警告: {cat_col} 列分组结果为空")
#         return None
#
#     # 可视化
#     if plot:
#         result.plot(kind='bar')
#         plt.title(f'{cat_col} vs {target_col}')
#         plt.ylabel('断纱比例')
#         plt.show()
#
#     return result
#
# # 4. 执行分析
# for col in cat_cols:
#     target_summary_with_cat(df, 'broken_status', col, plot=True)

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>8 |</span></b> <b>Analysis of Numeric Variables by Target</b></div>

In [None]:
def target_summary_with_num(dataframe, target, numerical_col, plot=False):
    """Print the mean of ``numerical_col`` for every value of ``target``.

    When ``plot`` is true, a bar plot of ``numerical_col`` by target is shown as well.
    """
    column_label = numerical_col + '_mean'
    group_means = dataframe.groupby(target)[numerical_col].mean()
    print(pd.DataFrame({column_label: group_means}), end='\n\n\n')

    if not plot:
        return

    sns.barplot(x=target, y=numerical_col, data=dataframe)
    plt.show(block=True)

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>9 |</span></b> <b>Analysis of Correlation</b></div>

In [None]:
def high_correlated_cols(dataframe, plot=False, corr_th=0.70):
    """Return the columns whose absolute correlation with an earlier column exceeds ``corr_th``.

    Only the strict upper triangle of the absolute correlation matrix is
    inspected, so each pair is considered once and the later column of a
    pair is the one reported. When ``plot`` is true, an annotated heatmap of
    the (signed) correlation matrix is shown as well.
    """
    corr = dataframe.corr(numeric_only=True)
    abs_corr = corr.abs()

    # Boolean mask that keeps only the entries above the main diagonal.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)

    drop_list = []
    for col in upper.columns:
        if (upper[col] > corr_th).any():
            drop_list.append(col)

    if plot:
        import seaborn as sns
        import matplotlib.pyplot as plt
        sns.set(rc={'figure.figsize': (16, 14)})
        sns.heatmap(corr, cmap="RdBu", annot=True, fmt=".2f")
        plt.show()

    return drop_list

In [None]:
high_correlated_cols(df, plot=True)

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>10 |</span></b> <b>Distribution of the Dependent Variable</b></div>

In [None]:
df["D_dw1"].hist(bins=100)
plt.show(block=True)

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>11 |</span></b> <b>Examining the Logarithm of the Dependent Variable</b></div>

In [None]:
# np.log1p(df['broken_status']).hist(bins=50)
# plt.show(block=True)
df['D_dw1'].value_counts().plot(kind='bar')  # 类别频数直方图
plt.title('D_dw1 类别分布')
plt.show()

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>12 |</span></b> <b>Outliers Analysis</b></div>

In [None]:
def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    """Return ``(low_limit, up_limit)`` for ``col_name``.

    The limits lie 1.5 inter-quantile ranges below the ``q1`` quantile and
    above the ``q3`` quantile of the column.
    """
    column = dataframe[col_name]
    lower_quantile = column.quantile(q1)
    upper_quantile = column.quantile(q3)
    whisker = 1.5 * (upper_quantile - lower_quantile)
    return lower_quantile - whisker, upper_quantile + whisker

In [None]:
def check_outlier(dataframe, col_name):
    """Return True if ``col_name`` has a value outside its outlier thresholds.

    The limits come from ``outlier_thresholds(dataframe, col_name)``.

    The check is made on the boolean outlier mask itself. Calling
    ``.any(axis=None)`` on the filtered frame would instead test whether any
    cell in the outlier rows is truthy, which misses outlier rows whose
    cells are all 0/False.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    outlier_mask = (values > up_limit) | (values < low_limit)
    # bool() turns the numpy boolean into a plain Python bool, as before.
    return bool(outlier_mask.any())

In [None]:
def replace_with_thresholds(dataframe, variable):
    """Cap the values of ``variable`` at its outlier thresholds, in place.

    Values below the lower limit are set to the lower limit and values above
    the upper limit are set to the upper limit; missing values are left
    untouched. The limits come from ``outlier_thresholds(dataframe, variable)``.

    ``Series.clip`` is used instead of ``.loc`` assignment so that float
    limits can be applied to an integer column: the result is upcast as a
    whole rather than written element-wise into an incompatible dtype.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)

In [None]:
for col in num_cols:
    print(col, check_outlier(df, col))

In [None]:
for col in num_cols:
    if check_outlier(df, col):
        replace_with_thresholds(df, col)

In [None]:
for col in num_cols:
    print(col, check_outlier(df, col))

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>13 |</span></b> <b>Missing Value Analysis</b></div>

<div style="border: 2px solid #007BFF; padding: 20px; border-radius: 10px; background-color: #f9f9f9;">
    <h2 style="color: #007BFF;">Missing Value Analysis</h2>
    <p>I have handled the Feature Engineering processes in the <strong>GENERAL REVIEW OF BIOGAS IN U.S. FARMS</strong> section.</p>
    <p>You can access it from the link below:</p>
    <a href="https://www.kaggle.com/code/mehmetisik/general-review-of-biogas-in-u-s-farms" style="background-color: #007BFF; color: #FFFFFF; padding: 10px 20px; text-decoration: none; border-radius: 5px;">Click here to access</a>
</div>

In [None]:
def missing_values_table(dataframe, na_name=False):
    """Print count and percentage of missing values for columns that have any.

    Rows of the printed table are sorted by descending number of missing
    values. When ``na_name`` is true the list of affected column names is
    returned; otherwise nothing is returned.
    """
    na_columns = [col for col in dataframe.columns if dataframe[col].isnull().sum() > 0]

    # Count nulls once for the affected columns and derive both views from it.
    null_counts = dataframe[na_columns].isnull().sum()
    n_miss = null_counts.sort_values(ascending=False)
    ratio = (null_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    if not na_name:
        return None
    return na_columns

In [None]:
missing_values_table(df)

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>14 |</span></b> <b>Rare Analysis</b></div>

In [None]:
def rare_analyser(dataframe, target, cat_cols):
    """Print, for each column in ``cat_cols``, per-value count, share and target mean.

    The first printed line of each column gives the number of distinct values.
    """
    n_rows = len(dataframe)
    for col in cat_cols:
        counts = dataframe[col].value_counts()
        print(col, ':', len(counts))

        report = pd.DataFrame({'COUNT': counts,
                               'RATIO': counts / n_rows,
                               'TARGET_MEAN': dataframe.groupby(col)[target].mean()})
        print(report, end='\n\n\n')

In [None]:
rare_analyser(df, "D_dw1", cat_cols)

In [None]:
def rare_encoder(dataframe, rare_perc):
    """Return a copy of ``dataframe`` with infrequent labels replaced by ``'Rare'``.

    Only object-dtype columns are examined. A label is infrequent when its
    share of the rows is below ``rare_perc``. The input frame is not modified.
    """
    encoded = dataframe.copy()
    n_rows = len(encoded)

    for name in encoded.columns:
        if encoded[name].dtypes != 'O':
            continue

        shares = encoded[name].value_counts() / n_rows
        rare_labels = shares[shares < rare_perc].index
        if len(rare_labels) == 0:
            # Nothing below the threshold: leave the column untouched.
            continue

        encoded[name] = np.where(encoded[name].isin(rare_labels), 'Rare', encoded[name])

    return encoded

In [None]:
rare_encoder(df, 0.01)

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>15 |</span></b> <b>Feature Extraction</b></div>

<div style="border: 2px solid #007BFF; padding: 20px; border-radius: 10px; background-color: #f9f9f9;">
    <h2 style="color: #007BFF;">Feature Engineering</h2>
    <p>I have handled the Feature Engineering processes in the <strong>GENERAL REVIEW OF BIOGAS IN U.S. FARMS</strong> section.</p>
    <p>You can access it from the link below:</p>
    <a href="https://www.kaggle.com/code/mehmetisik/general-review-of-biogas-in-u-s-farms" style="background-color: #007BFF; color: #FFFFFF; padding: 10px 20px; text-decoration: none; border-radius: 5px;">Click here to access</a>
</div>

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>16 |</span></b> <b>Encoding</b></div>

In [None]:
# Columns to force-treat as categorical (first list) and as numeric (second
# list) when classifying the columns of df.
forced_caterical = (
   # [f'M_dw{i}' for i in range(2, 7)] +  # M_dw2 through M_dw6
    [f'R_dw{i}' for i in range(22, 27)] +  # R_dw22 through R_dw26
    [f'R_dw{i}' for i in range(29, 34)]  # R_dw29 through R_dw33
)
forced_numerical = (
    [f'M_dw{i}' for i in range(2, 7)]   # M_dw2 through M_dw6
   # [f'R_dw{i}' for i in range(22, 27)] +  # R_dw22 through R_dw26
   # [f'R_dw{i}' for i in range(29, 34)]  # R_dw29 through R_dw33
)
# Re-run the column classification, passing the forced lists.
cat_cols, num_cols, cat_but_car, num_but_cat = grab_col_names(
    df,
    force_cat_cols=forced_caterical,
    force_num_cols=forced_numerical
)

In [None]:
cat_cols


In [None]:
# cat_cols = ['Project Type',
#  'Digester Type',
#  'Status',
#  'Animal/Farm Type(s)',
#  'Co-Digestion',
#  'Biogas End Use(s)',
#  'LCFS Pathway?',
#  'Receiving Utility',
#  'Awarded USDA Funding?']

In [None]:
cat_cols

In [None]:
num_cols

In [None]:
# num_cols = ['Year Operational',
#  'Dairy',
#  'Biogas Generation Estimate (cu-ft/day)',
#  'Electricity Generated (kWh/yr)',
#  'Total Emission Reductions (MTCO2e/yr)',
#  'Operational Years',
#  'Total_Animals',
#  'Biogas_per_Animal (cu-ft/day)',
#  'Emission_Reduction_per_Year',
#  'Electricity_to_Biogas_Ratio',
#  'Total_Waste_kg/day',
#  'Waste_Efficiency',
#  'Electricity_Efficiency', 'Cattle',
#  'Poultry',
#  'Swine']
num_cols = [
  'M_dw4',
 'M_dw5',
 'M_dw6'
            ]

In [None]:
num_cols

In [None]:
cat_but_car

In [None]:
num_but_cat


In [None]:
# num_but_cat =[]

In [None]:
num_but_cat

In [None]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """Return ``dataframe`` with ``categorical_cols`` replaced by dummy columns.

    Thin wrapper around ``pd.get_dummies``; ``drop_first`` is passed through
    unchanged. The input frame is not modified.
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)

# def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
#     # 创建原始列的副本
#     original_cols = dataframe[categorical_cols].copy()
#
#     # 进行独热编码
#     dataframe = pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)
#
#     # 将原始列添加回数据框
#     for col in categorical_cols:
#         dataframe[col + 'broken_status'] = original_cols[col]
#
#     return dataframe
# df.head()

In [None]:
# df = one_hot_encoder(df, cat_cols, drop_first=True)
df = one_hot_encoder(df, cat_cols,drop_first=True)

In [None]:
df.head(1000)

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>17 |</span></b> <b>Standardization Process</b></div>

In [None]:
# for spindle in range(1, 101):
#             # 构建目标变量 (y)
#             is_break = 1 if row[f'D_dw{spindle}'] == 3 else 0
# y = is_break
#
# num_cols = [col for col in num_cols if col not in ["is_break"]]
y =df['is_break_dw1_1']

In [None]:
scaler = RobustScaler()

In [None]:
df[num_cols] = scaler.fit_transform(df[num_cols])

In [None]:
df.head(1000)

In [None]:
# Editing of variable names.
# X = df.drop(["M_dw5","created_at","name","name1","C_dw3","C_dw4","M_dw3","M_dw2","C_dw2","time","unnamed_0","R_dw33"], axis=1)
# Normalise column names in one pass: spaces become underscores, every other
# character outside [A-Za-z0-9_] is removed, and the result is lower-cased.
df.columns = (
    df.columns
    .str.replace(' ', '_')
    .str.replace('[^A-Za-z0-9_]+', '', regex=True)
    .str.lower()
)

In [None]:

df.head(1000)

In [None]:
df.shape

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>18 |</span></b> <b>Creating Model</b></div>

In [None]:
# for spindle in range(1, 101):
#             # 构建目标变量 (y)
#             is_break = 1 if row[f'D_dw{spindle}'] == 3 else 0


In [None]:
# y = df.m_dw5
# y = is_break
y =df['is_break_dw1_1']
# df.shape

In [None]:
X = df.drop(["m_dw1_1","m_dw1_2","m_dw1_3","m_dw6","is_break_dw1_1","d_dw1_1","d_dw1_2","d_dw1_3","m_dw5","created_at","name","subsystem1","subsystem","c_dw3","c_dw4","m_dw3","m_dw2","c_dw2","unnamed_0","r_dw33_28200","r_dw33_27700","r_dw33_28400","r_dw33_32200","r_dw33_32600","r_dw33_33400","r_dw33_38200"], axis=1)
# X = df.drop(["is_break_dw1_1","m_dw5","created_at","name","subsystem1","subsystem","c_dw3","c_dw4","c_dw2","unnamed_0","r_dw33_28200","r_dw33_27700","r_dw33_28400","r_dw33_32200","r_dw33_32600","r_dw33_33400","r_dw33_38200"], axis=1)
# X = df.drop(["r_dw33",'device_name','spindle','broken_start_time','prev_time','next_time','prev_status','next_status'], axis=1)
# # 1. 定义需要保留的列（R_dw和C_dw相关）
# selected_features = [
#     'r_dw22','r_dw22', 'r_dw23', 'r_dw24', 'r_dw25', 'r_dw26',
#     'r_dw29', 'r_dw30', 'r_dw31', 'r_dw32', 'r_dw33', 'c_dw2', 'c_dw3', 'c_dw4'
# ]
#
# # 2. 直接从原始数据框df中提取这些列（删除其他所有列）
# X = df[selected_features].copy()  # 使用.copy()避免SettingWithCopyWarning
#
# # 3. 验证结果（可选）
# print("保留的列：", X.columns.tolist())

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=17)
# import pandas as pd
# import numpy as np
#
# # 确保时间列是datetime类型
# df['created_at'] = pd.to_datetime(df['created_at'])
#
# # 按时间排序
# df = df.sort_values('created_at').reset_index(drop=True)
#
# # 计算总时间跨度
# start_time = df['created_at'].min()
# end_time = df['created_at'].max()
# total_duration = end_time - start_time
#
# # 计算9.6小时的训练集截止时间点
# train_duration = pd.Timedelta(hours=7.62)  # 9小时36分钟
# cutoff_time = start_time + train_duration
#
# print(f"数据总时间跨度: {total_duration}")
# print(f"训练集截止时间: {cutoff_time}")
#
# # 基于时间划分训练集和测试集
# train_mask = df['created_at'] <= cutoff_time
# test_mask = df['created_at'] > cutoff_time
#
# X_train = X[train_mask]
# X_test = X[test_mask]
# y_train = y[train_mask]
# y_test = y[test_mask]

print(f"训练集大小: {len(X_train)}")
print(f"测试集大小: {len(X_test)}")
print(f"训练集比例: {len(X_train)/len(X):.2%}")

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=17)
# import pandas as pd
# import numpy as np

# # 确保时间列是datetime类型
# df['created_at'] = pd.to_datetime(df['created_at'])
#
# # 按时间排序
# df = df.sort_values('created_at').reset_index(drop=True)
#
# # 计算总时间跨度
# start_time = df['created_at'].min()
# end_time = df['created_at'].max()
# total_duration = end_time - start_time
#
# # 计算7.62小时的训练集截止时间点
# train_duration = pd.Timedelta(hours=7.62)  # 7小时37分钟
# cutoff_time = start_time + train_duration
#
# print(f"数据总时间跨度: {total_duration}")
# print(f"训练集截止时间: {cutoff_time}")
#
# # 基于时间划分训练集和测试集
# train_mask = df['created_at'] <= cutoff_time
# test_mask = df['created_at'] > cutoff_time
#
# X_train = X[train_mask]
# X_test = X[test_mask]
# y_train = y[train_mask]
# y_test = y[test_mask]
#
# print(f"训练集大小: {len(X_train)}")
# print(f"测试集大小: {len(X_test)}")
# print(f"训练集比例: {len(X_train)/len(X):.2%}")
#
# # 统计断纱事件（y==1）的数量和占比
# print("\n=== 断纱事件统计 ===")

# 训练集断纱事件统计
# train_break_count = np.sum(y_train == 1)
# train_break_ratio = train_break_count / len(y_train)
#
# # 测试集断纱事件统计
# test_break_count = np.sum(y_test == 1)
# test_break_ratio = test_break_count / len(y_test)
#
# # 总体断纱事件统计
# total_break_count = np.sum(y == 1)
# total_break_ratio = total_break_count / len(y)
#
# print(f"训练集断纱事件数量: {train_break_count} (占比: {train_break_ratio:.2%})")
# print(f"测试集断纱事件数量: {test_break_count} (占比: {test_break_ratio:.2%})")
# print(f"总体断纱事件数量: {total_break_count} (占比: {total_break_ratio:.2%})")
#
# # 检查类别分布是否均衡
# print("\n=== 类别分布分析 ===")
# print(f"训练集类别分布 - 正常事件: {len(y_train) - train_break_count}, 断纱事件: {train_break_count}")
# print(f"测试集类别分布 - 正常事件: {len(y_test) - test_break_count}, 断纱事件: {test_break_count}")
#
# # 如果类别不平衡严重，给出警告
# if train_break_ratio < 0.05 or train_break_ratio > 0.95:
#     print("警告: 训练集类别不平衡严重，可能影响模型性能!")
# if test_break_ratio < 0.05 or test_break_ratio > 0.95:
#     print("警告: 测试集类别不平衡严重，评估结果可能不可靠!")
#
# # 计算断纱事件的时间分布
# print("\n=== 断纱事件时间分布 ===")
# break_events = df[y == 1] if 'y' in df.columns else df[df.index.isin(np.where(y == 1)[0])]
# train_break_events = break_events[break_events['created_at'] <= cutoff_time]
# test_break_events = break_events[break_events['created_at'] > cutoff_time]
#
# print(f"训练集断纱事件时间范围: {train_break_events['created_at'].min()} 到 {train_break_events['created_at'].max()}")
# print(f"测试集断纱事件时间范围: {test_break_events['created_at'].min()} 到 {test_break_events['created_at'].max()}")

# # 断纱事件在时间上的分布密度
# if len(train_break_events) > 0:
#     train_hours = (train_break_events['created_at'] - start_time).dt.total_seconds() / 3600
#     print(f"训练集断纱事件平均发生时间: {train_hours.mean():.2f} 小时后")
#
# if len(test_break_events) > 0:
#     test_hours = (test_break_events['created_at'] - cutoff_time).dt.total_seconds() / 3600
#     print(f"测试集断纱事件平均发生时间: {test_hours.mean():.2f} 小时后")

In [None]:
# 导入必要的库
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Candidate classifiers to benchmark on the train/test split. The
# commented-out entries are alternatives the author left disabled.
# Every active model is seeded so that re-running this cell reproduces the
# same scores; 17 is the seed the class-imbalance cells of this notebook use.
models = [
    # ('LR', LogisticRegression()),
    # ('KNN', KNeighborsClassifier()),
    # ('CART', DecisionTreeClassifier()),
    ('RF', RandomForestClassifier(random_state=17)),
    # ('GBM', GradientBoostingClassifier()),
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=17)),
    ('LightGBM', LGBMClassifier(random_state=17)),
    # ('CatBoost', CatBoostClassifier(verbose=False))
]

# One list per reported column; entry i of every list belongs to models[i].
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
roc_auc_scores = []
execution_times = []
model_names = []

# Main loop: fit each model on the training split, score it on the test split.
for name, classifier in models:
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments. A dedicated name is used for the timer so the
    # loop does not rebind `start_time`, which the (currently commented-out)
    # time-based split code uses for the first timestamp of the data.
    run_started = time.perf_counter()

    classifier.fit(X_train, y_train)

    # Hard labels for the threshold metrics, positive-class probability
    # (column 1 of predict_proba) for ROC-AUC.
    y_pred = classifier.predict(X_test)
    y_pred_proba = classifier.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

    # zero_division=0 matches the evaluation cells later in this notebook:
    # a model that never predicts (or never sees) the positive class scores 0
    # instead of emitting an UndefinedMetricWarning.
    precision = precision_score(y_test, y_pred, zero_division=0)
    precision_scores.append(precision)

    recall = recall_score(y_test, y_pred, zero_division=0)
    recall_scores.append(recall)

    f1 = f1_score(y_test, y_pred, zero_division=0)
    f1_scores.append(f1)

    roc_auc = roc_auc_score(y_test, y_pred_proba)
    roc_auc_scores.append(roc_auc)

    # The duration covers fitting, prediction and metric computation.
    execution_time = time.perf_counter() - run_started
    execution_times.append(execution_time)

    model_names.append(name)

# Collect every metric and the run time into one table, one row per model.
results_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'Recall': recall_scores,
    'F1-Score': f1_scores,
    'ROC-AUC': roc_auc_scores,
    'Execution Time (s)': execution_times
})

# Last expression of the cell: the notebook renders the table.
results_df

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=17)
import pandas as pd
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# 确保时间列是datetime类型
# df['created_at'] = pd.to_datetime(df['created_at'])
#
# # 按时间排序
# df = df.sort_values('created_at').reset_index(drop=True)
#
# # 计算总时间跨度
# start_time = df['created_at'].min()
# end_time = df['created_at'].max()
# total_duration = end_time - start_time
#
# # 计算7.62小时的训练集截止时间点
# train_duration = pd.Timedelta(hours=7.62)  # 7小时37分钟
# cutoff_time = start_time + train_duration
#
# print(f"数据总时间跨度: {total_duration}")
# print(f"训练集截止时间: {cutoff_time}")
#
# # 基于时间划分训练集和测试集
# train_mask = df['created_at'] <= cutoff_time
# test_mask = df['created_at'] > cutoff_time
#
# X_train = X[train_mask]
# X_test = X[test_mask]
# y_train = y[train_mask]
# y_test = y[test_mask]

# Report the size of each split and the share of all rows used for training.
print(
    f"训练集大小: {len(X_train)}",
    f"测试集大小: {len(X_test)}",
    f"训练集比例: {len(X_train)/len(X):.2%}",
    sep="\n",
)

# Yarn-break events are the rows labelled 1; count them per split and overall.
print("\n=== 断纱事件统计 ===")

train_break_count, test_break_count, total_break_count = (
    np.sum(labels == 1) for labels in (y_train, y_test, y)
)
train_break_ratio = train_break_count / len(y_train)
test_break_ratio = test_break_count / len(y_test)
total_break_ratio = total_break_count / len(y)

print(
    f"训练集断纱事件数量: {train_break_count} (占比: {train_break_ratio:.2%})",
    f"测试集断纱事件数量: {test_break_count} (占比: {test_break_ratio:.2%})",
    f"总体断纱事件数量: {total_break_count} (占比: {total_break_ratio:.2%})",
    sep="\n",
)

# Per-split class distribution: everything that is not a break counts as normal.
print("\n=== 类别分布分析 ===")
print(
    f"训练集类别分布 - 正常事件: {len(y_train) - train_break_count}, 断纱事件: {train_break_count}",
    f"测试集类别分布 - 正常事件: {len(y_test) - test_break_count}, 断纱事件: {test_break_count}",
    sep="\n",
)

# A positive share below 5% or above 95% is treated as severe imbalance;
# print a warning plus mitigation advice for the affected split.
if train_break_ratio < 0.05 or train_break_ratio > 0.95:
    print(
        "警告: 训练集类别不平衡严重，可能影响模型性能!",
        "处理建议:",
        "1. 使用类别权重 (class_weight='balanced')",
        "2. 使用过采样技术 (如SMOTE)",
        "3. 使用欠采样技术",
        "4. 使用合适的评估指标 (如F1-score, ROC-AUC)",
        sep="\n",
    )

if test_break_ratio < 0.05 or test_break_ratio > 0.95:
    print(
        "警告: 测试集类别不平衡严重，评估结果可能不可靠!",
        "处理建议:",
        "1. 使用分层抽样确保测试集代表性",
        "2. 使用合适的评估指标 (如F1-score, ROC-AUC)",
        sep="\n",
    )

# # 计算断纱事件的时间分布
# print("\n=== 断纱事件时间分布 ===")
# break_events = df[y == 1] if 'y' in df.columns else df[df.index.isin(np.where(y == 1)[0])]
# train_break_events = break_events[break_events['created_at'] <= cutoff_time]
# test_break_events = break_events[break_events['created_at'] > cutoff_time]
#
# if len(train_break_events) > 0:
#     print(f"训练集断纱事件时间范围: {train_break_events['created_at'].min()} 到 {train_break_events['created_at'].max()}")
#     train_hours = (train_break_events['created_at'] - start_time).dt.total_seconds() / 3600
#     print(f"训练集断纱事件平均发生时间: {train_hours.mean():.2f} 小时后")
#
# if len(test_break_events) > 0:
#     print(f"测试集断纱事件时间范围: {test_break_events['created_at'].min()} 到 {test_break_events['created_at'].max()}")
#     test_hours = (test_break_events['created_at'] - cutoff_time).dt.total_seconds() / 3600
#     print(f"测试集断纱事件平均发生时间: {test_hours.mean():.2f} 小时后")

# Class-imbalance handling: derive class weights from the training labels and
# define two families of candidate models - class-weighted estimators and
# pipelines that oversample the minority class with SMOTE before fitting.
print("\n=== 类别不平衡处理方案 ===")

# Both classes must be present in the training labels: otherwise the
# negative/positive ratio below divides by zero and class_weights has no
# second entry. Fail here with an explicit message instead.
train_normal_count = len(y_train) - train_break_count
if train_break_count == 0 or train_normal_count == 0:
    raise ValueError(
        "y_train must contain both normal (0) and yarn-break (1) samples "
        "to compute class weights and scale_pos_weight"
    )

# 1. 'balanced' class weights, returned in the order of np.unique(y_train).
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
print(f"类别权重: 正常事件={class_weights[0]:.2f}, 断纱事件={class_weights[1]:.2f}")

# 2. Class distribution before oversampling.
print(f"\n过采样前训练集分布: 正常事件={train_normal_count}, 断纱事件={train_break_count}")
print("应用SMOTE过采样后，断纱事件将与正常事件数量相等")

# 3. Model definitions that account for the imbalance.
print("\n=== 修改模型定义以处理类别不平衡 ===")

# Ratio of negative to positive training samples, passed to XGBoost as the
# weight of the positive class.
xgb_pos_weight = train_normal_count / train_break_count

# Estimators that take the imbalance into account through weighting.
# All of them share the seed 17 so the comparison is reproducible.
models_balanced = [
    ('RF_balanced', RandomForestClassifier(class_weight='balanced', random_state=17)),
    ('XGBoost_balanced', XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=xgb_pos_weight,
        random_state=17
    )),
    ('LightGBM_balanced', LGBMClassifier(
        class_weight='balanced',
        random_state=17
    )),
]

# Stand-alone sampler objects. Neither is referenced by the pipelines below
# (each pipeline owns its sampler); the names are kept so code that refers to
# `smote` / `under_sampler` keeps working.
smote = SMOTE(random_state=17)
under_sampler = RandomUnderSampler(random_state=17)

# Pipelines that oversample with SMOTE and then fit a classifier. Every
# pipeline gets its own SMOTE instance so that fitting or re-parameterising
# one pipeline cannot touch the sampler state of another.
models_sampled = [
    ('RF_sampled', Pipeline([
        ('smote', SMOTE(random_state=17)),
        ('classifier', RandomForestClassifier(random_state=17))
    ])),
    ('XGBoost_sampled', Pipeline([
        ('smote', SMOTE(random_state=17)),
        ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=17))
    ])),
    ('LightGBM_sampled', Pipeline([
        ('smote', SMOTE(random_state=17)),
        ('classifier', LGBMClassifier(random_state=17))
    ])),
]

print("已创建处理类别不平衡的模型:")
print("1. 使用类别权重的模型: RF_balanced, XGBoost_balanced, LightGBM_balanced")
print("2. 使用SMOTE过采样的模型: RF_sampled, XGBoost_sampled, LightGBM_sampled")

# Advice on which metrics to read for imbalanced data.
print("\n=== 评估指标建议 ===")
print("对于不平衡数据，建议重点关注以下指标:")
print("1. F1-Score: 精确率和召回率的调和平均")
print("2. ROC-AUC: 不受类别分布影响")
print("3. Precision-Recall曲线下面积: 更适合不平衡数据")
print("4. 召回率(Recall): 确保尽可能多地识别断纱事件")

# Closing recommendations.
print("\n=== 最终建议 ===")
print("1. 优先使用带类别权重的模型或SMOTE过采样模型")
print("2. 重点关注F1-Score和ROC-AUC指标")
print("3. 考虑使用Precision-Recall曲线评估模型性能")
print("4. 在实际应用中，可能需要根据业务需求调整分类阈值")

In [None]:
# # 导入必要的库
# import time
# import numpy as np
# import pandas as pd
# from sklearn import metrics
# from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
#
# # 模型列表
# models = [
#     # ('LR', LogisticRegression()),
#     # ('KNN', KNeighborsClassifier()),
#     # ('CART', DecisionTreeClassifier()),
#     ('RF', RandomForestClassifier()),
#     # ('GBM', GradientBoostingClassifier()),
#     ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
#     ('LightGBM', LGBMClassifier()),
#     # ('CatBoost', CatBoostClassifier(verbose=False))
# ]
#
# # 初始化空列表以存储性能指标和执行时间
# accuracy_scores = []
# precision_scores = []
# recall_scores = []
# f1_scores = []
# roc_auc_scores = []
# execution_times = []
# model_names = []
#
# # 主循环：训练、预测和评估每个模型（在训练集上评估）
# for name, classifier in models:
#     start_time = time.time()
#
#     # 训练模型
#     classifier.fit(X_train, y_train)
#
#     # 在训练集上进行预测（关键修改）
#     y_train_pred = classifier.predict(X_train)
#     y_train_pred_proba = classifier.predict_proba(X_train)[:, 1]  # 用于 ROC-AUC
#
#     # 计算 Accuracy（在训练集上）
#     accuracy = accuracy_score(y_train, y_train_pred)
#     accuracy_scores.append(accuracy)
#
#     # 计算 Precision（在训练集上）
#     precision = precision_score(y_train, y_train_pred)
#     precision_scores.append(precision)
#
#     # 计算 Recall（在训练集上）
#     recall = recall_score(y_train, y_train_pred)
#     recall_scores.append(recall)
#
#     # 计算 F1-Score（在训练集上）
#     f1 = f1_score(y_train, y_train_pred)
#     f1_scores.append(f1)
#
#     # 计算 ROC-AUC（在训练集上）
#     roc_auc = roc_auc_score(y_train, y_train_pred_proba)
#     roc_auc_scores.append(roc_auc)
#
#     # 计算模型执行时间
#     execution_time = time.time() - start_time
#     execution_times.append(execution_time)
#
#     # 存储模型名称
#     model_names.append(name)
#
# # 创建 DataFrame 以存储所有性能指标和执行时间
# results_df = pd.DataFrame({
#     'Model': model_names,
#     'Accuracy': accuracy_scores,
#     'Precision': precision_scores,
#     'Recall': recall_scores,
#     'F1-Score': f1_scores,
#     'ROC-AUC': roc_auc_scores,
#     'Execution Time (s)': execution_times
# })
#
# # 显示结果
# print("训练集上的性能评估结果：")
# results_df

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=17)
import pandas as pd
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import joblib

# Yarn-break events are the rows labelled 1; count them per split and overall.
print("\n=== 断纱事件统计 ===")

train_break_count, test_break_count, total_break_count = (
    np.sum(labels == 1) for labels in (y_train, y_test, y)
)
train_break_ratio = train_break_count / len(y_train)
test_break_ratio = test_break_count / len(y_test)
total_break_ratio = total_break_count / len(y)

print(
    f"训练集断纱事件数量: {train_break_count} (占比: {train_break_ratio:.2%})",
    f"测试集断纱事件数量: {test_break_count} (占比: {test_break_ratio:.2%})",
    f"总体断纱事件数量: {total_break_count} (占比: {total_break_ratio:.2%})",
    sep="\n",
)

# Per-split class distribution: everything that is not a break counts as normal.
print("\n=== 类别分布分析 ===")
print(
    f"训练集类别分布 - 正常事件: {len(y_train) - train_break_count}, 断纱事件: {train_break_count}",
    f"测试集类别分布 - 正常事件: {len(y_test) - test_break_count}, 断纱事件: {test_break_count}",
    sep="\n",
)

# A positive share below 5% or above 95% is treated as severe imbalance;
# print a warning plus mitigation advice for the affected split.
if train_break_ratio < 0.05 or train_break_ratio > 0.95:
    print(
        "警告: 训练集类别不平衡严重，可能影响模型性能!",
        "处理建议:",
        "1. 使用类别权重 (class_weight='balanced')",
        "2. 使用过采样技术 (如SMOTE)",
        "3. 使用欠采样技术",
        "4. 使用合适的评估指标 (如F1-score, ROC-AUC)",
        sep="\n",
    )

if test_break_ratio < 0.05 or test_break_ratio > 0.95:
    print(
        "警告: 测试集类别不平衡严重，评估结果可能不可靠!",
        "处理建议:",
        "1. 使用分层抽样确保测试集代表性",
        "2. 使用合适的评估指标 (如F1-score, ROC-AUC)",
        sep="\n",
    )

# Class-imbalance handling: derive explicit class weights from the training
# labels and define two families of candidate models - class-weighted
# estimators and pipelines that oversample with SMOTE before fitting.
print("\n=== 类别不平衡处理方案 ===")

# Both classes must be present in the training labels: otherwise the
# negative/positive ratio below divides by zero and class_weights has no
# second entry. Fail here with an explicit message instead.
train_normal_count = len(y_train) - train_break_count
if train_break_count == 0 or train_normal_count == 0:
    raise ValueError(
        "y_train must contain both normal (0) and yarn-break (1) samples "
        "to compute class weights and scale_pos_weight"
    )

# 1. 'balanced' class weights, returned in the order of `train_classes`.
train_classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=train_classes, y=y_train)
print(f"类别权重: 正常事件={class_weights[0]:.2f}, 断纱事件={class_weights[1]:.2f}")

# 2. Class distribution before oversampling.
print(f"\n过采样前训练集分布: 正常事件={train_normal_count}, 断纱事件={train_break_count}")
print("应用SMOTE过采样后，断纱事件将与正常事件数量相等")

# 3. Model definitions that account for the imbalance.
print("\n=== 修改模型定义以处理类别不平衡 ===")

# Map every label actually present in y_train to its weight instead of
# assuming the labels are literally 0 and 1. tolist() converts numpy scalars
# to plain Python values so the dict prints, pickles and serialises to CSV
# without numpy reprs.
class_weight_dict = dict(zip(train_classes.tolist(), class_weights.tolist()))
print(f"使用的类别权重字典: {class_weight_dict}")

# Ratio of negative to positive training samples, passed to XGBoost as the
# weight of the positive class.
xgb_pos_weight = train_normal_count / train_break_count

# Estimators that take the imbalance into account through weighting, using
# the explicit weights computed above. All share the seed 17.
models_balanced = [
    ('RF_balanced', RandomForestClassifier(
        class_weight=class_weight_dict,
        random_state=17
    )),
    ('XGBoost_balanced', XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=xgb_pos_weight,
        random_state=17
    )),
    ('LightGBM_balanced', LGBMClassifier(
        class_weight=class_weight_dict,
        random_state=17
    )),
]

# Stand-alone sampler objects. Neither is referenced by the pipelines below
# (each pipeline owns its sampler); the names are kept so code that refers to
# `smote` / `under_sampler` keeps working.
smote = SMOTE(random_state=17)
under_sampler = RandomUnderSampler(random_state=17)

# Pipelines that oversample with SMOTE and then fit a classifier. Every
# pipeline gets its own SMOTE instance so that fitting or re-parameterising
# one pipeline cannot touch the sampler state of another.
models_sampled = [
    ('RF_sampled', Pipeline([
        ('smote', SMOTE(random_state=17)),
        ('classifier', RandomForestClassifier(random_state=17))
    ])),
    ('XGBoost_sampled', Pipeline([
        ('smote', SMOTE(random_state=17)),
        ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=17))
    ])),
    ('LightGBM_sampled', Pipeline([
        ('smote', SMOTE(random_state=17)),
        ('classifier', LGBMClassifier(random_state=17))
    ])),
]

print("已创建处理类别不平衡的模型:")
print("1. 使用类别权重的模型: RF_balanced, XGBoost_balanced, LightGBM_balanced")
print("2. 使用SMOTE过采样的模型: RF_sampled, XGBoost_sampled, LightGBM_sampled")

# Advice on which metrics to read for imbalanced data.
print("\n=== 评估指标建议 ===")
print("对于不平衡数据，建议重点关注以下指标:")
print("1. F1-Score: 精确率和召回率的调和平均")
print("2. ROC-AUC: 不受类别分布影响")
print("3. Precision-Recall曲线下面积: 更适合不平衡数据")
print("4. 召回率(Recall): 确保尽可能多地识别断纱事件")

# Closing recommendations.
print("\n=== 最终建议 ===")
print("1. 优先使用带类别权重的模型或SMOTE过采样模型")
print("2. 重点关注F1-Score和ROC-AUC指标")
print("3. 考虑使用Precision-Recall曲线评估模型性能")
print("4. 在实际应用中，可能需要根据业务需求调整分类阈值")

# Train the RF_balanced model and persist it together with the class
# statistics computed earlier in this cell:
#   rf_balanced_updated.pkl      - model plus metadata (joblib)
#   class_distribution_info.csv  - class counts/ratios per split
#   model_configuration.csv      - weights, model names and a timestamp
print("\n=== 保存更新后的模型和数据 ===")

# Look the estimator up by name; None when the list has no such entry.
rf_balanced_model = dict(models_balanced).get('RF_balanced')

if rf_balanced_model is not None:
    # Fit on the training split (this fits the object held in models_balanced).
    print("训练RF_balanced模型...")
    rf_balanced_model.fit(X_train, y_train)
    print("RF_balanced模型训练完成")

    # Real column names when X_train exposes them, generated names otherwise.
    if hasattr(X_train, 'columns'):
        feature_names = X_train.columns.tolist()
    else:
        feature_names = [f'Feature_{i}' for i in range(X_train.shape[1])]

    # Everything a later cell needs to evaluate the model without refitting.
    model_info = {
        'model': rf_balanced_model,
        'class_weights': class_weight_dict,
        'train_break_count': train_break_count,
        'train_break_ratio': train_break_ratio,
        'test_break_count': test_break_count,
        'test_break_ratio': test_break_ratio,
        'total_break_count': total_break_count,
        'total_break_ratio': total_break_ratio,
        'train_size': len(X_train),
        'test_size': len(X_test),
        'feature_names': feature_names
    }

    joblib.dump(model_info, 'rf_balanced_updated.pkl')
    print("✅ RF_balanced模型和相关信息已保存至: rf_balanced_updated.pkl")

    # Class distribution per split, one row per dataset.
    distribution_info = pd.DataFrame({
        'Dataset': ['训练集', '测试集', '总体'],
        '正常事件数量': [
            len(y_train) - train_break_count,
            len(y_test) - test_break_count,
            len(y) - total_break_count
        ],
        '断纱事件数量': [train_break_count, test_break_count, total_break_count],
        '断纱事件占比': [train_break_ratio, test_break_ratio, total_break_ratio]
    })

    distribution_info.to_csv('class_distribution_info.csv', index=False)
    print("✅ 类别分布信息已保存至: class_distribution_info.csv")

    # Single-row configuration record (dict and list values are written to
    # the CSV as their string representation).
    config_info = {
        'class_weight_dict': class_weight_dict,
        'models_balanced_names': [name for name, _ in models_balanced],
        'models_sampled_names': [name for name, _ in models_sampled],
        'timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    config_df = pd.DataFrame([config_info])
    config_df.to_csv('model_configuration.csv', index=False)
    print("✅ 模型配置信息已保存至: model_configuration.csv")

    print("\n📊 模型详细信息:")
    print(f"   类别权重: {class_weight_dict}")
    print(f"   训练集大小: {len(X_train)}")
    print(f"   测试集大小: {len(X_test)}")
    print(f"   特征数量: {X_train.shape[1]}")
    print(f"   RF_balanced模型参数: {rf_balanced_model.get_params()}")

    # Announce success only when the model was actually trained and saved.
    print("\n🎯 所有更新已完成！您现在可以调用保存的RF_balanced模型进行后续分析。")

else:
    print("❌ 错误: 未找到RF_balanced模型")

# Usage example, printed as text only (it is not executed here).
print("\n=== 后续调用示例 ===")
print("""
# 加载保存的模型和信息
import joblib
model_info = joblib.load('rf_balanced_updated.pkl')

# 获取模型
rf_balanced = model_info['model']

# 获取类别权重
class_weights = model_info['class_weights']

# 进行预测
y_pred = rf_balanced.predict(X_new)
y_pred_proba = rf_balanced.predict_proba(X_new)

print("模型加载成功，可以进行预测！")
""")

In [None]:
import time
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import joblib

import traceback  # used below to report unexpected failures with full context

# Evaluate the persisted RF_balanced model on the training and test splits:
# load it from rf_balanced_updated.pkl, compute classification metrics and
# confusion matrices, print them, and write the metrics, timings and feature
# importances to CSV files.
print("=== 更新后RF_balanced模型性能评估 ===")
print("使用优化类别权重处理不平衡数据")
print("=" * 50)

# Rows for the metrics table and the timing table.
metrics_results = []
execution_times = []

# Set to True only when the whole evaluation ran through, so the closing
# message is not printed after a failure.
evaluation_succeeded = False

try:
    # Load the model bundle written by the training cell.
    # NOTE(review): joblib files are pickle-based; only load files you trust.
    print("加载已保存的RF_balanced模型...")
    load_start_time = time.time()
    model_info = joblib.load('rf_balanced_updated.pkl')
    load_time = time.time() - load_start_time

    # Unpack the model and the statistics stored next to it.
    rf_balanced_model = model_info['model']
    class_weights = model_info['class_weights']
    train_break_count = model_info['train_break_count']
    train_break_ratio = model_info['train_break_ratio']

    print(f"模型加载完成，耗时: {load_time:.2f}秒")
    print(f"使用的类别权重: {class_weights}")
    print(f"训练集断纱事件: {train_break_count} (占比: {train_break_ratio:.2%})")

    # The loaded model is already fitted, so go straight to prediction.
    print("\n开始模型预测...")
    predict_start_time = time.time()

    # Hard labels for both splits.
    y_train_pred = rf_balanced_model.predict(X_train)
    y_test_pred = rf_balanced_model.predict(X_test)

    # Positive-class probabilities (column 1), used for ROC-AUC.
    y_train_pred_proba = rf_balanced_model.predict_proba(X_train)[:, 1]
    y_test_pred_proba = rf_balanced_model.predict_proba(X_test)[:, 1]

    predict_time = time.time() - predict_start_time
    print(f"预测完成，耗时: {predict_time:.2f}秒")

    # Training-set metrics; zero_division=0 scores an undefined ratio as 0.
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred, zero_division=0)
    train_recall = recall_score(y_train, y_train_pred, zero_division=0)
    train_f1 = f1_score(y_train, y_train_pred, zero_division=0)
    train_roc_auc = roc_auc_score(y_train, y_train_pred_proba)

    # Test-set metrics.
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, zero_division=0)
    test_recall = recall_score(y_test, y_test_pred, zero_division=0)
    test_f1 = f1_score(y_test, y_test_pred, zero_division=0)
    test_roc_auc = roc_auc_score(y_test, y_test_pred_proba)

    # Confusion matrices for both splits.
    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)

    total_time = load_time + predict_time

    # Collect one metrics row per split and one timing row.
    metrics_results.append({
        'Dataset': '训练集',
        'Accuracy': train_accuracy,
        'Precision': train_precision,
        'Recall': train_recall,
        'F1-Score': train_f1,
        'ROC-AUC': train_roc_auc
    })

    metrics_results.append({
        'Dataset': '测试集',
        'Accuracy': test_accuracy,
        'Precision': test_precision,
        'Recall': test_recall,
        'F1-Score': test_f1,
        'ROC-AUC': test_roc_auc
    })

    execution_times.append({
        '模型加载时间': load_time,
        '预测时间': predict_time,
        '总时间': total_time
    })

    # Human-readable report.
    print("\n更新后RF_balanced模型评估结果:")
    print("训练集性能:")
    print(f"  准确率: {train_accuracy:.4f}")
    print(f"  精确率: {train_precision:.4f}")
    print(f"  召回率: {train_recall:.4f}")
    print(f"  F1分数: {train_f1:.4f}")
    print(f"  ROC-AUC: {train_roc_auc:.4f}")

    print("\n测试集性能:")
    print(f"  准确率: {test_accuracy:.4f}")
    print(f"  精确率: {test_precision:.4f}")
    print(f"  召回率: {test_recall:.4f}")
    print(f"  F1分数: {test_f1:.4f}")
    print(f"  ROC-AUC: {test_roc_auc:.4f}")

    print("\n混淆矩阵:")
    print("训练集混淆矩阵:")
    print(cm_train)
    print("测试集混淆矩阵:")
    print(cm_test)

    print("\n详细分类报告 - 测试集:")
    print(classification_report(y_test, y_test_pred, target_names=['正常', '断纱']))

    print("\n时间统计:")
    print(f"  模型加载时间: {load_time:.2f}秒")
    print(f"  预测时间: {predict_time:.2f}秒")
    print(f"  总执行时间: {total_time:.2f}秒")

    # Metrics summary table.
    results_df = pd.DataFrame(metrics_results)
    print("\n" + "=" * 80)
    print("更新后RF_balanced模型性能总结")
    print("=" * 80)
    print(results_df.round(4))

    # Timing table.
    time_df = pd.DataFrame(execution_times)
    print("\n时间统计:")
    print(time_df.round(2))

    # Persist both tables.
    results_df.to_csv('RF_balanced_updated_performance.csv', index=False)
    time_df.to_csv('RF_balanced_updated_timing.csv', index=False)
    print("\n结果已保存至: RF_balanced_updated_performance.csv 和 RF_balanced_updated_timing.csv")

    # Feature importances of the forest, paired with the feature names that
    # were stored in the bundle at training time.
    print("\n=== 特征重要性分析 ===")
    feature_importances = rf_balanced_model.feature_importances_
    feature_names = model_info['feature_names']

    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances
    }).sort_values('Importance', ascending=False)

    print("前10个最重要的特征:")
    print(importance_df.head(10).round(4))

    importance_df.to_csv('RF_balanced_feature_importance.csv', index=False)
    print("特征重要性已保存至: RF_balanced_feature_importance.csv")

    evaluation_succeeded = True

except FileNotFoundError:
    print("错误: 未找到保存的RF_balanced模型文件 'rf_balanced_updated.pkl'")
    print("请先运行模型创建和保存代码")
except Exception as e:
    # Keep the cell best-effort, but show where the failure happened: the
    # message alone does not identify the failing statement.
    print(f"模型加载或评估过程中出错: {e}")
    traceback.print_exc()

# Report completion only when the evaluation actually finished.
if evaluation_succeeded:
    print("\n评估完成！")

In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import joblib

# Plot styling (English, publication-style): start from matplotlib's default
# style, then override the rc settings group by group.
plt.style.use('default')
plt.rc('font', family='Arial', weight='bold', size=16)
plt.rc(
    'axes',
    labelweight='bold',
    titleweight='bold',
    linewidth=2.5,
    titlesize=18,
    labelsize=17,
    facecolor='white',
    unicode_minus=False,
)
plt.rc(('xtick', 'ytick'), labelsize=15)
plt.rc('legend', fontsize=15)
plt.rc('grid', linewidth=1.2)
plt.rc('lines', linewidth=3.5)
plt.rc('figure', dpi=1200, facecolor='white')
plt.rc('savefig', dpi=1200)

import traceback  # used below to report unexpected failures with full context


def _save_confusion_heatmap(cm, title, output_path, saved_label):
    """Draw a confusion matrix as an annotated heatmap, save it and show it.

    cm is the matrix to plot, title the figure title, output_path the PNG file
    to write (at 300 dpi) and saved_label the prefix of the printed
    confirmation message (e.g. "Training set").
    """
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Normal', 'Yarn Break'],
                yticklabels=['Normal', 'Yarn Break'],
                annot_kws={'size': 16, 'weight': 'bold'},  # large, bold cell counts
                linewidths=2, linecolor='black')  # heavy cell borders

    plt.title(title, fontsize=16, fontweight='bold', pad=20)
    plt.xlabel('Predicted Label', fontsize=14, fontweight='bold')
    plt.ylabel('True Label', fontsize=14, fontweight='bold')

    # Thicken the outer frame of the plot.
    for spine in plt.gca().spines.values():
        spine.set_linewidth(3)

    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"{saved_label} confusion matrix saved to: {output_path}")
    plt.show()


# Evaluate the persisted RF_balanced model on the training and test splits:
# load it from rf_balanced_updated.pkl, compute classification metrics, plot
# and save both confusion matrices, print a report, and write the metrics,
# timings and feature importances to CSV files.
print("=== Updated RF_balanced Model Performance Evaluation ===")
print("Using Optimized Class Weights for Imbalanced Data")
print("=" * 50)

# Rows for the metrics table and the timing table.
metrics_results = []
execution_times = []

# Set to True only when the whole evaluation ran through, so the closing
# message is not printed after a failure.
evaluation_succeeded = False

try:
    # Load the model bundle written by the training cell.
    # NOTE(review): joblib files are pickle-based; only load files you trust.
    print("Loading saved RF_balanced model...")
    load_start_time = time.time()
    model_info = joblib.load('rf_balanced_updated.pkl')
    load_time = time.time() - load_start_time

    # Unpack the model and the statistics stored next to it.
    rf_balanced_model = model_info['model']
    class_weights = model_info['class_weights']
    train_break_count = model_info['train_break_count']
    train_break_ratio = model_info['train_break_ratio']

    print(f"Model loaded successfully, time: {load_time:.2f} seconds")
    print(f"Class weights used: {class_weights}")
    print(f"Training set yarn break events: {train_break_count} (ratio: {train_break_ratio:.2%})")

    # The loaded model is already fitted, so go straight to prediction.
    print("\nStarting model prediction...")
    predict_start_time = time.time()

    # Hard labels for both splits.
    y_train_pred = rf_balanced_model.predict(X_train)
    y_test_pred = rf_balanced_model.predict(X_test)

    # Positive-class probabilities (column 1), used for ROC-AUC.
    y_train_pred_proba = rf_balanced_model.predict_proba(X_train)[:, 1]
    y_test_pred_proba = rf_balanced_model.predict_proba(X_test)[:, 1]

    predict_time = time.time() - predict_start_time
    print(f"Prediction completed, time: {predict_time:.2f} seconds")

    # Training-set metrics; zero_division=0 scores an undefined ratio as 0.
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred, zero_division=0)
    train_recall = recall_score(y_train, y_train_pred, zero_division=0)
    train_f1 = f1_score(y_train, y_train_pred, zero_division=0)
    train_roc_auc = roc_auc_score(y_train, y_train_pred_proba)

    # Test-set metrics.
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, zero_division=0)
    test_recall = recall_score(y_test, y_test_pred, zero_division=0)
    test_f1 = f1_score(y_test, y_test_pred, zero_division=0)
    test_roc_auc = roc_auc_score(y_test, y_test_pred_proba)

    # Confusion matrices for both splits.
    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)

    total_time = load_time + predict_time

    # Confusion-matrix figures, one per split, drawn by the shared helper.
    print("\n=== Generating Confusion Matrix Visualizations ===")

    _save_confusion_heatmap(
        cm_train,
        'Training Set Confusion Matrix - RF_balanced Model',
        'RF_balanced_training_confusion_matrix.png',
        'Training set',
    )
    _save_confusion_heatmap(
        cm_test,
        'Test Set Confusion Matrix - RF_balanced Model',
        'RF_balanced_test_confusion_matrix.png',
        'Test set',
    )

    # Collect one metrics row per split and one timing row.
    metrics_results.append({
        'Dataset': 'Training Set',
        'Accuracy': train_accuracy,
        'Precision': train_precision,
        'Recall': train_recall,
        'F1-Score': train_f1,
        'ROC-AUC': train_roc_auc
    })

    metrics_results.append({
        'Dataset': 'Test Set',
        'Accuracy': test_accuracy,
        'Precision': test_precision,
        'Recall': test_recall,
        'F1-Score': test_f1,
        'ROC-AUC': test_roc_auc
    })

    execution_times.append({
        'Model Loading Time': load_time,
        'Prediction Time': predict_time,
        'Total Time': total_time
    })

    # Human-readable report.
    print("\nUpdated RF_balanced Model Evaluation Results:")
    print("Training Set Performance:")
    print(f"  Accuracy: {train_accuracy:.4f}")
    print(f"  Precision: {train_precision:.4f}")
    print(f"  Recall: {train_recall:.4f}")
    print(f"  F1-Score: {train_f1:.4f}")
    print(f"  ROC-AUC: {train_roc_auc:.4f}")

    print("\nTest Set Performance:")
    print(f"  Accuracy: {test_accuracy:.4f}")
    print(f"  Precision: {test_precision:.4f}")
    print(f"  Recall: {test_recall:.4f}")
    print(f"  F1-Score: {test_f1:.4f}")
    print(f"  ROC-AUC: {test_roc_auc:.4f}")

    print("\nConfusion Matrices:")
    print("Training Set Confusion Matrix:")
    print(cm_train)
    print("Test Set Confusion Matrix:")
    print(cm_test)

    print("\nDetailed Classification Report - Test Set:")
    print(classification_report(y_test, y_test_pred, target_names=['Normal', 'Yarn Break']))

    print("\nTime Statistics:")
    print(f"  Model Loading Time: {load_time:.2f} seconds")
    print(f"  Prediction Time: {predict_time:.2f} seconds")
    print(f"  Total Execution Time: {total_time:.2f} seconds")

    # Metrics summary table.
    results_df = pd.DataFrame(metrics_results)
    print("\n" + "=" * 80)
    print("Updated RF_balanced Model Performance Summary")
    print("=" * 80)
    print(results_df.round(4))

    # Timing table.
    time_df = pd.DataFrame(execution_times)
    print("\nTime Statistics:")
    print(time_df.round(2))

    # Persist both tables.
    results_df.to_csv('RF_balanced_updated_performance.csv', index=False)
    time_df.to_csv('RF_balanced_updated_timing.csv', index=False)
    print("\nResults saved to: RF_balanced_updated_performance.csv and RF_balanced_updated_timing.csv")

    # Feature importances of the forest, paired with the feature names that
    # were stored in the bundle at training time.
    print("\n=== Feature Importance Analysis ===")
    feature_importances = rf_balanced_model.feature_importances_
    feature_names = model_info['feature_names']

    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances
    }).sort_values('Importance', ascending=False)

    print("Top 10 Most Important Features:")
    print(importance_df.head(10).round(4))

    importance_df.to_csv('RF_balanced_feature_importance.csv', index=False)
    print("Feature importance saved to: RF_balanced_feature_importance.csv")

    evaluation_succeeded = True

except FileNotFoundError:
    print("Error: Saved RF_balanced model file 'rf_balanced_updated.pkl' not found")
    print("Please run the model creation and saving code first")
except Exception as e:
    # Keep the cell best-effort, but show where the failure happened: the
    # message alone does not identify the failing statement.
    print(f"Error during model loading or evaluation: {e}")
    traceback.print_exc()

# Report completion only when the evaluation actually finished.
if evaluation_succeeded:
    print("\nEvaluation completed!")

In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import joblib

# Plot styling: bold, publication-style figures with English labels.
plt.style.use('default')
plt.rcParams.update({
    # Prefer Arial, but fall back to other sans-serif fonts (DejaVu Sans ships
    # with Matplotlib) so machines without Arial do not emit "findfont"
    # warnings on every text element.
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial', 'Helvetica', 'Liberation Sans', 'DejaVu Sans'],
    'font.weight': 'bold',
    'axes.labelweight': 'bold',
    'axes.titleweight': 'bold',
    'axes.linewidth': 2.5,
    'font.size': 16,
    'axes.titlesize': 18,
    'axes.labelsize': 17,
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
    'legend.fontsize': 15,
    'grid.linewidth': 1.2,
    'lines.linewidth': 3.5,
    # On-screen/layout resolution. Kept moderate on purpose: at 1200 dpi every
    # plt.show()/tight_layout() would rasterise a multi-hundred-megapixel
    # canvas. Saved files are controlled separately by 'savefig.dpi' below.
    'figure.dpi': 150,
    'savefig.dpi': 1200,
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    # Render minus signs with the ASCII hyphen instead of the Unicode minus.
    'axes.unicode_minus': False
})

# Run banner, emitted with a single print call (one output line per argument).
print(
    "=== Updated RF_balanced Model Performance Evaluation ===",
    "Using Optimized Class Weights for Imbalanced Data",
    "=" * 50,
    sep="\n",
)

# Empty accumulators: metric rows go into metrics_results, timing rows into
# execution_times.
metrics_results, execution_times = [], []

# Evaluate the saved RF_balanced model on the train/test split and export the
# metrics, timings, plots and feature importances.
#
# Only the load step is covered by the FileNotFoundError handler, so that a
# missing output location later on (CSV/PNG/pickle writes) is not misreported
# as a missing model file.
try:
    # Load the previously saved updated RF_balanced model.
    # NOTE(security): joblib.load unpickles arbitrary Python objects, so the
    # file must come from a trusted source.
    print("Loading saved RF_balanced model...")
    load_start_time = time.time()
    model_info = joblib.load('rf_balanced_updated.pkl')
    load_time = time.time() - load_start_time
except FileNotFoundError:
    print("Error: Saved RF_balanced model file 'rf_balanced_updated.pkl' not found")
    print("Please run the model creation and saving code first")
except Exception as e:
    import traceback
    print(f"Error during model loading or evaluation: {e}")
    # Show where the failure happened instead of only its message.
    traceback.print_exc()
else:
    try:
        # Display names for the two classes (label 0, label 1), shared by the
        # classification report and the confusion-matrix axes.
        class_labels = ['Normal', 'Yarn Break']

        # Get model and related information
        rf_balanced_model = model_info['model']
        class_weights = model_info['class_weights']
        train_break_count = model_info['train_break_count']
        train_break_ratio = model_info['train_break_ratio']

        print(f"Model loaded successfully, time: {load_time:.2f} seconds")
        print(f"Class weights used: {class_weights}")
        print(f"Training set yarn break events: {train_break_count} (ratio: {train_break_ratio:.2%})")

        # Since the model is already trained, proceed directly to prediction
        print("\nStarting model prediction...")
        predict_start_time = time.time()

        # Make predictions on training and test sets
        y_train_pred = rf_balanced_model.predict(X_train)
        y_test_pred = rf_balanced_model.predict(X_test)

        # Probability of the positive class (second column of predict_proba)
        y_train_pred_proba = rf_balanced_model.predict_proba(X_train)[:, 1]
        y_test_pred_proba = rf_balanced_model.predict_proba(X_test)[:, 1]

        predict_time = time.time() - predict_start_time
        print(f"Prediction completed, time: {predict_time:.2f} seconds")

        # Calculate training set metrics
        train_accuracy = accuracy_score(y_train, y_train_pred)
        train_precision = precision_score(y_train, y_train_pred, zero_division=0)
        train_recall = recall_score(y_train, y_train_pred, zero_division=0)
        train_f1 = f1_score(y_train, y_train_pred, zero_division=0)
        train_roc_auc = roc_auc_score(y_train, y_train_pred_proba)

        # Calculate test set metrics
        test_accuracy = accuracy_score(y_test, y_test_pred)
        test_precision = precision_score(y_test, y_test_pred, zero_division=0)
        test_recall = recall_score(y_test, y_test_pred, zero_division=0)
        test_f1 = f1_score(y_test, y_test_pred, zero_division=0)
        test_roc_auc = roc_auc_score(y_test, y_test_pred_proba)

        # Calculate confusion matrices
        cm_train = confusion_matrix(y_train, y_train_pred)
        cm_test = confusion_matrix(y_test, y_test_pred)

        total_time = load_time + predict_time

        # Store results
        metrics_results.append({
            'Dataset': 'Training Set',
            'Accuracy': train_accuracy,
            'Precision': train_precision,
            'Recall': train_recall,
            'F1-Score': train_f1,
            'ROC-AUC': train_roc_auc
        })

        metrics_results.append({
            'Dataset': 'Test Set',
            'Accuracy': test_accuracy,
            'Precision': test_precision,
            'Recall': test_recall,
            'F1-Score': test_f1,
            'ROC-AUC': test_roc_auc
        })

        execution_times.append({
            'Model Loading Time': load_time,
            'Prediction Time': predict_time,
            'Total Time': total_time
        })

        # Print results
        print("\nUpdated RF_balanced Model Evaluation Results:")
        print("Training Set Performance:")
        print(f"  Accuracy: {train_accuracy:.4f}")
        print(f"  Precision: {train_precision:.4f}")
        print(f"  Recall: {train_recall:.4f}")
        print(f"  F1-Score: {train_f1:.4f}")
        print(f"  ROC-AUC: {train_roc_auc:.4f}")

        print("\nTest Set Performance:")
        print(f"  Accuracy: {test_accuracy:.4f}")
        print(f"  Precision: {test_precision:.4f}")
        print(f"  Recall: {test_recall:.4f}")
        print(f"  F1-Score: {test_f1:.4f}")
        print(f"  ROC-AUC: {test_roc_auc:.4f}")

        print("\nConfusion Matrices:")
        print("Training Set Confusion Matrix:")
        print(cm_train)
        print("Test Set Confusion Matrix:")
        print(cm_test)

        print("\nDetailed Classification Report - Test Set:")
        print(classification_report(y_test, y_test_pred, target_names=class_labels))

        print("\nTime Statistics:")
        print(f"  Model Loading Time: {load_time:.2f} seconds")
        print(f"  Prediction Time: {predict_time:.2f} seconds")
        print(f"  Total Execution Time: {total_time:.2f} seconds")

        # Create results DataFrame
        results_df = pd.DataFrame(metrics_results)
        print("\n" + "=" * 80)
        print("Updated RF_balanced Model Performance Summary")
        print("=" * 80)
        print(results_df.round(4))

        # Create time statistics DataFrame
        time_df = pd.DataFrame(execution_times)
        print("\nTime Statistics:")
        print(time_df.round(2))

        # Save results to CSV files
        results_df.to_csv('RF_balanced_updated_performance.csv', index=False)
        time_df.to_csv('RF_balanced_updated_timing.csv', index=False)
        print("\nResults saved to: RF_balanced_updated_performance.csv and RF_balanced_updated_timing.csv")

        # Visualize both confusion matrices side by side in one figure
        print("\n=== Generating Confusion Matrix Visualizations ===")

        fig, axes = plt.subplots(1, 2, figsize=(15, 6))
        fig.suptitle('RF_balanced Model Performance - Confusion Matrices', fontsize=16, fontweight='bold')

        # Same styling for both panels; only the matrix and the title differ.
        for ax, cm, split_name in zip(axes, (cm_train, cm_test), ('Training Set', 'Test Set')):
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                        annot_kws={'size': 16, 'weight': 'bold'},  # Enlarge and bold annotation font
                        linewidths=2, linecolor='black',  # Bold cell borders
                        xticklabels=class_labels,  # Class names instead of bare 0/1 ticks
                        yticklabels=class_labels,
                        cbar_kws={'shrink': 0.8})
            ax.set_title(f'{split_name} - RF_balanced Model', fontsize=16, fontweight='bold', pad=20)
            ax.set_xlabel('Predicted Label', fontsize=14, fontweight='bold')
            ax.set_ylabel('True Label', fontsize=14, fontweight='bold')
            # labelrotation=0 keeps the class names horizontal on both axes.
            ax.tick_params(axis='both', which='major', labelsize=13, labelrotation=0)

            # Bold the entire plot border
            for spine in ax.spines.values():
                spine.set_linewidth(3)

        # Leave the top 4% of the canvas for the suptitle.
        plt.tight_layout(rect=[0, 0, 1, 0.96])
        plt.savefig('RF_balanced_confusion_matrices.png', dpi=300, bbox_inches='tight')
        print("Confusion matrices saved to: RF_balanced_confusion_matrices.png")
        plt.show()

        # Probability distribution visualization
        plt.figure(figsize=(12, 6))

        plt.subplot(1, 2, 1)
        # Split the positive-class probabilities by the true label.
        break_proba = y_test_pred_proba[y_test == 1]
        normal_proba = y_test_pred_proba[y_test == 0]

        plt.hist(normal_proba, bins=50, alpha=0.7, label='Normal Events', color='green', edgecolor='black')
        plt.hist(break_proba, bins=20, alpha=0.7, label='Yarn Break Events', color='red', edgecolor='black')
        plt.axvline(x=0.5, color='orange', linestyle='--', linewidth=2, label='Default Threshold(0.5)')
        plt.xlabel('Predicted Probability', fontweight='bold')
        plt.ylabel('Frequency', fontweight='bold')
        plt.title('Test Set Predicted Probability Distribution', fontweight='bold')
        plt.legend()
        plt.grid(True, alpha=0.3)

        plt.subplot(1, 2, 2)
        # ROC curve
        from sklearn.metrics import roc_curve

        fpr, tpr, thresholds = roc_curve(y_test, y_test_pred_proba)
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC Curve (AUC = {test_roc_auc:.3f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
        plt.xlabel('False Positive Rate', fontweight='bold')
        plt.ylabel('True Positive Rate', fontweight='bold')
        plt.title('ROC Curve', fontweight='bold')
        plt.legend(loc='lower right')
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('RF_balanced_probability_roc.png', dpi=300, bbox_inches='tight')
        print("Probability distribution and ROC curve saved to: RF_balanced_probability_roc.png")
        plt.show()

        # Feature importance analysis
        print("\n=== Feature Importance Analysis ===")
        feature_importances = rf_balanced_model.feature_importances_
        feature_names = model_info['feature_names']

        # Create feature importance DataFrame, most important first
        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': feature_importances
        }).sort_values('Importance', ascending=False)

        print("Top 10 Most Important Features:")
        print(importance_df.head(10).round(4))

        # Save feature importance
        importance_df.to_csv('RF_balanced_feature_importance.csv', index=False)
        print("Feature importance saved to: RF_balanced_feature_importance.csv")

        # Final conclusions
        # NOTE(review): the recommendation/technical-note text below is fixed
        # wording and is printed regardless of the metric values above.
        print("\n" + "=" * 80)
        print("🎯 Final Conclusions and Recommendations")
        print("=" * 80)
        print("✅ Model Performance Summary:")
        print(f"   - ROC-AUC: {test_roc_auc:.3f}")
        print(f"   - F1-Score: {test_f1:.4f}")
        print(f"   - Precision: {test_precision:.4f}")
        print(f"   - Recall: {test_recall:.4f}")
        print(f"   - Accuracy: {test_accuracy:.4f}")

        print("\n📋 Production Environment Recommendations:")
        print("1. Model is ready for deployment with optimized class weights")
        print("2. Monitor model performance and data drift in production")
        print("3. Consider periodic retraining with new data")
        print("4. Use default threshold (0.5) for classification")

        print("\n💡 Technical Notes:")
        print("- Model shows good discrimination ability with ROC-AUC performance")
        print("- Performance metrics indicate balanced performance across classes")
        print("- Confusion matrix shows model's prediction pattern")
        print("- Optimized class weights help handle imbalanced data")

        # Save the model together with its test-set performance
        updated_model_info = {
            'model': rf_balanced_model,
            'class_weights': class_weights,
            'performance': {
                'test_accuracy': test_accuracy,
                'test_precision': test_precision,
                'test_recall': test_recall,
                'test_f1': test_f1,
                'test_roc_auc': test_roc_auc
            }
        }

        joblib.dump(updated_model_info, 'rf_balanced_final.pkl')
        print("\n💾 Model information saved to: rf_balanced_final.pkl")

    except Exception as e:
        import traceback
        print(f"Error during model loading or evaluation: {e}")
        # Show where the failure happened instead of only its message.
        traceback.print_exc()

print("\nEvaluation completed!")

In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import joblib

# Plot styling: bold, publication-style figures with English labels.
plt.style.use('default')
plt.rcParams.update({
    # Prefer Arial, but fall back to other sans-serif fonts (DejaVu Sans ships
    # with Matplotlib) so machines without Arial do not emit "findfont"
    # warnings on every text element.
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial', 'Helvetica', 'Liberation Sans', 'DejaVu Sans'],
    'font.weight': 'bold',
    'axes.labelweight': 'bold',
    'axes.titleweight': 'bold',
    'axes.linewidth': 2.5,
    'font.size': 16,
    'axes.titlesize': 18,
    'axes.labelsize': 17,
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
    'legend.fontsize': 15,
    'grid.linewidth': 1.2,
    'lines.linewidth': 3.5,
    # On-screen/layout resolution. Kept moderate on purpose: at 1200 dpi every
    # tight_layout() call would rasterise a multi-hundred-megapixel canvas.
    # Saved files are controlled separately by 'savefig.dpi' below.
    'figure.dpi': 150,
    'savefig.dpi': 1200,
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    # Render minus signs with the ASCII hyphen instead of the Unicode minus.
    'axes.unicode_minus': False
})

# Run banner, emitted with a single print call (one output line per argument).
print(
    "=== Updated RF_balanced Model Performance Evaluation ===",
    "Using Optimized Class Weights for Imbalanced Data",
    "=" * 60,
    sep="\n",
)

# Empty accumulators: metric rows go into metrics_results, timing rows into
# execution_times.
metrics_results, execution_times = [], []

def plot_confusion_matrix(cm, dataset_name, filename, dpi=1200,
                          class_labels=('Normal', 'Yarn Break')):
    """Draw a confusion matrix as an annotated heatmap and save it to a file.

    The figure is always closed before returning, including when drawing or
    saving raises, so repeated calls do not accumulate open figures.

    Args:
        cm: 2-D array of integer counts (cells are annotated with the ``'d'``
            format). Rows are labelled "True Label" and columns
            "Predicted Label".
        dataset_name: Name shown in the plot title, e.g. ``"Training Set"``.
        filename: Output path passed to ``plt.savefig``.
        dpi: Resolution of the saved image. Defaults to 1200.
        class_labels: Tick labels used on both axes, in matrix order.
            Defaults to ``('Normal', 'Yarn Break')``.
    """
    tick_labels = list(class_labels)
    fig = plt.figure(figsize=(8, 6))
    try:
        # Heatmap with the raw count written in every cell
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    cbar=True, annot_kws={"size": 16, "weight": "bold"},
                    xticklabels=tick_labels,
                    yticklabels=tick_labels)

        # Title and axis labels
        plt.title(f'Confusion Matrix - {dataset_name}\nRF_balanced Model',
                  fontsize=18, fontweight='bold', pad=20)
        plt.xlabel('Predicted Label', fontsize=17, fontweight='bold')
        plt.ylabel('True Label', fontsize=17, fontweight='bold')

        # Keep the class names horizontal and bold on both axes
        plt.xticks(rotation=0, fontsize=15, fontweight='bold')
        plt.yticks(rotation=0, fontsize=15, fontweight='bold')

        # Tighten the layout and write the image
        plt.tight_layout()
        plt.savefig(filename, dpi=dpi, bbox_inches='tight',
                    facecolor='white', edgecolor='none')
    finally:
        # Close this specific figure even if drawing or saving failed.
        plt.close(fig)
    print(f"Confusion matrix saved to: {filename}")

# Evaluate the saved RF_balanced model on the train/test split and export the
# metrics, timings, confusion-matrix images and feature importances.
#
# Only the load step is covered by the FileNotFoundError handler, so that a
# missing output location later on (CSV/PNG writes) is not misreported as a
# missing model file.
try:
    # Load the previously saved updated RF_balanced model.
    # NOTE(security): joblib.load unpickles arbitrary Python objects, so the
    # file must come from a trusted source.
    print("Loading saved RF_balanced model...")
    load_start_time = time.time()
    model_info = joblib.load('rf_balanced_updated.pkl')
    load_time = time.time() - load_start_time
except FileNotFoundError:
    print("Error: Saved RF_balanced model file 'rf_balanced_updated.pkl' not found")
    print("Please run the model creation and saving code first")
except Exception as e:
    print(f"Error during model loading or evaluation: {e}")
    import traceback
    traceback.print_exc()
else:
    try:
        # Get model and related information
        rf_balanced_model = model_info['model']
        class_weights = model_info['class_weights']
        train_break_count = model_info['train_break_count']
        train_break_ratio = model_info['train_break_ratio']

        print(f"Model loading completed, time: {load_time:.2f} seconds")
        print(f"Class weights used: {class_weights}")
        print(f"Training set yarn break events: {train_break_count} (ratio: {train_break_ratio:.2%})")

        # The model is already trained, so go straight to prediction
        print("\nStarting model prediction...")
        predict_start_time = time.time()

        # Predict on the training and test sets
        y_train_pred = rf_balanced_model.predict(X_train)
        y_test_pred = rf_balanced_model.predict(X_test)

        # Probability of the positive class (second column of predict_proba)
        y_train_pred_proba = rf_balanced_model.predict_proba(X_train)[:, 1]
        y_test_pred_proba = rf_balanced_model.predict_proba(X_test)[:, 1]

        predict_time = time.time() - predict_start_time
        print(f"Prediction completed, time: {predict_time:.2f} seconds")

        # Training set metrics
        train_accuracy = accuracy_score(y_train, y_train_pred)
        train_precision = precision_score(y_train, y_train_pred, zero_division=0)
        train_recall = recall_score(y_train, y_train_pred, zero_division=0)
        train_f1 = f1_score(y_train, y_train_pred, zero_division=0)
        train_roc_auc = roc_auc_score(y_train, y_train_pred_proba)

        # Test set metrics
        test_accuracy = accuracy_score(y_test, y_test_pred)
        test_precision = precision_score(y_test, y_test_pred, zero_division=0)
        test_recall = recall_score(y_test, y_test_pred, zero_division=0)
        test_f1 = f1_score(y_test, y_test_pred, zero_division=0)
        test_roc_auc = roc_auc_score(y_test, y_test_pred_proba)

        # Confusion matrices
        cm_train = confusion_matrix(y_train, y_train_pred)
        cm_test = confusion_matrix(y_test, y_test_pred)

        total_time = load_time + predict_time

        # Store results
        metrics_results.append({
            'Dataset': 'Training Set',
            'Accuracy': train_accuracy,
            'Precision': train_precision,
            'Recall': train_recall,
            'F1-Score': train_f1,
            'ROC-AUC': train_roc_auc
        })

        metrics_results.append({
            'Dataset': 'Test Set',
            'Accuracy': test_accuracy,
            'Precision': test_precision,
            'Recall': test_recall,
            'F1-Score': test_f1,
            'ROC-AUC': test_roc_auc
        })

        execution_times.append({
            'Model Loading Time': load_time,
            'Prediction Time': predict_time,
            'Total Time': total_time
        })

        # Print results
        print("\nUpdated RF_balanced Model Evaluation Results:")
        print("Training Set Performance:")
        print(f"  Accuracy: {train_accuracy:.4f}")
        print(f"  Precision: {train_precision:.4f}")
        print(f"  Recall: {train_recall:.4f}")
        print(f"  F1-Score: {train_f1:.4f}")
        print(f"  ROC-AUC: {train_roc_auc:.4f}")

        print("\nTest Set Performance:")
        print(f"  Accuracy: {test_accuracy:.4f}")
        print(f"  Precision: {test_precision:.4f}")
        print(f"  Recall: {test_recall:.4f}")
        print(f"  F1-Score: {test_f1:.4f}")
        print(f"  ROC-AUC: {test_roc_auc:.4f}")

        print("\nConfusion Matrices:")
        print("Training Set Confusion Matrix:")
        print(cm_train)
        print("Test Set Confusion Matrix:")
        print(cm_test)

        print("\nDetailed Classification Report - Test Set:")
        print(classification_report(y_test, y_test_pred, target_names=['Normal', 'Yarn Break']))

        # Render the confusion matrices to image files
        print("\n=== Generating Confusion Matrix Visualizations ===")
        plot_confusion_matrix(cm_train, "Training Set", "training_confusion_matrix.png")
        plot_confusion_matrix(cm_test, "Test Set", "test_confusion_matrix.png")

        print("\nTime Statistics:")
        print(f"  Model Loading Time: {load_time:.2f} seconds")
        print(f"  Prediction Time: {predict_time:.2f} seconds")
        print(f"  Total Execution Time: {total_time:.2f} seconds")

        # Create results DataFrame
        results_df = pd.DataFrame(metrics_results)
        print("\n" + "=" * 80)
        print("Updated RF_balanced Model Performance Summary")
        print("=" * 80)
        print(results_df.round(4))

        # Create time statistics DataFrame
        time_df = pd.DataFrame(execution_times)
        print("\nTime Statistics:")
        print(time_df.round(2))

        # Save results to CSV files
        results_df.to_csv('RF_balanced_updated_performance.csv', index=False)
        time_df.to_csv('RF_balanced_updated_timing.csv', index=False)
        print("\nResults saved to: RF_balanced_updated_performance.csv and RF_balanced_updated_timing.csv")

        # Feature importance analysis
        print("\n=== Feature Importance Analysis ===")
        feature_importances = rf_balanced_model.feature_importances_
        feature_names = model_info['feature_names']

        # Create feature importance DataFrame, most important first
        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': feature_importances
        }).sort_values('Importance', ascending=False)

        print("Top 10 Most Important Features:")
        print(importance_df.head(10).round(4))

        # Save feature importance
        importance_df.to_csv('RF_balanced_feature_importance.csv', index=False)
        print("Feature importance saved to: RF_balanced_feature_importance.csv")

        # Feature importance bar chart
        print("\n=== Generating Feature Importance Visualization ===")
        plt.figure(figsize=(12, 8))

        # Take (at most) the 15 most important features
        top_features = importance_df.head(15)

        # Horizontal bar chart, one bar per feature
        plt.barh(range(len(top_features)), top_features['Importance'],
                 color='steelblue', alpha=0.8, edgecolor='black', linewidth=1.2)

        # Labels and title; the count in the title follows the number of bars
        # actually drawn, so it stays correct with fewer than 15 features.
        plt.yticks(range(len(top_features)), top_features['Feature'], fontsize=14, fontweight='bold')
        plt.xlabel('Feature Importance', fontsize=17, fontweight='bold')
        plt.title(f'Top {len(top_features)} Most Important Features\nRF_balanced Model',
                  fontsize=18, fontweight='bold', pad=20)

        # Write each importance value just to the right of its bar
        for i, v in enumerate(top_features['Importance']):
            plt.text(v + 0.001, i, f'{v:.3f}', fontsize=12, fontweight='bold',
                     va='center', ha='left')

        # Most important feature at the top, light vertical grid, then save
        plt.gca().invert_yaxis()
        plt.grid(axis='x', alpha=0.3, linestyle='--')
        plt.tight_layout()
        plt.savefig('feature_importance_visualization.png', dpi=1200,
                    bbox_inches='tight', facecolor='white', edgecolor='none')
        plt.close()
        print("Feature importance visualization saved to: feature_importance_visualization.png")

    except Exception as e:
        print(f"Error during model loading or evaluation: {e}")
        import traceback
        traceback.print_exc()

print("\nEvaluation completed!")

In [None]:
import time
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Plot styling: bold, publication-style figures with English labels.
plt.style.use('default')
plt.rcParams.update({
    # Prefer Arial, but fall back to other sans-serif fonts (DejaVu Sans ships
    # with Matplotlib) so machines without Arial do not emit "findfont"
    # warnings on every text element.
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial', 'Helvetica', 'Liberation Sans', 'DejaVu Sans'],
    'font.weight': 'bold',
    'axes.labelweight': 'bold',
    'axes.titleweight': 'bold',
    'axes.linewidth': 2.5,
    'font.size': 16,
    'axes.titlesize': 18,
    'axes.labelsize': 17,
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
    'legend.fontsize': 15,
    'grid.linewidth': 1.2,
    'lines.linewidth': 3.5,
    # On-screen/layout resolution. Kept moderate on purpose: at 1200 dpi every
    # plt.show()/tight_layout() would rasterise a multi-hundred-megapixel
    # canvas. Saved files are controlled separately by 'savefig.dpi' below.
    'figure.dpi': 150,
    'savefig.dpi': 1200,
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    # Render minus signs with the ASCII hyphen instead of the Unicode minus.
    'axes.unicode_minus': False
})

print("=== RF_balanced Model Evaluation (Default Threshold) ===")
print("=" * 50)

try:
    # Load the previously saved updated RF_balanced model
    print("Loading saved RF_balanced model...")
    model_info = joblib.load('rf_balanced_updated.pkl')

    # Get model and related information
    rf_balanced_model = model_info['model']
    class_weights = model_info['class_weights']

    print(f"Model loaded successfully")
    print(f"Class weights used: {class_weights}")

    # Get probability predictions
    y_train_pred_proba = rf_balanced_model.predict_proba(X_train)[:, 1]
    y_test_pred_proba = rf_balanced_model.predict_proba(X_test)[:, 1]

    # Default threshold (0.5) results
    y_train_pred_default = (y_train_pred_proba >= 0.5).astype(int)
    y_test_pred_default = (y_test_pred_proba >= 0.5).astype(int)

    # Calculate metrics with default threshold
    train_accuracy_def = accuracy_score(y_train, y_train_pred_default)
    train_precision_def = precision_score(y_train, y_train_pred_default, zero_division=0)
    train_recall_def = recall_score(y_train, y_train_pred_default, zero_division=0)
    train_f1_def = f1_score(y_train, y_train_pred_default, zero_division=0)
    train_roc_auc = roc_auc_score(y_train, y_train_pred_proba)

    test_accuracy_def = accuracy_score(y_test, y_test_pred_default)
    test_precision_def = precision_score(y_test, y_test_pred_default, zero_division=0)
    test_recall_def = recall_score(y_test, y_test_pred_default, zero_division=0)
    test_f1_def = f1_score(y_test, y_test_pred_default, zero_division=0)
    test_roc_auc = roc_auc_score(y_test, y_test_pred_proba)

    # Create results table
    results_comparison = []

    # Add results to table
    results_comparison.append({
        'Threshold': 'Default (0.5)',
        'Dataset': 'Training Set',
        'Accuracy': train_accuracy_def,
        'Precision': train_precision_def,
        'Recall': train_recall_def,
        'F1-Score': train_f1_def,
        'ROC-AUC': train_roc_auc
    })

    results_comparison.append({
        'Threshold': 'Default (0.5)',
        'Dataset': 'Test Set',
        'Accuracy': test_accuracy_def,
        'Precision': test_precision_def,
        'Recall': test_recall_def,
        'F1-Score': test_f1_def,
        'ROC-AUC': test_roc_auc
    })

    results_df = pd.DataFrame(results_comparison)

    # Print results
    print("\n" + "=" * 80)
    print("Model Performance with Default Threshold (0.5)")
    print("=" * 80)
    print(results_df.round(4))

    # Confusion matrix - Only Default Threshold
    cm_train_default = confusion_matrix(y_train, y_train_pred_default)
    cm_test_default = confusion_matrix(y_test, y_test_pred_default)

    print(f"\nDefault Threshold(0.5) - Training Set Confusion Matrix:")
    print(cm_train_default)
    print(f"Default Threshold(0.5) - Test Set Confusion Matrix:")
    print(cm_test_default)

    # Detailed classification report - Only Default Threshold
    print(f"\nDefault Threshold(0.5) - Test Set Detailed Classification Report:")
    print(classification_report(y_test, y_test_pred_default, target_names=['Normal', 'Yarn Break']))

    # Visualization - Only Default Threshold Confusion Matrix
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    fig.suptitle('Model Performance with Default Threshold (0.5)', fontsize=16, fontweight='bold')

    # Training set confusion matrix with enhanced styling
    sns.heatmap(cm_train_default, annot=True, fmt='d', cmap='Blues', ax=axes[0],
                annot_kws={'size': 16, 'weight': 'bold'},  # 增大注释字体并加粗
                linewidths=2, linecolor='black',  # 加粗单元格边框
                cbar_kws={'shrink': 0.8})
    axes[0].set_title('Training Set - Default Threshold(0.5)', fontsize=16, fontweight='bold', pad=20)
    axes[0].set_xlabel('Predicted Label', fontsize=14, fontweight='bold')
    axes[0].set_ylabel('True Label', fontsize=14, fontweight='bold')
    axes[0].tick_params(axis='both', which='major', labelsize=13)

    # 加粗整个图的边框
    for spine in axes[0].spines.values():
        spine.set_linewidth(3)

    # Test set confusion matrix with enhanced styling  混肴矩阵设置 /////////////////////
    sns.heatmap(cm_test_default, annot=True, fmt='d', cmap='Blues', ax=axes[1],
                annot_kws={'size': 16, 'weight': 'bold'},  # 增大注释字体并加粗
                linewidths=2, linecolor='black',  # 加粗单元格边框
                cbar_kws={'shrink': 0.8})
    axes[1].set_title('Test Set - Default Threshold(0.5)', fontsize=16, fontweight='bold', pad=20)
    axes[1].set_xlabel('Predicted Label', fontsize=14, fontweight='bold')
    axes[1].set_ylabel('True Label', fontsize=14, fontweight='bold')
    axes[1].tick_params(axis='both', which='major', labelsize=13)

    # 加粗整个图的边框
    for spine in axes[1].spines.values():
        spine.set_linewidth(3)

    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()

    # Probability distribution visualization
    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    break_proba = y_test_pred_proba[y_test == 1]
    normal_proba = y_test_pred_proba[y_test == 0]

    plt.hist(normal_proba, bins=50, alpha=0.7, label='Normal Events', color='green', edgecolor='black')
    plt.hist(break_proba, bins=20, alpha=0.7, label='Yarn Break Events', color='red', edgecolor='black')
    plt.axvline(x=0.5, color='orange', linestyle='--', linewidth=2, label='Default Threshold(0.5)')
    plt.xlabel('Predicted Probability', fontweight='bold')
    plt.ylabel('Frequency', fontweight='bold')
    plt.title('Test Set Predicted Probability Distribution', fontweight='bold')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.subplot(1, 2, 2)
    # ROC curve
    from sklearn.metrics import roc_curve

    fpr, tpr, thresholds = roc_curve(y_test, y_test_pred_proba)
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC Curve (AUC = {test_roc_auc:.3f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
    plt.xlabel('False Positive Rate', fontweight='bold')
    plt.ylabel('True Positive Rate', fontweight='bold')
    plt.title('ROC Curve', fontweight='bold')
    plt.legend(loc='lower right')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Final conclusions
    print("\n" + "=" * 80)
    print("🎯 Final Conclusions and Recommendations")
    print("=" * 80)
    print("✅ Model Performance Summary:")
    print(f"   - ROC-AUC: {test_roc_auc:.3f}")
    print(f"   - F1-Score: {test_f1_def:.4f}")
    print(f"   - Precision: {test_precision_def:.4f}")
    print(f"   - Recall: {test_recall_def:.4f}")
    print(f"   - Accuracy: {test_accuracy_def:.4f}")

    print("\n📋 Production Environment Recommendations:")
    print("1. Model is ready for deployment with default threshold (0.5)")
    print("2. Monitor model performance and data drift in production")
    print("3. Consider periodic retraining with new data")

    print(f"\n💡 Technical Notes:")
    print("- Model shows good discrimination ability with ROC-AUC performance")
    print("- Performance metrics indicate balanced performance across classes")
    print("- Confusion matrix shows model's prediction pattern")

    # Save model information
    model_info = {
        'model': rf_balanced_model,
        'class_weights': class_weights,
        'performance': {
            'test_accuracy': test_accuracy_def,
            'test_precision': test_precision_def,
            'test_recall': test_recall_def,
            'test_f1': test_f1_def,
            'test_roc_auc': test_roc_auc
        }
    }

    joblib.dump(model_info, 'rf_balanced_default.pkl')
    print(f"\n💾 Model information saved to: rf_balanced_default.pkl")

except FileNotFoundError:
    print("Error: Saved RF_balanced model file 'rf_balanced_updated.pkl' not found")
    print("Please run the model creation and saving code first")

    # If saved model is not found, fall back to original code
    print("Attempting to use model from original models_balanced...")

    # Extract RF_balanced model directly from models_balanced
    rf_balanced_model = None
    for name, model in models_balanced:
        if name == 'RF_balanced':
            rf_balanced_model = model
            break

    if rf_balanced_model is None:
        print("Error: RF_balanced model not found")
    else:
        try:
            # Train model (if not already trained)
            rf_balanced_model.fit(X_train, y_train)

            # Get probability predictions
            y_train_pred_proba = rf_balanced_model.predict_proba(X_train)[:, 1]
            y_test_pred_proba = rf_balanced_model.predict_proba(X_test)[:, 1]

            # Default threshold (0.5) results
            y_train_pred_default = (y_train_pred_proba >= 0.5).astype(int)
            y_test_pred_default = (y_test_pred_proba >= 0.5).astype(int)

            # Calculate metrics with default threshold
            train_accuracy_def = accuracy_score(y_train, y_train_pred_default)
            train_precision_def = precision_score(y_train, y_train_pred_default, zero_division=0)
            train_recall_def = recall_score(y_train, y_train_pred_default, zero_division=0)
            train_f1_def = f1_score(y_train, y_train_pred_default, zero_division=0)
            train_roc_auc = roc_auc_score(y_train, y_train_pred_proba)

            test_accuracy_def = accuracy_score(y_test, y_test_pred_default)
            test_precision_def = precision_score(y_test, y_test_pred_default, zero_division=0)
            test_recall_def = recall_score(y_test, y_test_pred_default, zero_division=0)
            test_f1_def = f1_score(y_test, y_test_pred_default, zero_division=0)
            test_roc_auc = roc_auc_score(y_test, y_test_pred_proba)

            # Create results table
            results_comparison = []

            # Add results to table
            results_comparison.append({
                'Threshold': 'Default (0.5)',
                'Dataset': 'Training Set',
                'Accuracy': train_accuracy_def,
                'Precision': train_precision_def,
                'Recall': train_recall_def,
                'F1-Score': train_f1_def,
                'ROC-AUC': train_roc_auc
            })

            results_comparison.append({
                'Threshold': 'Default (0.5)',
                'Dataset': 'Test Set',
                'Accuracy': test_accuracy_def,
                'Precision': test_precision_def,
                'Recall': test_recall_def,
                'F1-Score': test_f1_def,
                'ROC-AUC': test_roc_auc
            })

            results_df = pd.DataFrame(results_comparison)

            # Print results
            print("\n" + "=" * 80)
            print("Model Performance with Default Threshold (0.5)")
            print("=" * 80)
            print(results_df.round(4))

            # Confusion matrix - Only Default Threshold
            cm_train_default = confusion_matrix(y_train, y_train_pred_default)
            cm_test_default = confusion_matrix(y_test, y_test_pred_default)

            print(f"\nDefault Threshold(0.5) - Training Set Confusion Matrix:")
            print(cm_train_default)
            print(f"Default Threshold(0.5) - Test Set Confusion Matrix:")
            print(cm_test_default)

            # Detailed classification report - Only Default Threshold
            print(f"\nDefault Threshold(0.5) - Test Set Detailed Classification Report:")
            print(classification_report(y_test, y_test_pred_default, target_names=['Normal', 'Yarn Break']))

            # Visualization - Only Default Threshold Confusion Matrix with enhanced styling
            fig, axes = plt.subplots(1, 2, figsize=(18, 6))
            fig.suptitle('Model Performance with Default Threshold (0.5)', fontsize=18, fontweight='bold')

            # Training set confusion matrix with enhanced styling
            sns.heatmap(cm_train_default, annot=True, fmt='d', cmap='Blues', ax=axes[0],
                        annot_kws={'size': 20, 'weight': 'bold'},  # 增大注释字体并加粗
                        linewidths=2, linecolor='black',  # 加粗单元格边框
                        cbar_kws={'shrink': 0.8})
            axes[0].set_title('Training Set - Default Threshold(0.5)', fontsize=16, fontweight='bold', pad=20)
            axes[0].set_xlabel('Predicted Label', fontsize=20, fontweight='bold')
            axes[0].set_ylabel('True Label', fontsize=20, fontweight='bold')
            axes[0].tick_params(axis='both', which='major', labelsize=14)

            # 加粗整个图的边框
            for spine in axes[0].spines.values():
                spine.set_linewidth(3)

            # Test set confusion matrix with enhanced styling
            sns.heatmap(cm_test_default, annot=True, fmt='d', cmap='Blues', ax=axes[1],
                        annot_kws={'size': 20, 'weight': 'bold'},  # 增大注释字体并加粗
                        linewidths=2, linecolor='black',  # 加粗单元格边框
                        cbar_kws={'shrink': 0.8})
            axes[1].set_title('Test Set - Default Threshold(0.5)', fontsize=18, fontweight='bold', pad=20)
            axes[1].set_xlabel('Predicted Label', fontsize=20, fontweight='bold')
            axes[1].set_ylabel('True Label', fontsize=20, fontweight='bold')
            axes[1].tick_params(axis='both', which='major', labelsize=14)

            # 加粗整个图的边框
            for spine in axes[1].spines.values():
                spine.set_linewidth(3)

            plt.tight_layout(rect=[0, 0, 1, 0.96])
            plt.show()

            # Probability distribution visualization
            plt.figure(figsize=(12, 6))

            plt.subplot(1, 2, 1)
            break_proba = y_test_pred_proba[y_test == 1]
            normal_proba = y_test_pred_proba[y_test == 0]

            plt.hist(normal_proba, bins=50, alpha=0.7, label='Normal Events', color='green', edgecolor='black')
            plt.hist(break_proba, bins=20, alpha=0.7, label='Yarn Break Events', color='red', edgecolor='black')
            plt.axvline(x=0.5, color='orange', linestyle='--', linewidth=2, label='Default Threshold(0.5)')
            plt.xlabel('Predicted Probability', fontweight='bold')
            plt.ylabel('Frequency', fontweight='bold')
            plt.title('Test Set Predicted Probability Distribution', fontweight='bold')
            plt.legend()
            plt.grid(True, alpha=0.3)

            plt.subplot(1, 2, 2)
            # ROC curve
            from sklearn.metrics import roc_curve

            fpr, tpr, thresholds = roc_curve(y_test, y_test_pred_proba)
            plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC Curve (AUC = {test_roc_auc:.3f})')
            plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
            plt.xlabel('False Positive Rate', fontweight='bold')
            plt.ylabel('True Positive Rate', fontweight='bold')
            plt.title('ROC Curve', fontweight='bold')
            plt.legend(loc='lower right')
            plt.grid(True, alpha=0.3)

            plt.tight_layout()
            plt.show()

            # Final conclusions
            print("\n" + "=" * 80)
            print("🎯 Final Conclusions and Recommendations")
            print("=" * 80)
            print("✅ Model Performance Summary:")
            print(f"   - ROC-AUC: {test_roc_auc:.3f}")
            print(f"   - F1-Score: {test_f1_def:.4f}")
            print(f"   - Precision: {test_precision_def:.4f}")
            print(f"   - Recall: {test_recall_def:.4f}")
            print(f"   - Accuracy: {test_accuracy_def:.4f}")

            print("\n📋 Production Environment Recommendations:")
            print("1. Model is ready for deployment with default threshold (0.5)")
            print("2. Monitor model performance and data drift in production")
            print("3. Consider periodic retraining with new data")

            print(f"\n💡 Technical Notes:")
            print("- Model shows good discrimination ability with ROC-AUC performance")
            print("- Performance metrics indicate balanced performance across classes")
            print("- Confusion matrix shows model's prediction pattern")

            # Save model information
            model_info = {
                'model': rf_balanced_model,
                'performance': {
                    'test_accuracy': test_accuracy_def,
                    'test_precision': test_precision_def,
                    'test_recall': test_recall_def,
                    'test_f1': test_f1_def,
                    'test_roc_auc': test_roc_auc
                }
            }

            joblib.dump(model_info, 'rf_balanced_default.pkl')
            print(f"\n💾 Model information saved to: rf_balanced_default.pkl")

        except Exception as e:
            print(f"Error during evaluation: {e}")

except Exception as e:
    print(f"Error during model loading or evaluation: {e}")

print("\nEvaluation completed!")

In [None]:
import time
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Plot styling: start from matplotlib's defaults, then apply a bold,
# large-font "publication" look group by group.
plt.style.use('default')

# Typography
plt.rc('font', family='Arial', weight='bold', size=16)
plt.rc('legend', fontsize=15)
plt.rc(('xtick', 'ytick'), labelsize=15)

# Axes: bold titles/labels, thick frame, white background, ASCII minus sign
plt.rc('axes',
       labelweight='bold',
       titleweight='bold',
       linewidth=2.5,
       titlesize=18,
       labelsize=17,
       facecolor='white',
       unicode_minus=False)

# Line and grid weights
plt.rc('lines', linewidth=3.5)
plt.rc('grid', linewidth=1.2)

# Resolution: 1200 dpi both for on-screen rendering and for saved files
plt.rc('figure', dpi=1200, facecolor='white')
plt.rc('savefig', dpi=1200)

print("=== RF_balanced Model Evaluation (Default Threshold) ===")
print("=" * 50)

# ---------------------------------------------------------------------------
# Step 1 - obtain a fitted RF_balanced model.
#
# Preferred source: the bundle saved in 'rf_balanced_updated.pkl'.
# Fallback (file missing): take the 'RF_balanced' entry from `models_balanced`
# and fit it on the training data.
#
# The evaluation itself lives in Step 2 and is written only once, so both
# sources produce identical tables and figures.
# ---------------------------------------------------------------------------
model_ready = False       # becomes True once rf_balanced_model is usable
saved_model_extras = {}   # extra keys copied into the bundle saved in Step 2

try:
    # NOTE(review): joblib.load unpickles arbitrary Python objects; only load
    # files that were produced by this notebook / a trusted source.
    print("Loading saved RF_balanced model...")
    model_info = joblib.load('rf_balanced_updated.pkl')

    # Unpack the model and the class weights stored alongside it
    rf_balanced_model = model_info['model']
    class_weights = model_info['class_weights']
    saved_model_extras['class_weights'] = class_weights

    print("Model loaded successfully")
    print(f"Class weights used: {class_weights}")
    model_ready = True

except FileNotFoundError:
    print("Error: Saved RF_balanced model file 'rf_balanced_updated.pkl' not found")
    print("Please run the model creation and saving code first")

    # If the saved model is not found, fall back to the in-memory model list
    print("Attempting to use model from original models_balanced...")

    rf_balanced_model = None
    # Guard the whole fallback: an exception raised inside an `except` clause
    # is NOT handled by the sibling `except Exception` below, so without this
    # inner try a missing `models_balanced` would abort the cell.
    try:
        for name, model in models_balanced:
            if name == 'RF_balanced':
                rf_balanced_model = model
                break

        if rf_balanced_model is None:
            print("Error: RF_balanced model not found")
        else:
            # The fallback model is always (re)fitted on the training data
            rf_balanced_model.fit(X_train, y_train)
            model_ready = True
    except Exception as e:
        print(f"Error during evaluation: {e}")

except Exception as e:
    print(f"Error during model loading or evaluation: {e}")

# ---------------------------------------------------------------------------
# Step 2 - evaluate the model at the default 0.5 threshold, plot, and save.
# All results are kept as module-level variables so later cells can use them.
# ---------------------------------------------------------------------------
if model_ready:
    try:
        # Predicted probability of the positive class (column 1)
        y_train_pred_proba = rf_balanced_model.predict_proba(X_train)[:, 1]
        y_test_pred_proba = rf_balanced_model.predict_proba(X_test)[:, 1]

        # Hard labels at the default threshold (0.5)
        y_train_pred_default = (y_train_pred_proba >= 0.5).astype(int)
        y_test_pred_default = (y_test_pred_proba >= 0.5).astype(int)

        # Metrics at the default threshold; zero_division=0 makes a metric 0
        # instead of raising a warning when no positive label is predicted.
        train_accuracy_def = accuracy_score(y_train, y_train_pred_default)
        train_precision_def = precision_score(y_train, y_train_pred_default, zero_division=0)
        train_recall_def = recall_score(y_train, y_train_pred_default, zero_division=0)
        train_f1_def = f1_score(y_train, y_train_pred_default, zero_division=0)
        train_roc_auc = roc_auc_score(y_train, y_train_pred_proba)

        test_accuracy_def = accuracy_score(y_test, y_test_pred_default)
        test_precision_def = precision_score(y_test, y_test_pred_default, zero_division=0)
        test_recall_def = recall_score(y_test, y_test_pred_default, zero_division=0)
        test_f1_def = f1_score(y_test, y_test_pred_default, zero_division=0)
        test_roc_auc = roc_auc_score(y_test, y_test_pred_proba)

        # Results table: one row per dataset
        results_comparison = [
            {
                'Threshold': 'Default (0.5)',
                'Dataset': 'Training Set',
                'Accuracy': train_accuracy_def,
                'Precision': train_precision_def,
                'Recall': train_recall_def,
                'F1-Score': train_f1_def,
                'ROC-AUC': train_roc_auc,
            },
            {
                'Threshold': 'Default (0.5)',
                'Dataset': 'Test Set',
                'Accuracy': test_accuracy_def,
                'Precision': test_precision_def,
                'Recall': test_recall_def,
                'F1-Score': test_f1_def,
                'ROC-AUC': test_roc_auc,
            },
        ]
        results_df = pd.DataFrame(results_comparison)

        print("\n" + "=" * 80)
        print("Model Performance with Default Threshold (0.5)")
        print("=" * 80)
        print(results_df.round(4))

        # Confusion matrices (default threshold only)
        cm_train_default = confusion_matrix(y_train, y_train_pred_default)
        cm_test_default = confusion_matrix(y_test, y_test_pred_default)

        print("\nDefault Threshold(0.5) - Training Set Confusion Matrix:")
        print(cm_train_default)
        print("Default Threshold(0.5) - Test Set Confusion Matrix:")
        print(cm_test_default)

        # Per-class report on the test set (label 0 -> 'Normal', 1 -> 'Yarn Break')
        print("\nDefault Threshold(0.5) - Test Set Detailed Classification Report:")
        print(classification_report(y_test, y_test_pred_default, target_names=['Normal', 'Yarn Break']))

        # ---- Figure 1: side-by-side confusion-matrix heatmaps ----
        fig, axes = plt.subplots(1, 2, figsize=(15, 6))
        fig.suptitle('Model Performance with Default Threshold (0.5)', fontsize=16, fontweight='bold')

        confusion_panels = (
            (axes[0], cm_train_default, 'Training Set - Default Threshold(0.5)'),
            (axes[1], cm_test_default, 'Test Set - Default Threshold(0.5)'),
        )
        for panel_ax, panel_cm, panel_title in confusion_panels:
            sns.heatmap(panel_cm, annot=True, fmt='d', cmap='Blues', ax=panel_ax,
                        annot_kws={'size': 16, 'weight': 'bold'},  # large, bold cell counts
                        linewidths=2, linecolor='black',           # thick cell borders
                        cbar_kws={'shrink': 0.8})
            panel_ax.set_title(panel_title, fontsize=16, fontweight='bold', pad=20)
            panel_ax.set_xlabel('Predicted Label', fontsize=20, fontweight='bold')
            panel_ax.set_ylabel('True Label', fontsize=20, fontweight='bold')
            panel_ax.tick_params(axis='both', which='major', labelsize=18)

            # Thicken the outer frame of the panel
            for spine in panel_ax.spines.values():
                spine.set_linewidth(3)

        # Leave the top 4% of the canvas free for the suptitle
        plt.tight_layout(rect=[0, 0, 1, 0.96])
        plt.show()

        # ---- Figure 2: probability distribution (left) and ROC curve (right) ----
        plt.figure(figsize=(12, 6))

        plt.subplot(1, 2, 1)
        # Split test-set probabilities by true class
        break_proba = y_test_pred_proba[y_test == 1]
        normal_proba = y_test_pred_proba[y_test == 0]

        plt.hist(normal_proba, bins=50, alpha=0.7, label='Normal Events', color='green', edgecolor='black')
        plt.hist(break_proba, bins=20, alpha=0.7, label='Yarn Break Events', color='red', edgecolor='black')
        plt.axvline(x=0.5, color='orange', linestyle='--', linewidth=2, label='Default Threshold(0.5)')
        plt.xlabel('Predicted Probability', fontweight='bold')
        plt.ylabel('Frequency', fontweight='bold')
        plt.title('Test Set Predicted Probability Distribution', fontweight='bold')
        plt.legend()
        plt.grid(True, alpha=0.3)

        plt.subplot(1, 2, 2)
        # ROC curve
        from sklearn.metrics import roc_curve

        fpr, tpr, thresholds = roc_curve(y_test, y_test_pred_proba)
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC Curve (AUC = {test_roc_auc:.3f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
        plt.xlabel('False Positive Rate', fontweight='bold')
        plt.ylabel('True Positive Rate', fontweight='bold')
        plt.title('ROC Curve', fontweight='bold')
        plt.legend(loc='lower right')
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        # ---- Text summary ----
        # NOTE(review): the recommendation / technical-note lines below are
        # fixed text; they are printed regardless of the metric values above.
        print("\n" + "=" * 80)
        print("🎯 Final Conclusions and Recommendations")
        print("=" * 80)
        print("✅ Model Performance Summary:")
        print(f"   - ROC-AUC: {test_roc_auc:.3f}")
        print(f"   - F1-Score: {test_f1_def:.4f}")
        print(f"   - Precision: {test_precision_def:.4f}")
        print(f"   - Recall: {test_recall_def:.4f}")
        print(f"   - Accuracy: {test_accuracy_def:.4f}")

        print("\n📋 Production Environment Recommendations:")
        print("1. Model is ready for deployment with default threshold (0.5)")
        print("2. Monitor model performance and data drift in production")
        print("3. Consider periodic retraining with new data")

        print("\n💡 Technical Notes:")
        print("- Model shows good discrimination ability with ROC-AUC performance")
        print("- Performance metrics indicate balanced performance across classes")
        print("- Confusion matrix shows model's prediction pattern")

        # ---- Persist the model together with its test-set performance ----
        # 'class_weights' is included only when it was loaded from the saved
        # bundle in Step 1 (the fallback path has no such value).
        model_info = {
            'model': rf_balanced_model,
            **saved_model_extras,
            'performance': {
                'test_accuracy': test_accuracy_def,
                'test_precision': test_precision_def,
                'test_recall': test_recall_def,
                'test_f1': test_f1_def,
                'test_roc_auc': test_roc_auc,
            },
        }

        joblib.dump(model_info, 'rf_balanced_default.pkl')
        print("\n💾 Model information saved to: rf_balanced_default.pkl")

    except Exception as e:
        print(f"Error during evaluation: {e}")

print("\nEvaluation completed!")

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>19 |</span></b> <b>Hyperparameter Optimization</b></div>

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# import numpy as np
# import time
# import pandas as pd
#
# # 初始化列表存储分类指标
# accuracy_scores = []
# precision_scores = []
# recall_scores = []
# f1_scores = []
# roc_auc_scores = []
# execution_times = []
# model_names = []
#
# # 定义分类模型
# models = [
#     ('RF', RandomForestClassifier()),
#     ('XGBoost', XGBClassifier()),
#     ('LightGBM', LGBMClassifier())
# ]
#
# # 定义超参数网格
# param_grids = {
#     'RF': {'n_estimators': [10, 30, 50, 70, 100], 'max_depth': [None, 5, 10, 20], 'class_weight': [None, 'balanced']},
#     'XGBoost': {'n_estimators': [10, 30, 50, 70, 100], 'learning_rate': [0.005, 0.01, 0.05, 0.1], 'scale_pos_weight': [1, 10, 100]},
#     'LightGBM': {'n_estimators': [10, 30, 50, 70, 100], 'learning_rate': [0.005, 0.01, 0.05, 0.1], 'is_unbalance': [True, False]}
# }
#
# # 主循环
# for name, classifier in models:
#     start_time = time.time()
#
#     # 超参数调优
#     if param_grids.get(name):
#         grid_search = GridSearchCV(classifier, param_grid=param_grids[name], cv=5, scoring='roc_auc', n_jobs=-1)
#         grid_search.fit(X_train, y_train)
#         best_model = grid_search.best_estimator_
#     else:
#         best_model = classifier
#         best_model.fit(X_train, y_train)
#
#     # 预测
#     y_pred = best_model.predict(X_test)
#     y_pred_proba = best_model.predict_proba(X_test)[:, 1]
#
#     # 计算分类指标
#     accuracy = accuracy_score(y_test, y_pred)
#     precision = precision_score(y_test, y_pred)
#     recall = recall_score(y_test, y_pred)
#     f1 = f1_score(y_test, y_pred)
#     roc_auc = roc_auc_score(y_test, y_pred_proba)
#
#     accuracy_scores.append(accuracy)
#     precision_scores.append(precision)
#     recall_scores.append(recall)
#     f1_scores.append(f1)
#     roc_auc_scores.append(roc_auc)
#
#     # 记录执行时间
#     execution_time = time.time() - start_time
#     execution_times.append(execution_time)
#
#     # 记录模型名称
#     model_names.append(name)
#
# # 保存结果
# results_df = pd.DataFrame({
#     'Model': model_names,
#     'Accuracy': accuracy_scores,
#     'Precision': precision_scores,
#     'Recall': recall_scores,
#     'F1 Score': f1_scores,
#     'ROC-AUC': roc_auc_scores,
#     'Execution Time (s)': execution_times
# })
#
# results_df

In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import joblib

# RF_balanced tuning cell: cross-validates the saved model, runs a randomized
# hyper-parameter search, plots how each parameter affects F1, then refits,
# evaluates and saves a model built from the best parameters.
# Expects X_train, y_train, X_test and y_test to exist from earlier cells.
from sklearn.model_selection import cross_validate

# Configure a Chinese-capable font so the Chinese plot labels render.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

print("=== RF_balanced模型参数调优与性能评估 ===")
print("包含交叉验证和超参数优化")
print("=" * 50)

# Accumulators that are turned into DataFrames / CSV files further down.
metrics_results = []
execution_times = []
cv_results = []
param_analysis = []

try:
    # Load the previously saved RF_balanced bundle (model plus metadata).
    # NOTE(review): joblib.load unpickles arbitrary objects; only load files
    # written by this notebook.
    print("加载已保存的RF_balanced模型...")
    load_start_time = time.time()
    model_info = joblib.load('rf_balanced_updated.pkl')
    load_time = time.time() - load_start_time

    rf_balanced_model = model_info['model']
    class_weights = model_info['class_weights']
    train_break_count = model_info['train_break_count']
    train_break_ratio = model_info['train_break_ratio']
    feature_names = model_info['feature_names']

    print(f"模型加载完成，耗时: {load_time:.2f}秒")
    print(f"使用的类别权重: {class_weights}")
    print(f"训练集断纱事件: {train_break_count} (占比: {train_break_ratio:.2%})")

    # 1. Cross-validation of the loaded model
    print("\n=== 交叉验证评估 ===")
    cv_start_time = time.time()

    # Stratified folds keep the class ratio in every split.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scoring_metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    # cross_validate scores every metric from the same five fits; calling
    # cross_val_score once per metric refitted the model 25 times.
    cv_output = cross_validate(rf_balanced_model, X_train, y_train,
                               cv=cv, scoring=scoring_metrics, n_jobs=-1)

    cv_scores = {}
    for metric in scoring_metrics:
        scores = cv_output[f'test_{metric}']
        cv_scores[metric] = scores
        cv_results.append({
            'Metric': metric,
            'Mean_Score': np.mean(scores),
            'Std_Score': np.std(scores),
            'Scores': scores
        })

    cv_time = time.time() - cv_start_time
    print(f"交叉验证完成，耗时: {cv_time:.2f}秒")

    print("\n交叉验证结果 (5折):")
    cv_df = pd.DataFrame(cv_results)
    print(cv_df[['Metric', 'Mean_Score', 'Std_Score']].round(4))

    # 2. Hyper-parameter optimisation
    print("\n=== 超参数优化 ===")
    param_search_start = time.time()

    param_dist = {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Carry over the class weighting of the loaded model so that the tuned
    # model matches the 'class_weights' metadata saved with it. Falls back to
    # None (the previous behaviour) when the loaded object has no such setting.
    # NOTE(review): confirm the saved model was built with class_weight.
    base_class_weight = getattr(rf_balanced_model, 'class_weight', None)

    random_search = RandomizedSearchCV(
        RandomForestClassifier(random_state=42, class_weight=base_class_weight),
        param_distributions=param_dist,
        n_iter=50,  # number of sampled parameter combinations
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
        scoring='f1',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    print("开始随机搜索参数优化...")
    random_search.fit(X_train, y_train)

    param_search_time = time.time() - param_search_start
    print(f"参数优化完成，耗时: {param_search_time:.2f}秒")

    print("\n最佳参数组合:")
    best_params = random_search.best_params_
    for param, value in best_params.items():
        print(f"  {param}: {value}")
    print(f"最佳交叉验证分数 (F1): {random_search.best_score_:.4f}")

    # 3. Per-parameter influence on the CV score
    print("\n=== 参数影响分析 ===")

    results_df = pd.DataFrame(random_search.cv_results_)

    key_params = ['param_n_estimators', 'param_max_depth',
                  'param_min_samples_split', 'param_min_samples_leaf']

    for param in key_params:
        if param not in results_df.columns:
            continue

        # groupby drops missing keys by default, which silently removed
        # max_depth=None from the analysis; give it an explicit label instead.
        group_keys = results_df[param].map(lambda v: 'None' if pd.isna(v) else v)
        param_data = (
            results_df.groupby(group_keys, sort=False)['mean_test_score']
            .agg(['mean', 'std', 'count'])
            .reset_index()
            .sort_values('mean', ascending=False)
        )

        print(f"\n{param} 对性能的影响:")
        top_rows = param_data.head(10)
        for value, mean_score, std_score in zip(top_rows[param], top_rows['mean'], top_rows['std']):
            print(f"  {value}: {mean_score:.4f} ± {std_score:.4f}")

        # Keep every value of this parameter for the trend plots / CSV export.
        param_analysis.extend(
            {
                'Parameter': param.replace('param_', ''),
                'Value': value,
                'Mean_Score': mean_score,
                'Std_Score': std_score,
                'Count': count
            }
            for value, mean_score, std_score, count in zip(
                param_data[param], param_data['mean'], param_data['std'], param_data['count'])
        )

    # 4. Parameter trend plots (one subplot per parameter)
    print("\n=== 生成参数趋势可视化 ===")

    param_analysis_df = pd.DataFrame(param_analysis)

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    axes = axes.ravel()

    colors = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D']

    # Parameter name -> display title used in the subplot.
    param_display_names = {
        'n_estimators': '树的数量 (n_estimators)',
        'max_depth': '最大深度 (max_depth)',
        'min_samples_split': '分裂最小样本数 (min_samples_split)',
        'min_samples_leaf': '叶节点最小样本数 (min_samples_leaf)'
    }

    for i, (param_name, display_name) in enumerate(param_display_names.items()):
        param_data = param_analysis_df[param_analysis_df['Parameter'] == param_name]
        if param_data.empty:
            continue

        # Order numerically; the 'None' label (unlimited depth) goes last.
        param_data = param_data.copy()
        param_data['Sort_Key'] = param_data['Value'].map(
            lambda v: float('inf') if isinstance(v, str) else float(v))
        param_data = param_data.sort_values('Sort_Key').reset_index(drop=True)
        x_labels = [str(v) for v in param_data['Value']]

        # Trend line of the mean F1 per parameter value.
        x_positions = range(len(param_data))
        axes[i].plot(x_positions, param_data['Mean_Score'],
                     'o-', linewidth=3, markersize=8, color=colors[i],
                     label='平均F1分数', markerfacecolor='white', markeredgewidth=2)

        # Shaded band of +/- one standard deviation.
        axes[i].fill_between(x_positions,
                             param_data['Mean_Score'] - param_data['Std_Score'],
                             param_data['Mean_Score'] + param_data['Std_Score'],
                             alpha=0.2, color=colors[i], label='标准差范围')

        # Highlight the best value; after reset_index the label is the x position.
        best_idx = param_data['Mean_Score'].idxmax()
        best_x = int(best_idx)
        best_score = param_data.loc[best_idx, 'Mean_Score']
        best_value = param_data.loc[best_idx, 'Value']

        axes[i].axvline(x=best_x, color='red', linestyle='--', alpha=0.8, linewidth=2)
        axes[i].plot(best_x, best_score, 'o', markersize=10, color='red',
                     label=f'最佳值: {best_value} (F1={best_score:.3f})')

        axes[i].set_title(f'{display_name} 对模型性能的影响', fontsize=14, fontweight='bold', pad=20)
        axes[i].set_xlabel('参数值', fontsize=12)
        axes[i].set_ylabel('F1分数', fontsize=12)
        axes[i].set_xticks(x_positions)
        axes[i].set_xticklabels(x_labels, rotation=45)
        axes[i].legend(loc='lower right' if param_name in ['n_estimators', 'max_depth'] else 'upper right')
        axes[i].grid(True, alpha=0.3)

        # Zoom in on the data. The lower bound is clamped at 0 rather than 0.8:
        # a 0.8 floor hid (or inverted the axis for) F1 scores below 0.8.
        y_min = max(0.0, param_data['Mean_Score'].min() - 0.05)
        y_max = min(1.0, param_data['Mean_Score'].max() + 0.05)
        axes[i].set_ylim(y_min, y_max)

    plt.suptitle('随机森林参数对模型性能的影响趋势分析', fontsize=16, fontweight='bold', y=0.98)
    plt.tight_layout()
    plt.savefig('parameter_trend_analysis_detailed.png', dpi=300, bbox_inches='tight')
    print("详细参数趋势分析图已保存至: parameter_trend_analysis_detailed.png")

    # 5. Heat map of the best parameter combinations
    print("\n=== 生成参数组合热力图 ===")

    # Use the 20 best-scoring combinations.
    top_results = results_df.nlargest(20, 'mean_test_score')[['params', 'mean_test_score']].copy()

    # cv_results_['params'] already holds dicts; the former eval() call raised
    # TypeError here and aborted everything below. Copy so the search results
    # are not mutated.
    param_values_heatmap = []
    for params, score in zip(top_results['params'], top_results['mean_test_score']):
        params = dict(params)
        # Label max_depth as text so that None survives DataFrame/pivot handling.
        params['max_depth'] = 'None' if params.get('max_depth') is None else str(params['max_depth'])
        params['score'] = score
        param_values_heatmap.append(params)

    heatmap_df = pd.DataFrame(param_values_heatmap)

    # Untested combinations stay NaN (blank cells) instead of a misleading 0.
    heatmap_data = heatmap_df.pivot_table(
        index='n_estimators',
        columns='max_depth',
        values='score',
        aggfunc='mean'
    )
    depth_order = sorted(heatmap_data.columns,
                         key=lambda d: float('inf') if d == 'None' else float(d))
    heatmap_data = heatmap_data[depth_order]

    plt.figure(figsize=(10, 8))
    sns.heatmap(heatmap_data, annot=True, fmt='.3f', cmap='YlOrRd',
                cbar_kws={'label': 'F1分数'}, linewidths=0.5)
    plt.title('不同参数组合的性能热力图\n(n_estimators vs max_depth)', fontsize=14, fontweight='bold', pad=20)
    plt.xlabel('最大深度 (max_depth)', fontsize=12)
    plt.ylabel('树的数量 (n_estimators)', fontsize=12)
    plt.tight_layout()
    plt.savefig('parameter_heatmap.png', dpi=300, bbox_inches='tight')
    print("参数组合热力图已保存至: parameter_heatmap.png")

    # 6. Refit a model with the best parameters
    print("\n=== 使用最佳参数训练新模型 ===")
    training_start = time.time()

    optimized_rf_model = RandomForestClassifier(
        **best_params,
        class_weight=base_class_weight,
        random_state=42,
        n_jobs=-1
    )

    optimized_rf_model.fit(X_train, y_train)
    training_time = time.time() - training_start
    print(f"模型训练完成，耗时: {training_time:.2f}秒")

    # 7. Evaluate the optimised model
    print("\n=== 优化后模型性能评估 ===")
    predict_start_time = time.time()

    # Hard predictions on both splits.
    y_train_pred_opt = optimized_rf_model.predict(X_train)
    y_test_pred_opt = optimized_rf_model.predict(X_test)

    # Probability of the second class, used for ROC-AUC.
    y_train_pred_proba_opt = optimized_rf_model.predict_proba(X_train)[:, 1]
    y_test_pred_proba_opt = optimized_rf_model.predict_proba(X_test)[:, 1]

    predict_time = time.time() - predict_start_time
    print(f"预测完成，耗时: {predict_time:.2f}秒")

    # Training-set metrics
    train_accuracy_opt = accuracy_score(y_train, y_train_pred_opt)
    train_precision_opt = precision_score(y_train, y_train_pred_opt, zero_division=0)
    train_recall_opt = recall_score(y_train, y_train_pred_opt, zero_division=0)
    train_f1_opt = f1_score(y_train, y_train_pred_opt, zero_division=0)
    train_roc_auc_opt = roc_auc_score(y_train, y_train_pred_proba_opt)

    # Test-set metrics
    test_accuracy_opt = accuracy_score(y_test, y_test_pred_opt)
    test_precision_opt = precision_score(y_test, y_test_pred_opt, zero_division=0)
    test_recall_opt = recall_score(y_test, y_test_pred_opt, zero_division=0)
    test_f1_opt = f1_score(y_test, y_test_pred_opt, zero_division=0)
    test_roc_auc_opt = roc_auc_score(y_test, y_test_pred_proba_opt)

    # Confusion matrices
    cm_train_opt = confusion_matrix(y_train, y_train_pred_opt)
    cm_test_opt = confusion_matrix(y_test, y_test_pred_opt)

    total_time = load_time + cv_time + param_search_time + training_time + predict_time

    # Record the metrics of the optimised model.
    metrics_results.append({
        'Dataset': '优化后-训练集',
        'Accuracy': train_accuracy_opt,
        'Precision': train_precision_opt,
        'Recall': train_recall_opt,
        'F1-Score': train_f1_opt,
        'ROC-AUC': train_roc_auc_opt,
        'Parameters': str(best_params)
    })

    metrics_results.append({
        'Dataset': '优化后-测试集',
        'Accuracy': test_accuracy_opt,
        'Precision': test_precision_opt,
        'Recall': test_recall_opt,
        'F1-Score': test_f1_opt,
        'ROC-AUC': test_roc_auc_opt,
        'Parameters': str(best_params)
    })

    execution_times.append({
        '模型加载时间': load_time,
        '交叉验证时间': cv_time,
        '参数搜索时间': param_search_time,
        '模型训练时间': training_time,
        '预测时间': predict_time,
        '总时间': total_time
    })

    # Print the evaluation of the optimised model.
    print("\n优化后RF_balanced模型评估结果:")
    print("训练集性能:")
    print(f"  准确率: {train_accuracy_opt:.4f}")
    print(f"  精确率: {train_precision_opt:.4f}")
    print(f"  召回率: {train_recall_opt:.4f}")
    print(f"  F1分数: {train_f1_opt:.4f}")
    print(f"  ROC-AUC: {train_roc_auc_opt:.4f}")

    print("\n测试集性能:")
    print(f"  准确率: {test_accuracy_opt:.4f}")
    print(f"  精确率: {test_precision_opt:.4f}")
    print(f"  召回率: {test_recall_opt:.4f}")
    print(f"  F1分数: {test_f1_opt:.4f}")
    print(f"  ROC-AUC: {test_roc_auc_opt:.4f}")

    print("\n混淆矩阵:")
    print("训练集混淆矩阵:")
    print(cm_train_opt)
    print("测试集混淆矩阵:")
    print(cm_test_opt)

    print("\n详细分类报告 - 测试集:")
    print(classification_report(y_test, y_test_pred_opt, target_names=['正常', '断纱']))

    # 8. Persist the optimised model together with its metadata
    print("\n=== 保存优化后的模型 ===")
    optimized_model_info = {
        'model': optimized_rf_model,
        'best_params': best_params,
        'class_weights': class_weights,
        'train_break_count': train_break_count,
        'train_break_ratio': train_break_ratio,
        'feature_names': feature_names,
        'cv_results': cv_results,
        'param_analysis': param_analysis,
        'optimization_time': param_search_time
    }

    joblib.dump(optimized_model_info, 'rf_balanced_optimized.pkl')
    print("优化后的模型已保存至: rf_balanced_optimized.pkl")

    # 9. Final summary
    print("\n" + "=" * 80)
    print("RF_balanced模型优化总结")
    print("=" * 80)

    results_df_final = pd.DataFrame(metrics_results)
    print("\n模型性能总结:")
    print(results_df_final.round(4))

    time_df = pd.DataFrame(execution_times)
    print("\n时间统计:")
    print(time_df.round(2))

    # Export every result table to CSV.
    results_df_final.to_csv('RF_balanced_optimized_performance.csv', index=False)
    time_df.to_csv('RF_balanced_optimized_timing.csv', index=False)
    cv_df.to_csv('RF_balanced_cross_validation.csv', index=False)
    param_analysis_df.to_csv('RF_balanced_parameter_analysis.csv', index=False)

    print(f"\n所有结果已保存至CSV文件")

    # Feature importances of the optimised model
    print("\n=== 优化后模型特征重要性分析 ===")
    feature_importances_opt = optimized_rf_model.feature_importances_

    importance_df_opt = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances_opt
    }).sort_values('Importance', ascending=False)

    print("前10个最重要的特征:")
    print(importance_df_opt.head(10).round(4))

    importance_df_opt.to_csv('RF_balanced_optimized_feature_importance.csv', index=False)
    print(f"特征重要性已保存至: RF_balanced_optimized_feature_importance.csv")

    print("\n优化流程完成！")
    print(f"总执行时间: {total_time:.2f}秒")

except FileNotFoundError:
    print("错误: 未找到保存的RF_balanced模型文件 'rf_balanced_updated.pkl'")
    print("请先运行模型创建和保存代码")
except Exception as e:
    # Top-level boundary of the cell: report the failure with its traceback.
    print(f"模型加载或评估过程中出错: {e}")
    import traceback
    traceback.print_exc()
else:
    # Only claim completion when nothing above failed.
    print("\n所有评估和优化完成！")

In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import joblib

# RF_balanced tuning cell (detailed variant): per-fold cross-validation report,
# randomized hyper-parameter search, confusion-matrix and parameter-trend
# figures, and persistence of the model refitted with the best parameters.
# Expects X_train, y_train, X_test and y_test to exist from earlier cells.
from sklearn.model_selection import cross_validate

print("=== RF_balanced模型参数调优与性能评估 ===")
print("包含交叉验证和超参数优化")
print("=" * 50)

# Plot styling. plt.style.use('default') restores the default rcParams, so it
# must run before any font setup; previously it (together with
# 'font.family': 'Arial') discarded the SimHei configuration and the seaborn
# style set just above it, leaving the Chinese labels of this cell without
# glyphs.
plt.style.use('default')
plt.rcParams.update({
    # SimHei first for the Chinese labels, Arial / DejaVu Sans as fallbacks.
    'font.family': 'sans-serif',
    'font.sans-serif': ['SimHei', 'Arial', 'DejaVu Sans'],
    'font.weight': 'bold',
    'axes.labelweight': 'bold',
    'axes.titleweight': 'bold',
    'axes.linewidth': 2.5,
    'font.size': 16,
    'axes.titlesize': 18,
    'axes.labelsize': 17,
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
    'legend.fontsize': 15,
    'grid.linewidth': 1.2,
    'lines.linewidth': 3.5,
    # On-screen resolution only. At 1200 dpi a 15x12 in figure is rasterised
    # at 18000x14400 px; files are written with an explicit dpi in savefig.
    'figure.dpi': 100,
    'savefig.dpi': 1200,
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'axes.unicode_minus': False
})

# Accumulators that are turned into DataFrames / CSV files further down.
metrics_results = []
execution_times = []
cv_results = []
param_analysis = []

try:
    # Load the previously saved RF_balanced bundle (model plus metadata).
    # NOTE(review): joblib.load unpickles arbitrary objects; only load files
    # written by this notebook.
    print("加载已保存的RF_balanced模型...")
    load_start_time = time.time()
    model_info = joblib.load('rf_balanced_updated.pkl')
    load_time = time.time() - load_start_time

    rf_balanced_model = model_info['model']
    class_weights = model_info['class_weights']
    train_break_count = model_info['train_break_count']
    train_break_ratio = model_info['train_break_ratio']
    feature_names = model_info['feature_names']

    print(f"模型加载完成，耗时: {load_time:.2f}秒")
    print(f"使用的类别权重: {class_weights}")
    print(f"训练集断纱事件: {train_break_count} (占比: {train_break_ratio:.2%})")

    # 1. Cross-validation of the loaded model, reporting every metric
    print("\n=== 交叉验证评估 ===")
    cv_start_time = time.time()

    # Stratified folds keep the class ratio in every split.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Display name -> scikit-learn scorer name.
    scoring_metrics = {
        'accuracy': 'accuracy',
        'precision': 'precision',
        'recall': 'recall',
        'f1': 'f1',
        'roc_auc': 'roc_auc'
    }

    cv_scores = {}
    cv_detailed_results = []

    print("进行5折交叉验证，评估多个指标...")
    # cross_validate scores every metric from the same five fits; calling
    # cross_val_score once per metric refitted the model 25 times.
    cv_output = cross_validate(rf_balanced_model, X_train, y_train,
                               cv=cv, scoring=list(scoring_metrics.values()), n_jobs=-1)

    for metric_name, metric_scorer in scoring_metrics.items():
        scores = cv_output[f'test_{metric_scorer}']
        cv_scores[metric_name] = scores

        # One row per (fold, metric) for the detailed table.
        for fold_idx, score in enumerate(scores):
            cv_detailed_results.append({
                'Fold': fold_idx + 1,
                'Metric': metric_name,
                'Score': score
            })

        # Summary statistics of this metric across folds.
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        cv_results.append({
            'Metric': metric_name,
            'Mean_Score': mean_score,
            'Std_Score': std_score,
            'Scores': scores
        })

        print(f"  {metric_name}: {mean_score:.4f} ± {std_score:.4f}")

    cv_time = time.time() - cv_start_time
    print(f"交叉验证完成，耗时: {cv_time:.2f}秒")

    cv_detailed_df = pd.DataFrame(cv_detailed_results)

    # Per-fold table: folds as rows, metrics as columns.
    print("\n交叉验证详细结果 (5折):")
    pivot_cv = cv_detailed_df.pivot_table(index='Fold', columns='Metric', values='Score')
    print(pivot_cv.round(4))

    print("\n交叉验证总体统计:")
    cv_df = pd.DataFrame(cv_results)
    print(cv_df[['Metric', 'Mean_Score', 'Std_Score']].round(4))

    # 2. Hyper-parameter optimisation
    print("\n=== 超参数优化 ===")
    param_search_start = time.time()

    param_dist = {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Carry over the class weighting of the loaded model so that the tuned
    # model matches the 'class_weights' metadata saved with it. Falls back to
    # None (the previous behaviour) when the loaded object has no such setting.
    # NOTE(review): confirm the saved model was built with class_weight.
    base_class_weight = getattr(rf_balanced_model, 'class_weight', None)

    random_search = RandomizedSearchCV(
        RandomForestClassifier(random_state=42, class_weight=base_class_weight),
        param_distributions=param_dist,
        n_iter=50,  # number of sampled parameter combinations
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
        scoring='f1',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    print("开始随机搜索参数优化...")
    random_search.fit(X_train, y_train)

    param_search_time = time.time() - param_search_start
    print(f"参数优化完成，耗时: {param_search_time:.2f}秒")

    print("\n最佳参数组合:")
    best_params = random_search.best_params_
    for param, value in best_params.items():
        print(f"  {param}: {value}")
    print(f"最佳交叉验证分数 (F1): {random_search.best_score_:.4f}")

    # 3. Per-parameter influence on the CV score
    print("\n=== 参数影响分析 ===")

    results_df = pd.DataFrame(random_search.cv_results_)

    key_params = ['param_n_estimators', 'param_max_depth',
                  'param_min_samples_split', 'param_min_samples_leaf']

    for param in key_params:
        if param not in results_df.columns:
            continue

        # groupby drops missing keys by default, which silently removed
        # max_depth=None from the analysis; give it an explicit label instead.
        group_keys = results_df[param].map(lambda v: 'None' if pd.isna(v) else v)
        param_data = (
            results_df.groupby(group_keys, sort=False)['mean_test_score']
            .agg(['mean', 'std', 'count'])
            .reset_index()
            .sort_values('mean', ascending=False)
        )

        print(f"\n{param} 对性能的影响:")
        top_rows = param_data.head(10)
        for value, mean_score, std_score in zip(top_rows[param], top_rows['mean'], top_rows['std']):
            print(f"  {value}: {mean_score:.4f} ± {std_score:.4f}")

        # Keep every value of this parameter for the trend plots / CSV export.
        param_analysis.extend(
            {
                'Parameter': param.replace('param_', ''),
                'Value': value,
                'Mean_Score': mean_score,
                'Std_Score': std_score,
                'Count': count
            }
            for value, mean_score, std_score, count in zip(
                param_data[param], param_data['mean'], param_data['std'], param_data['count'])
        )

    # 4. Refit a model with the best parameters
    print("\n=== 使用最佳参数训练新模型 ===")
    training_start = time.time()

    optimized_rf_model = RandomForestClassifier(
        **best_params,
        class_weight=base_class_weight,
        random_state=42,
        n_jobs=-1
    )

    optimized_rf_model.fit(X_train, y_train)
    training_time = time.time() - training_start
    print(f"模型训练完成，耗时: {training_time:.2f}秒")

    # 5. Evaluate the optimised model
    print("\n=== 优化后模型性能评估 ===")
    predict_start_time = time.time()

    # Hard predictions on both splits.
    y_train_pred_opt = optimized_rf_model.predict(X_train)
    y_test_pred_opt = optimized_rf_model.predict(X_test)

    # Probability of the second class, used for ROC-AUC.
    y_train_pred_proba_opt = optimized_rf_model.predict_proba(X_train)[:, 1]
    y_test_pred_proba_opt = optimized_rf_model.predict_proba(X_test)[:, 1]

    predict_time = time.time() - predict_start_time
    print(f"预测完成，耗时: {predict_time:.2f}秒")

    # Training-set metrics
    train_accuracy_opt = accuracy_score(y_train, y_train_pred_opt)
    train_precision_opt = precision_score(y_train, y_train_pred_opt, zero_division=0)
    train_recall_opt = recall_score(y_train, y_train_pred_opt, zero_division=0)
    train_f1_opt = f1_score(y_train, y_train_pred_opt, zero_division=0)
    train_roc_auc_opt = roc_auc_score(y_train, y_train_pred_proba_opt)

    # Test-set metrics
    test_accuracy_opt = accuracy_score(y_test, y_test_pred_opt)
    test_precision_opt = precision_score(y_test, y_test_pred_opt, zero_division=0)
    test_recall_opt = recall_score(y_test, y_test_pred_opt, zero_division=0)
    test_f1_opt = f1_score(y_test, y_test_pred_opt, zero_division=0)
    test_roc_auc_opt = roc_auc_score(y_test, y_test_pred_proba_opt)

    # Confusion matrices
    cm_train_opt = confusion_matrix(y_train, y_train_pred_opt)
    cm_test_opt = confusion_matrix(y_test, y_test_pred_opt)

    total_time = load_time + cv_time + param_search_time + training_time + predict_time

    # 6. Confusion-matrix figures
    print("\n=== 生成混淆矩阵可视化 ===")

    # Training-set confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm_train_opt, annot=True, fmt='d', cmap='Blues',
                xticklabels=['正常', '断纱'],
                yticklabels=['正常', '断纱'])
    plt.title('训练集混淆矩阵 - 优化后RF_balanced模型', fontsize=16, fontweight='bold', pad=20)
    plt.xlabel('预测标签', fontsize=12)
    plt.ylabel('真实标签', fontsize=12)
    plt.tight_layout()
    plt.savefig('training_confusion_matrix.png', dpi=300, bbox_inches='tight')
    print("训练集混淆矩阵已保存至: training_confusion_matrix.png")

    # Test-set confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm_test_opt, annot=True, fmt='d', cmap='Blues',
                xticklabels=['正常', '断纱'],
                yticklabels=['正常', '断纱'])
    plt.title('测试集混淆矩阵 - 优化后RF_balanced模型', fontsize=16, fontweight='bold', pad=20)
    plt.xlabel('预测标签', fontsize=12)
    plt.ylabel('真实标签', fontsize=12)
    plt.tight_layout()
    plt.savefig('test_confusion_matrix.png', dpi=300, bbox_inches='tight')
    print("测试集混淆矩阵已保存至: test_confusion_matrix.png")

    # 7. Record the metrics of the optimised model
    metrics_results.append({
        'Dataset': '优化后-训练集',
        'Accuracy': train_accuracy_opt,
        'Precision': train_precision_opt,
        'Recall': train_recall_opt,
        'F1-Score': train_f1_opt,
        'ROC-AUC': train_roc_auc_opt,
        'Parameters': str(best_params)
    })

    metrics_results.append({
        'Dataset': '优化后-测试集',
        'Accuracy': test_accuracy_opt,
        'Precision': test_precision_opt,
        'Recall': test_recall_opt,
        'F1-Score': test_f1_opt,
        'ROC-AUC': test_roc_auc_opt,
        'Parameters': str(best_params)
    })

    execution_times.append({
        '模型加载时间': load_time,
        '交叉验证时间': cv_time,
        '参数搜索时间': param_search_time,
        '模型训练时间': training_time,
        '预测时间': predict_time,
        '总时间': total_time
    })

    # 8. Print the evaluation of the optimised model
    print("\n优化后RF_balanced模型评估结果:")
    print("训练集性能:")
    print(f"  准确率: {train_accuracy_opt:.4f}")
    print(f"  精确率: {train_precision_opt:.4f}")
    print(f"  召回率: {train_recall_opt:.4f}")
    print(f"  F1分数: {train_f1_opt:.4f}")
    print(f"  ROC-AUC: {train_roc_auc_opt:.4f}")

    print("\n测试集性能:")
    print(f"  准确率: {test_accuracy_opt:.4f}")
    print(f"  精确率: {test_precision_opt:.4f}")
    print(f"  召回率: {test_recall_opt:.4f}")
    print(f"  F1分数: {test_f1_opt:.4f}")
    print(f"  ROC-AUC: {test_roc_auc_opt:.4f}")

    print("\n混淆矩阵:")
    print("训练集混淆矩阵:")
    print(cm_train_opt)
    print("测试集混淆矩阵:")
    print(cm_test_opt)

    print("\n详细分类报告 - 训练集:")
    print(classification_report(y_train, y_train_pred_opt, target_names=['正常', '断纱']))

    print("\n详细分类报告 - 测试集:")
    print(classification_report(y_test, y_test_pred_opt, target_names=['正常', '断纱']))

    # 9. Persist the optimised model together with its metadata
    print("\n=== 保存优化后的模型 ===")
    optimized_model_info = {
        'model': optimized_rf_model,
        'best_params': best_params,
        'class_weights': class_weights,
        'train_break_count': train_break_count,
        'train_break_ratio': train_break_ratio,
        'feature_names': feature_names,
        'cv_results': cv_results,
        'param_analysis': param_analysis,
        'optimization_time': param_search_time
    }

    # Update the model file in place.
    # NOTE(review): this overwrites the very file loaded at the top of the
    # cell, so a re-run cross-validates the already optimised model instead of
    # the original one - confirm that this is intended.
    joblib.dump(optimized_model_info, 'rf_balanced_updated.pkl')
    print("优化后的模型已更新至: rf_balanced_updated.pkl")

    # Also keep a separate copy of the optimised bundle.
    joblib.dump(optimized_model_info, 'rf_balanced_optimized.pkl')
    print("优化后的模型副本已保存至: rf_balanced_optimized.pkl")

    # 10. Parameter trend plots (one subplot per parameter)
    print("\n=== 生成参数趋势可视化 ===")

    param_analysis_df = pd.DataFrame(param_analysis)

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    axes = axes.ravel()

    colors = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D']

    # Parameter name -> display title used in the subplot.
    param_display_names = {
        'n_estimators': '树的数量 (n_estimators)',
        'max_depth': '最大深度 (max_depth)',
        'min_samples_split': '分裂最小样本数 (min_samples_split)',
        'min_samples_leaf': '叶节点最小样本数 (min_samples_leaf)'
    }

    for i, (param_name, display_name) in enumerate(param_display_names.items()):
        param_data = param_analysis_df[param_analysis_df['Parameter'] == param_name]
        if param_data.empty:
            continue

        # Order numerically; the 'None' label (unlimited depth) goes last.
        param_data = param_data.copy()
        param_data['Sort_Key'] = param_data['Value'].map(
            lambda v: float('inf') if isinstance(v, str) else float(v))
        param_data = param_data.sort_values('Sort_Key').reset_index(drop=True)
        x_labels = [str(v) for v in param_data['Value']]

        # Trend line of the mean F1 per parameter value.
        x_positions = range(len(param_data))
        axes[i].plot(x_positions, param_data['Mean_Score'],
                     'o-', linewidth=3, markersize=8, color=colors[i],
                     label='平均F1分数', markerfacecolor='white', markeredgewidth=2)

        # Shaded band of +/- one standard deviation.
        axes[i].fill_between(x_positions,
                             param_data['Mean_Score'] - param_data['Std_Score'],
                             param_data['Mean_Score'] + param_data['Std_Score'],
                             alpha=0.2, color=colors[i], label='标准差范围')

        # Highlight the best value; after reset_index the label is the x position.
        best_idx = param_data['Mean_Score'].idxmax()
        best_x = int(best_idx)
        best_score = param_data.loc[best_idx, 'Mean_Score']
        best_value = param_data.loc[best_idx, 'Value']

        axes[i].axvline(x=best_x, color='red', linestyle='--', alpha=0.8, linewidth=2)
        axes[i].plot(best_x, best_score, 'o', markersize=10, color='red',
                     label=f'最佳值: {best_value} (F1={best_score:.3f})')

        axes[i].set_title(f'{display_name} 对模型性能的影响', fontsize=14, fontweight='bold', pad=20)
        axes[i].set_xlabel('参数值', fontsize=12)
        axes[i].set_ylabel('F1分数', fontsize=12)
        axes[i].set_xticks(x_positions)
        axes[i].set_xticklabels(x_labels, rotation=45)
        axes[i].legend(loc='lower right' if param_name in ['n_estimators', 'max_depth'] else 'upper right')
        axes[i].grid(True, alpha=0.3)

        # Zoom in on the data. The lower bound is clamped at 0 rather than 0.8:
        # a 0.8 floor hid (or inverted the axis for) F1 scores below 0.8.
        y_min = max(0.0, param_data['Mean_Score'].min() - 0.05)
        y_max = min(1.0, param_data['Mean_Score'].max() + 0.05)
        axes[i].set_ylim(y_min, y_max)

    plt.suptitle('随机森林参数对模型性能的影响趋势分析', fontsize=16, fontweight='bold', y=0.98)
    plt.tight_layout()
    plt.savefig('parameter_trend_analysis_detailed.png', dpi=300, bbox_inches='tight')
    print("详细参数趋势分析图已保存至: parameter_trend_analysis_detailed.png")

    # 11. Final summary
    print("\n" + "=" * 80)
    print("RF_balanced模型优化总结")
    print("=" * 80)

    results_df_final = pd.DataFrame(metrics_results)
    print("\n模型性能总结:")
    print(results_df_final.round(4))

    time_df = pd.DataFrame(execution_times)
    print("\n时间统计:")
    print(time_df.round(2))

    # Export every result table to CSV.
    results_df_final.to_csv('RF_balanced_optimized_performance.csv', index=False)
    time_df.to_csv('RF_balanced_optimized_timing.csv', index=False)
    cv_df.to_csv('RF_balanced_cross_validation.csv', index=False)
    param_analysis_df.to_csv('RF_balanced_parameter_analysis.csv', index=False)
    cv_detailed_df.to_csv('RF_balanced_cross_validation_detailed.csv', index=False)

    print(f"\n所有结果已保存至CSV文件")

    # Feature importances of the optimised model
    print("\n=== 优化后模型特征重要性分析 ===")
    feature_importances_opt = optimized_rf_model.feature_importances_

    importance_df_opt = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances_opt
    }).sort_values('Importance', ascending=False)

    print("前10个最重要的特征:")
    print(importance_df_opt.head(10).round(4))

    importance_df_opt.to_csv('RF_balanced_optimized_feature_importance.csv', index=False)
    print(f"特征重要性已保存至: RF_balanced_optimized_feature_importance.csv")

    print("\n优化流程完成！")
    print(f"总执行时间: {total_time:.2f}秒")

except FileNotFoundError:
    print("错误: 未找到保存的RF_balanced模型文件 'rf_balanced_updated.pkl'")
    print("请先运行模型创建和保存代码")
except Exception as e:
    # Top-level boundary of the cell: report the failure with its traceback.
    print(f"模型加载或评估过程中出错: {e}")
    import traceback
    traceback.print_exc()
else:
    # Only claim completion when nothing above failed.
    print("\n所有评估和优化完成！")

In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import joblib

# Plot styling - professional English-language figure format.
# Start from matplotlib's default style, then layer the overrides by theme.
plt.style.use('default')

# Typography: bold Arial text with explicit sizes for every text element
plt.rcParams.update({
    'font.family': 'Arial',
    'font.weight': 'bold',
    'font.size': 16,
    'axes.labelweight': 'bold',
    'axes.titleweight': 'bold',
    'axes.titlesize': 18,
    'axes.labelsize': 17,
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
    'legend.fontsize': 15,
})

# Stroke widths for axes frames, grid lines and plotted lines
plt.rcParams.update({
    'axes.linewidth': 2.5,
    'grid.linewidth': 1.2,
    'lines.linewidth': 3.5,
})

# Canvas: 1200 dpi for both on-screen and saved figures, white backgrounds
plt.rcParams.update({
    'figure.dpi': 1200,
    'savefig.dpi': 1200,
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'axes.unicode_minus': False,
})

# Announce what this cell is about to run
for banner_line in (
    "=== RF_balanced Model Hyperparameter Tuning and Performance Evaluation ===",
    "Includes Cross-Validation and Hyperparameter Optimization",
    "=" * 50,
):
    print(banner_line)

# Fresh, independent accumulators for the results gathered below
metrics_results, execution_times, cv_results, param_analysis = [], [], [], []

try:
    # Load the previously saved updated RF_balanced model
    print("Loading saved RF_balanced model...")
    load_start_time = time.time()
    model_info = joblib.load('rf_balanced_updated.pkl')
    load_time = time.time() - load_start_time

    # Get model and related information
    rf_balanced_model = model_info['model']
    class_weights = model_info['class_weights']
    train_break_count = model_info['train_break_count']
    train_break_ratio = model_info['train_break_ratio']
    feature_names = model_info['feature_names']

    print(f"Model loaded successfully, time: {load_time:.2f} seconds")
    print(f"Class weights used: {class_weights}")
    print(f"Training set yarn break events: {train_break_count} (ratio: {train_break_ratio:.2%})")

    # 1. Cross-validation evaluation - Output all metrics
    print("\n=== Cross-Validation Evaluation ===")
    cv_start_time = time.time()

    # Imported here so this section does not depend on the cell's import list.
    # cross_validate scores every metric from a single fit per fold, whereas
    # calling cross_val_score once per metric refits the forest for each metric.
    from sklearn.model_selection import cross_validate

    # Stratified K-fold with a fixed seed so the splits are reproducible
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Metric name -> scikit-learn scorer string
    scoring_metrics = {
        'accuracy': 'accuracy',
        'precision': 'precision',
        'recall': 'recall',
        'f1': 'f1',
        'roc_auc': 'roc_auc'
    }

    print("Performing 5-fold cross-validation, evaluating multiple metrics...")
    multi_metric_scores = cross_validate(
        rf_balanced_model, X_train, y_train,
        cv=cv, scoring=scoring_metrics, n_jobs=-1
    )

    cv_scores = {}
    cv_detailed_results = []

    for metric_name in scoring_metrics:
        # cross_validate reports each metric's per-fold scores under 'test_<name>'
        scores = multi_metric_scores[f'test_{metric_name}']
        cv_scores[metric_name] = scores

        # One long-format row per (fold, metric) pair; folds are numbered from 1
        cv_detailed_results.extend(
            {'Fold': fold_number, 'Metric': metric_name, 'Score': fold_score}
            for fold_number, fold_score in enumerate(scores, start=1)
        )

        # Summary statistics across folds for this metric
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        cv_results.append({
            'Metric': metric_name,
            'Mean_Score': mean_score,
            'Std_Score': std_score,
            'Scores': scores
        })

        print(f"  {metric_name}: {mean_score:.4f} ± {std_score:.4f}")

    cv_time = time.time() - cv_start_time
    print(f"Cross-validation completed, time: {cv_time:.2f} seconds")

    # Long-format table of every fold/metric score
    cv_detailed_df = pd.DataFrame(cv_detailed_results)

    # Fold-by-metric view of the same scores
    print("\nCross-validation detailed results (5 folds):")
    pivot_cv = cv_detailed_df.pivot_table(index='Fold', columns='Metric', values='Score')
    print(pivot_cv.round(4))

    # Mean and standard deviation per metric
    print("\nCross-validation overall statistics:")
    cv_df = pd.DataFrame(cv_results)
    print(cv_df[['Metric', 'Mean_Score', 'Std_Score']].round(4))

    # 2. Hyperparameter optimization
    print("\n=== Hyperparameter Optimization ===")
    param_search_start = time.time()

    # Carry the loaded model's class_weight setting into the search and the
    # retrained model. Without this the "optimized RF_balanced" model would be
    # an unweighted forest, even though the saved bundle still records
    # class_weights for it. Falls back to None (unweighted) if the loaded
    # estimator exposes no such attribute.
    balanced_class_weight = getattr(rf_balanced_model, 'class_weight', None)

    # Candidate values sampled by the randomized search
    param_dist = {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Randomized search scored by F1 on 3 stratified folds
    random_search = RandomizedSearchCV(
        RandomForestClassifier(class_weight=balanced_class_weight, random_state=42),
        param_distributions=param_dist,
        n_iter=50,  # Number of sampled parameter combinations
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
        scoring='f1',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    print("Starting randomized search parameter optimization...")
    random_search.fit(X_train, y_train)

    param_search_time = time.time() - param_search_start
    print(f"Parameter optimization completed, time: {param_search_time:.2f} seconds")

    # Output best parameters
    print("\nBest parameter combination:")
    best_params = random_search.best_params_
    for param, value in best_params.items():
        print(f"  {param}: {value}")
    print(f"Best cross-validation score (F1): {random_search.best_score_:.4f}")

    # 3. Parameter impact analysis
    print("\n=== Parameter Impact Analysis ===")

    # One row per sampled candidate, with its mean test score
    results_df = pd.DataFrame(random_search.cv_results_)

    key_params = ['param_n_estimators', 'param_max_depth',
                 'param_min_samples_split', 'param_min_samples_leaf']

    def _is_unset(candidate_value):
        """Return True for a scalar null (None/NaN), i.e. an unset parameter."""
        return not hasattr(candidate_value, '__len__') and pd.isna(candidate_value)

    for param in key_params:
        if param not in results_df.columns:
            continue

        # dropna=False keeps the candidates whose value is None (max_depth=None);
        # groupby excludes null keys by default, which would hide them entirely
        param_data = (
            results_df.groupby(param, dropna=False)['mean_test_score']
            .agg(['mean', 'std', 'count'])
            .reset_index()
            .sort_values('mean', ascending=False)
        )

        print(f"\nImpact of {param} on performance:")
        for _, row in param_data.head(10).iterrows():
            param_value = row[param]
            if hasattr(param_value, '__len__') and not isinstance(param_value, str):
                param_value = str(param_value)
            elif _is_unset(param_value):
                param_value = 'None'
            print(f"  {param_value}: {row['mean']:.4f} ± {row['std']:.4f}")

        # Store parameter analysis results (null group keys recorded as None)
        param_analysis.extend([
            {
                'Parameter': param.replace('param_', ''),
                'Value': None if _is_unset(row[param]) else row[param],
                'Mean_Score': row['mean'],
                'Std_Score': row['std'],
                'Count': row['count']
            } for _, row in param_data.iterrows()
        ])

    # 4. Train new model with best parameters
    print("\n=== Training New Model with Best Parameters ===")
    training_start = time.time()

    # Same class weighting as the search, now with the winning parameters
    optimized_rf_model = RandomForestClassifier(
        **best_params,
        class_weight=balanced_class_weight,
        random_state=42,
        n_jobs=-1
    )

    # Train the model
    optimized_rf_model.fit(X_train, y_train)
    training_time = time.time() - training_start
    print(f"Model training completed, time: {training_time:.2f} seconds")

    # 5. Evaluate the optimized model
    print("\n=== Optimized Model Performance Evaluation ===")
    predict_start_time = time.time()

    # Hard class predictions: training split first, then test split
    y_train_pred_opt, y_test_pred_opt = (
        optimized_rf_model.predict(split_features)
        for split_features in (X_train, X_test)
    )

    # Predicted probability of the positive class (column 1) for each split
    y_train_pred_proba_opt, y_test_pred_proba_opt = (
        optimized_rf_model.predict_proba(split_features)[:, 1]
        for split_features in (X_train, X_test)
    )

    predict_time = time.time() - predict_start_time
    print(f"Prediction completed, time: {predict_time:.2f} seconds")

    # Training set metrics; zero_division=0 yields 0 instead of a warning
    # when a ratio's denominator is empty
    train_accuracy_opt = accuracy_score(y_train, y_train_pred_opt)
    train_precision_opt, train_recall_opt, train_f1_opt = (
        label_scorer(y_train, y_train_pred_opt, zero_division=0)
        for label_scorer in (precision_score, recall_score, f1_score)
    )
    train_roc_auc_opt = roc_auc_score(y_train, y_train_pred_proba_opt)

    # Test set metrics, computed the same way
    test_accuracy_opt = accuracy_score(y_test, y_test_pred_opt)
    test_precision_opt, test_recall_opt, test_f1_opt = (
        label_scorer(y_test, y_test_pred_opt, zero_division=0)
        for label_scorer in (precision_score, recall_score, f1_score)
    )
    test_roc_auc_opt = roc_auc_score(y_test, y_test_pred_proba_opt)

    # Confusion matrices for both splits
    cm_train_opt = confusion_matrix(y_train, y_train_pred_opt)
    cm_test_opt = confusion_matrix(y_test, y_test_pred_opt)

    # Total of the timed stages (plotting and saving below are not included)
    total_time = sum((load_time, cv_time, param_search_time, training_time, predict_time))

    # 6. Visualize confusion matrices with enhanced styling
    print("\n=== Generating Confusion Matrix Visualizations ===")

    # (title prefix, message prefix, matrix, output file) for each heatmap;
    # the training figure is drawn first, then the test figure
    heatmap_jobs = (
        ('Training Set', 'Training set', cm_train_opt, 'training_confusion_matrix.png'),
        ('Test Set', 'Test set', cm_test_opt, 'test_confusion_matrix.png'),
    )

    for heatmap_title, heatmap_message, heatmap_matrix, heatmap_png in heatmap_jobs:
        plt.figure(figsize=(10, 8))
        sns.heatmap(heatmap_matrix, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['Normal', 'Yarn Break'],
                    yticklabels=['Normal', 'Yarn Break'],
                    annot_kws={'size': 16, 'weight': 'bold'},  # Large bold cell counts
                    linewidths=2, linecolor='black')  # Heavy cell borders

        plt.title(f'{heatmap_title} Confusion Matrix - Optimized RF_balanced Model',
                  fontsize=16, fontweight='bold', pad=20)
        plt.xlabel('Predicted Label', fontsize=14, fontweight='bold')
        plt.ylabel('True Label', fontsize=14, fontweight='bold')

        # Thicken the outer frame of the current axes
        ax = plt.gca()
        for spine in ax.spines.values():
            spine.set_linewidth(3)

        plt.tight_layout()
        plt.savefig(heatmap_png, dpi=300, bbox_inches='tight')
        print(f"{heatmap_message} confusion matrix saved to: {heatmap_png}")

    # 7. Store optimized model results
    # Column order of each stored row: Dataset, the five metrics, Parameters
    metric_columns = ('Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC')
    split_scores = {
        'Optimized-Training Set': (train_accuracy_opt, train_precision_opt,
                                   train_recall_opt, train_f1_opt, train_roc_auc_opt),
        'Optimized-Test Set': (test_accuracy_opt, test_precision_opt,
                               test_recall_opt, test_f1_opt, test_roc_auc_opt),
    }
    for dataset_label, split_values in split_scores.items():
        metrics_results.append({
            'Dataset': dataset_label,
            **dict(zip(metric_columns, split_values)),
            'Parameters': str(best_params),
        })

    # One timing row covering every timed stage plus their total
    timing_columns = ('Model Loading Time', 'Cross-Validation Time',
                      'Parameter Search Time', 'Model Training Time',
                      'Prediction Time', 'Total Time')
    timing_values = (load_time, cv_time, param_search_time,
                     training_time, predict_time, total_time)
    execution_times.append(dict(zip(timing_columns, timing_values)))

    # 8. Print optimized model results
    print("\nOptimized RF_balanced Model Evaluation Results:")

    # Each section prints its header, then the five metrics to 4 decimals
    metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC')
    performance_sections = (
        ("Training Set Performance:",
         (train_accuracy_opt, train_precision_opt, train_recall_opt,
          train_f1_opt, train_roc_auc_opt)),
        ("\nTest Set Performance:",
         (test_accuracy_opt, test_precision_opt, test_recall_opt,
          test_f1_opt, test_roc_auc_opt)),
    )
    for section_header, section_values in performance_sections:
        print(section_header)
        for metric_label, metric_value in zip(metric_labels, section_values):
            print(f"  {metric_label}: {metric_value:.4f}")

    # Raw confusion matrices
    print("\nConfusion Matrices:")
    for matrix_header, matrix_values in (
        ("Training Set Confusion Matrix:", cm_train_opt),
        ("Test Set Confusion Matrix:", cm_test_opt),
    ):
        print(matrix_header)
        print(matrix_values)

    # Per-class precision/recall/F1 reports
    for report_header, report_truth, report_predictions in (
        ("\nDetailed Classification Report - Training Set:", y_train, y_train_pred_opt),
        ("\nDetailed Classification Report - Test Set:", y_test, y_test_pred_opt),
    ):
        print(report_header)
        print(classification_report(report_truth, report_predictions,
                                    target_names=['Normal', 'Yarn Break']))

    # 9. Save the optimized model
    print("\n=== Saving Optimized Model ===")

    # Bundle the estimator with the metadata and analysis gathered above
    optimized_model_info = dict(
        model=optimized_rf_model,
        best_params=best_params,
        class_weights=class_weights,
        train_break_count=train_break_count,
        train_break_ratio=train_break_ratio,
        feature_names=feature_names,
        cv_results=cv_results,
        param_analysis=param_analysis,
        optimization_time=param_search_time,
    )

    # First target replaces the file this cell loaded its model from;
    # the second is a separate copy of the optimized bundle
    for bundle_path, bundle_action in (
        ('rf_balanced_updated.pkl', 'updated to'),
        ('rf_balanced_optimized.pkl', 'copy saved to'),
    ):
        joblib.dump(optimized_model_info, bundle_path)
        print(f"Optimized model {bundle_action}: {bundle_path}")

    # 10. Parameter trend visualization
    print("\n=== Generating Parameter Trend Visualization ===")

    # One row per (parameter, value) group from the parameter impact analysis
    param_analysis_df = pd.DataFrame(param_analysis)

    # 2x2 grid, one panel per searched parameter
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    axes = axes.ravel()

    # One colour per panel
    colors = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D']

    # Parameter display name mapping
    param_display_names = {
        'n_estimators': 'Number of Trees (n_estimators)',
        'max_depth': 'Maximum Depth (max_depth)',
        'min_samples_split': 'Minimum Samples Split (min_samples_split)',
        'min_samples_leaf': 'Minimum Samples Leaf (min_samples_leaf)'
    }

    def _param_label(raw_value):
        """Return the tick/legend text for one parameter value.

        A None stored in a numeric DataFrame column comes back as NaN, so an
        identity check against None is not enough: any null is shown as
        'None', and whole-number floats are shown without the trailing '.0'.
        """
        if pd.isna(raw_value):
            return 'None'
        if isinstance(raw_value, float) and raw_value.is_integer():
            return str(int(raw_value))
        return str(raw_value)

    for i, (param_name, display_name) in enumerate(param_display_names.items()):
        param_data = param_analysis_df[param_analysis_df['Parameter'] == param_name]
        if param_data.empty:
            continue

        # Ascending by value; a null value (unlimited max_depth) is placed last
        param_data = param_data.sort_values('Value', na_position='last')
        x_labels = [_param_label(raw_value) for raw_value in param_data['Value']]

        # Plot trend line
        x_positions = range(len(param_data))
        axes[i].plot(x_positions, param_data['Mean_Score'],
                    'o-', linewidth=3, markersize=8, color=colors[i],
                    label='Mean F1-Score', markerfacecolor='white', markeredgewidth=2)

        # Shade the mean +/- one standard deviation band
        axes[i].fill_between(x_positions,
                           param_data['Mean_Score'] - param_data['Std_Score'],
                           param_data['Mean_Score'] + param_data['Std_Score'],
                           alpha=0.2, color=colors[i], label='Standard Deviation Range')

        # Mark best value: translate the best row's index label to its x position
        best_idx = param_data['Mean_Score'].idxmax()
        best_x = param_data.index.get_loc(best_idx)
        best_score = param_data.loc[best_idx, 'Mean_Score']
        best_value = _param_label(param_data.loc[best_idx, 'Value'])

        axes[i].axvline(x=best_x, color='red', linestyle='--', alpha=0.8, linewidth=2)
        axes[i].plot(best_x, best_score, 'o', markersize=10, color='red',
                   label=f'Best Value: {best_value} (F1={best_score:.3f})')

        # Set chart properties
        axes[i].set_title(f'Impact of {display_name} on Model Performance',
                         fontsize=18, fontweight='bold', pad=20)
        axes[i].set_xlabel('Parameter Value', fontsize=18, fontweight='bold')
        axes[i].set_ylabel('F1-Score', fontsize=18, fontweight='bold')
        axes[i].set_xticks(x_positions)
        axes[i].set_xticklabels(x_labels, rotation=45)
        axes[i].legend(loc='lower right' if param_name in ['n_estimators', 'max_depth'] else 'upper right')
        axes[i].grid(True, alpha=0.3)

        # Zoom the y-axis to highlight differences. The 0.8 floor is applied
        # only when every mean score is at least 0.8; applied unconditionally
        # it would clip lower scores and could put the lower limit above the
        # upper one, inverting the axis.
        lowest_mean = param_data['Mean_Score'].min()
        highest_mean = param_data['Mean_Score'].max()
        y_floor = 0.8 if lowest_mean >= 0.8 else 0.0
        y_min = max(y_floor, lowest_mean - 0.05)
        y_max = min(1.0, highest_mean + 0.05)
        axes[i].set_ylim(y_min, y_max)

    plt.suptitle('Random Forest Parameter Impact on Model Performance Trend Analysis',
                 fontsize=18, fontweight='bold', y=0.98)
    plt.tight_layout()
    plt.savefig('parameter_trend_analysis_detailed.png', dpi=1200, bbox_inches='tight')
    print("Detailed parameter trend analysis chart saved to: parameter_trend_analysis_detailed.png")

    # 11. Final results summary
    summary_rule = "=" * 80
    print("\n" + summary_rule)
    print("RF_balanced Model Optimization Summary")
    print(summary_rule)

    # Performance table (one row per evaluated split)
    results_df_final = pd.DataFrame(metrics_results)
    print("\nModel Performance Summary:")
    print(results_df_final.round(4))

    # Timing table (one row per run)
    time_df = pd.DataFrame(execution_times)
    print("\nTime Statistics:")
    print(time_df.round(2))

    # Write every result table to its CSV file, without the index column
    csv_exports = {
        'RF_balanced_optimized_performance.csv': results_df_final,
        'RF_balanced_optimized_timing.csv': time_df,
        'RF_balanced_cross_validation.csv': cv_df,
        'RF_balanced_parameter_analysis.csv': param_analysis_df,
        'RF_balanced_cross_validation_detailed.csv': cv_detailed_df,
    }
    for csv_path, csv_frame in csv_exports.items():
        csv_frame.to_csv(csv_path, index=False)

    print("\nAll results saved to CSV files")

    # Feature importance analysis
    print("\n=== Optimized Model Feature Importance Analysis ===")
    feature_importances_opt = optimized_rf_model.feature_importances_

    # Pair each feature name with its importance, most important first
    importance_df_opt = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances_opt
    })
    importance_df_opt = importance_df_opt.sort_values('Importance', ascending=False)

    print("Top 10 Most Important Features:")
    print(importance_df_opt.head(10).round(4))

    # Save feature importance
    importance_df_opt.to_csv('RF_balanced_optimized_feature_importance.csv', index=False)
    print("Feature importance saved to: RF_balanced_optimized_feature_importance.csv")

    print("\nOptimization process completed!")
    print(f"Total execution time: {total_time:.2f} seconds")

except FileNotFoundError:
    print("Error: Saved RF_balanced model file 'rf_balanced_updated.pkl' not found")
    print("Please run the model creation and saving code first")
except Exception as e:
    print(f"Error during model loading or evaluation: {e}")
    import traceback
    traceback.print_exc()

print("\nAll evaluation and optimization completed!")

In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import joblib

# Plot styling - professional English-language figure format.
# A pasted duplicate of this cell's header (imports plus a second, unclosed
# rcParams.update call) had landed inside the dict literal below and made the
# cell a syntax error; the duplicate is removed and the settings kept once.
plt.style.use('default')
plt.rcParams.update({
    'font.family': 'Arial',
    'font.weight': 'bold',
    'axes.labelweight': 'bold',
    'axes.titleweight': 'bold',
    'axes.linewidth': 2.5,
    'font.size': 16,
    'axes.titlesize': 18,
    'axes.labelsize': 17,
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
    'legend.fontsize': 15,
    'grid.linewidth': 1.2,
    'lines.linewidth': 3.5,
    'figure.dpi': 1200,
    'savefig.dpi': 1200,
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'axes.unicode_minus': False
})

# Announce what this cell is about to run
for banner_line in (
    "=== RF_balanced Model Hyperparameter Tuning and Performance Evaluation ===",
    "Includes Cross-Validation and Hyperparameter Optimization",
    "=" * 50,
):
    print(banner_line)

# Fresh, independent accumulators for the results gathered below
metrics_results, execution_times, cv_results, param_analysis = [], [], [], []

try:
    # Load the previously saved updated RF_balanced model
    print("Loading saved RF_balanced model...")
    load_start_time = time.time()
    model_info = joblib.load('rf_balanced_updated.pkl')
    load_time = time.time() - load_start_time

    # Get model and related information
    rf_balanced_model = model_info['model']
    class_weights = model_info['class_weights']
    train_break_count = model_info['train_break_count']
    train_break_ratio = model_info['train_break_ratio']
    feature_names = model_info['feature_names']

    print(f"Model loaded successfully, time: {load_time:.2f} seconds")
    print(f"Class weights used: {class_weights}")
    print(f"Training set yarn break events: {train_break_count} (ratio: {train_break_ratio:.2%})")

    # 1. Cross-validation evaluation - Output all metrics
    print("\n=== Cross-Validation Evaluation ===")
    cv_start_time = time.time()

    # Use stratified K-fold cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Evaluate multiple metrics
    scoring_metrics = {
        'accuracy': 'accuracy',
        'precision': 'precision',
        'recall': 'recall',
        'f1': 'f1',
        'roc_auc': 'roc_auc'
    }

    cv_scores = {}
    cv_detailed_results = []

    print("Performing 5-fold cross-validation, evaluating multiple metrics...")
    for metric_name, metric_scorer in scoring_metrics.items():
        scores = cross_val_score(rf_balanced_model, X_train, y_train,
                               cv=cv, scoring=metric_scorer, n_jobs=-1)
        cv_scores[metric_name] = scores

        # Store detailed results
        for fold_idx, score in enumerate(scores):
            cv_detailed_results.append({
                'Fold': fold_idx + 1,
                'Metric': metric_name,
                'Score': score
            })

        # Output statistics for each metric
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        cv_results.append({
            'Metric': metric_name,
            'Mean_Score': mean_score,
            'Std_Score': std_score,
            'Scores': scores
        })

        print(f"  {metric_name}: {mean_score:.4f} ± {std_score:.4f}")

    cv_time = time.time() - cv_start_time
    print(f"Cross-validation completed, time: {cv_time:.2f} seconds")

    # Create cross-validation detailed results DataFrame
    cv_detailed_df = pd.DataFrame(cv_detailed_results)

    # Print detailed results for each fold
    print("\nCross-validation detailed results (5 folds):")
    pivot_cv = cv_detailed_df.pivot_table(index='Fold', columns='Metric', values='Score')
    print(pivot_cv.round(4))

    # Print overall statistics
    print("\nCross-validation overall statistics:")
    cv_df = pd.DataFrame(cv_results)
    print(cv_df[['Metric', 'Mean_Score', 'Std_Score']].round(4))

    # 2. Hyperparameter optimization
    print("\n=== Hyperparameter Optimization ===")
    param_search_start = time.time()

    # Define parameter grid
    param_dist = {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Use randomized search for parameter optimization
    random_search = RandomizedSearchCV(
        RandomForestClassifier(random_state=42),
        param_distributions=param_dist,
        n_iter=50,  # Number of iterations for random search
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
        scoring='f1',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    print("Starting randomized search parameter optimization...")
    random_search.fit(X_train, y_train)

    param_search_time = time.time() - param_search_start
    print(f"Parameter optimization completed, time: {param_search_time:.2f} seconds")

    # Output best parameters
    print("\nBest parameter combination:")
    best_params = random_search.best_params_
    for param, value in best_params.items():
        print(f"  {param}: {value}")
    print(f"Best cross-validation score (F1): {random_search.best_score_:.4f}")

    # 3. Parameter impact analysis
    print("\n=== Parameter Impact Analysis ===")

    # Analyze the impact of different parameters on performance
    results_df = pd.DataFrame(random_search.cv_results_)

    # Analyze impact of main parameters
    key_params = ['param_n_estimators', 'param_max_depth',
                 'param_min_samples_split', 'param_min_samples_leaf']

    for param in key_params:
        if param in results_df.columns:
            param_data = results_df.groupby(param)['mean_test_score'].agg(['mean', 'std', 'count']).reset_index()
            param_data = param_data.sort_values('mean', ascending=False)

            print(f"\nImpact of {param} on performance:")
            for _, row in param_data.head(10).iterrows():
                param_value = row[param]
                if hasattr(param_value, '__len__') and not isinstance(param_value, str):
                    param_value = str(param_value)
                print(f"  {param_value}: {row['mean']:.4f} ± {row['std']:.4f}")

            # Store parameter analysis results
            param_analysis.extend([
                {
                    'Parameter': param.replace('param_', ''),
                    'Value': row[param],
                    'Mean_Score': row['mean'],
                    'Std_Score': row['std'],
                    'Count': row['count']
                } for _, row in param_data.iterrows()
            ])

    # 4. Train new model with best parameters
    print("\n=== Training New Model with Best Parameters ===")
    training_start = time.time()

    # Create new model with best parameters
    optimized_rf_model = RandomForestClassifier(
        **best_params,
        random_state=42,
        n_jobs=-1
    )

    # Train the model
    optimized_rf_model.fit(X_train, y_train)
    training_time = time.time() - training_start
    print(f"Model training completed, time: {training_time:.2f} seconds")

    # 5. Evaluate the optimized model
    print("\n=== Optimized Model Performance Evaluation ===")
    predict_start_time = time.time()

    # Make predictions on training and test sets
    y_train_pred_opt = optimized_rf_model.predict(X_train)
    y_test_pred_opt = optimized_rf_model.predict(X_test)

    # Get probability predictions
    y_train_pred_proba_opt = optimized_rf_model.predict_proba(X_train)[:, 1]
    y_test_pred_proba_opt = optimized_rf_model.predict_proba(X_test)[:, 1]

    predict_time = time.time() - predict_start_time
    print(f"Prediction completed, time: {predict_time:.2f} seconds")

    # Calculate training set metrics
    train_accuracy_opt = accuracy_score(y_train, y_train_pred_opt)
    train_precision_opt = precision_score(y_train, y_train_pred_opt, zero_division=0)
    train_recall_opt = recall_score(y_train, y_train_pred_opt, zero_division=0)
    train_f1_opt = f1_score(y_train, y_train_pred_opt, zero_division=0)
    train_roc_auc_opt = roc_auc_score(y_train, y_train_pred_proba_opt)

    # Calculate test set metrics
    test_accuracy_opt = accuracy_score(y_test, y_test_pred_opt)
    test_precision_opt = precision_score(y_test, y_test_pred_opt, zero_division=0)
    test_recall_opt = recall_score(y_test, y_test_pred_opt, zero_division=0)
    test_f1_opt = f1_score(y_test, y_test_pred_opt, zero_division=0)
    test_roc_auc_opt = roc_auc_score(y_test, y_test_pred_proba_opt)

    # Calculate confusion matrices
    cm_train_opt = confusion_matrix(y_train, y_train_pred_opt)
    cm_test_opt = confusion_matrix(y_test, y_test_pred_opt)

    total_time = load_time + cv_time + param_search_time + training_time + predict_time

    # 6. Visualize confusion matrices in one combined figure with enhanced styling
    print("\n=== Generating Combined Confusion Matrix Visualization ===")

    # Create a combined figure with two subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    # Training set confusion matrix
    sns.heatmap(cm_train_opt, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Normal', 'Yarn Break'],
                yticklabels=['Normal', 'Yarn Break'],
                annot_kws={'size': 18, 'weight': 'bold'},
                linewidths=2, linecolor='black', ax=ax1)
    ax1.set_title('Training Set Confusion Matrix\nOptimized RF_balanced Model',
                 fontsize=18, fontweight='bold', pad=20)
    ax1.set_xlabel('Predicted Label', fontsize=18, fontweight='bold')
    ax1.set_ylabel('True Label', fontsize=18, fontweight='bold')

    # Test set confusion matrix
    sns.heatmap(cm_test_opt, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Normal', 'Yarn Break'],
                yticklabels=['Normal', 'Yarn Break'],
                annot_kws={'size': 18, 'weight': 'bold'},
                linewidths=2, linecolor='black', ax=ax2)
    ax2.set_title('Test Set Confusion Matrix\nOptimized RF_balanced Model',
                 fontsize=18, fontweight='bold', pad=20)
    ax2.set_xlabel('Predicted Label', fontsize=18, fontweight='bold')
    ax2.set_ylabel('True Label', fontsize=18, fontweight='bold')

    # Bold the borders for both subplots
    for ax in [ax1, ax2]:
        for spine in ax.spines.values():
            spine.set_linewidth(3)

    plt.suptitle('Optimized RF_balanced Model - Confusion Matrix Analysis',
                fontsize=18, fontweight='bold', y=0.98)
    plt.tight_layout()
    plt.savefig('combined_confusion_matrix.png', dpi=300, bbox_inches='tight')
    print("Combined confusion matrix saved to: combined_confusion_matrix.png")

    # 7. Store optimized model results
    metrics_results.append({
        'Dataset': 'Optimized-Training Set',
        'Accuracy': train_accuracy_opt,
        'Precision': train_precision_opt,
        'Recall': train_recall_opt,
        'F1-Score': train_f1_opt,
        'ROC-AUC': train_roc_auc_opt,
        'Parameters': str(best_params)
    })

    metrics_results.append({
        'Dataset': 'Optimized-Test Set',
        'Accuracy': test_accuracy_opt,
        'Precision': test_precision_opt,
        'Recall': test_recall_opt,
        'F1-Score': test_f1_opt,
        'ROC-AUC': test_roc_auc_opt,
        'Parameters': str(best_params)
    })

    execution_times.append({
        'Model Loading Time': load_time,
        'Cross-Validation Time': cv_time,
        'Parameter Search Time': param_search_time,
        'Model Training Time': training_time,
        'Prediction Time': predict_time,
        'Total Time': total_time
    })

    # 8. Print optimized model results
    print("\nOptimized RF_balanced Model Evaluation Results:")
    print("Training Set Performance:")
    print(f"  Accuracy: {train_accuracy_opt:.4f}")
    print(f"  Precision: {train_precision_opt:.4f}")
    print(f"  Recall: {train_recall_opt:.4f}")
    print(f"  F1-Score: {train_f1_opt:.4f}")
    print(f"  ROC-AUC: {train_roc_auc_opt:.4f}")

    print("\nTest Set Performance:")
    print(f"  Accuracy: {test_accuracy_opt:.4f}")
    print(f"  Precision: {test_precision_opt:.4f}")
    print(f"  Recall: {test_recall_opt:.4f}")
    print(f"  F1-Score: {test_f1_opt:.4f}")
    print(f"  ROC-AUC: {test_roc_auc_opt:.4f}")

    print("\nConfusion Matrices:")
    print("Training Set Confusion Matrix:")
    print(cm_train_opt)
    print("Test Set Confusion Matrix:")
    print(cm_test_opt)

    print("\nDetailed Classification Report - Training Set:")
    print(classification_report(y_train, y_train_pred_opt, target_names=['Normal', 'Yarn Break']))

    print("\nDetailed Classification Report - Test Set:")
    print(classification_report(y_test, y_test_pred_opt, target_names=['Normal', 'Yarn Break']))

    # 9. Save the optimized model with comprehensive information
    print("\n=== Saving Optimized Model ===")
    optimized_model_info = {
        'model': optimized_rf_model,
        'best_params': best_params,
        'class_weights': class_weights,
        'train_break_count': train_break_count,
        'train_break_ratio': train_break_ratio,
        'feature_names': feature_names,
        'cv_results': cv_results,
        'param_analysis': param_analysis,
        'optimization_time': param_search_time,
        'training_metrics': {
            'accuracy': train_accuracy_opt,
            'precision': train_precision_opt,
            'recall': train_recall_opt,
            'f1_score': train_f1_opt,
            'roc_auc': train_roc_auc_opt
        },
        'test_metrics': {
            'accuracy': test_accuracy_opt,
            'precision': test_precision_opt,
            'recall': test_recall_opt,
            'f1_score': test_f1_opt,
            'roc_auc': test_roc_auc_opt
        },
        'confusion_matrices': {
            'train': cm_train_opt,
            'test': cm_test_opt
        },
        'timestamp': time.strftime("%Y-%m-%d %H:%M:%S"),
        'model_version': 'optimized_v1.0'
    }

    # Save the optimized model with timestamp in filename
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    optimized_filename = f'rf_balanced_optimized_{timestamp}.pkl'
    joblib.dump(optimized_model_info, optimized_filename)
    print(f"Optimized model saved to: {optimized_filename}")

    # Also update the main model file for easy access
    joblib.dump(optimized_model_info, 'rf_balanced_optimized_latest.pkl')
    print("Latest optimized model saved to: rf_balanced_optimized_latest.pkl")

    # 10. Parameter trend visualization
    print("\n=== Generating Parameter Trend Visualization ===")

    # Create parameter analysis DataFrame
    param_analysis_df = pd.DataFrame(param_analysis)

    # Create trend charts for each parameter
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    axes = axes.ravel()

    # Color settings
    colors = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D']

    # Parameter display name mapping
    param_display_names = {
        'n_estimators': 'Number of Trees (n_estimators)',
        'max_depth': 'Maximum Depth (max_depth)',
        'min_samples_split': 'Minimum Samples Split (min_samples_split)',
        'min_samples_leaf': 'Minimum Samples Leaf (min_samples_leaf)'
    }

    for i, (param_name, display_name) in enumerate(param_display_names.items()):
        param_data = param_analysis_df[param_analysis_df['Parameter'] == param_name]

        if not param_data.empty:
            # Ensure numeric type sorting
            param_data = param_data.copy()

            # Handle None values in max_depth
            if param_name == 'max_depth':
                param_data['Sort_Key'] = param_data['Value'].apply(lambda x: 1000 if x is None else x)
                param_data = param_data.sort_values('Sort_Key')
                x_labels = [str(x) if x is not None else 'None' for x in param_data['Value']]
            else:
                param_data = param_data.sort_values('Value')
                x_labels = [str(x) for x in param_data['Value']]

            # Plot trend line
            x_positions = range(len(param_data))
            axes[i].plot(x_positions, param_data['Mean_Score'],
                        'o-', linewidth=3, markersize=8, color=colors[i],
                        label='Mean F1-Score', markerfacecolor='white', markeredgewidth=2)

            # Add error bars
            axes[i].fill_between(x_positions,
                               param_data['Mean_Score'] - param_data['Std_Score'],
                               param_data['Mean_Score'] + param_data['Std_Score'],
                               alpha=0.2, color=colors[i], label='Standard Deviation Range')

            # Mark best value
            best_idx = param_data['Mean_Score'].idxmax()
            best_x = list(x_positions)[list(param_data.index).index(best_idx)]
            best_score = param_data.loc[best_idx, 'Mean_Score']
            best_value = param_data.loc[best_idx, 'Value']

            axes[i].axvline(x=best_x, color='red', linestyle='--', alpha=0.8, linewidth=2)
            axes[i].plot(best_x, best_score, 'o', markersize=10, color='red',
                       label=f'Best Value: {best_value} (F1={best_score:.3f})')

            # Set chart properties
            axes[i].set_title(f'Impact of {display_name} on Model Performance',
                             fontsize=14, fontweight='bold', pad=20)
            axes[i].set_xlabel('Parameter Value', fontsize=12, fontweight='bold')
            axes[i].set_ylabel('F1-Score', fontsize=12, fontweight='bold')
            axes[i].set_xticks(x_positions)
            axes[i].set_xticklabels(x_labels, rotation=45)
            axes[i].legend(loc='lower right' if param_name in ['n_estimators', 'max_depth'] else 'upper right')
            axes[i].grid(True, alpha=0.3)

            # Set y-axis range to highlight differences
            y_min = max(0.8, param_data['Mean_Score'].min() - 0.05)
            y_max = min(1.0, param_data['Mean_Score'].max() + 0.05)
            axes[i].set_ylim(y_min, y_max)

    plt.suptitle('Random Forest Parameter Impact on Model Performance Trend Analysis',
                 fontsize=16, fontweight='bold', y=0.98)
    plt.tight_layout()
    plt.savefig('parameter_trend_analysis_detailed.png', dpi=300, bbox_inches='tight')
    print("Detailed parameter trend analysis chart saved to: parameter_trend_analysis_detailed.png")

    # 11. Final results summary
    print("\n" + "=" * 80)
    print("RF_balanced Model Optimization Summary")
    print("=" * 80)

    # Create results DataFrame
    results_df_final = pd.DataFrame(metrics_results)
    print("\nModel Performance Summary:")
    print(results_df_final.round(4))

    # Create time statistics DataFrame
    time_df = pd.DataFrame(execution_times)
    print("\nTime Statistics:")
    print(time_df.round(2))

    # Save all results to CSV files
    results_df_final.to_csv('RF_balanced_optimized_performance.csv', index=False)
    time_df.to_csv('RF_balanced_optimized_timing.csv', index=False)
    cv_df.to_csv('RF_balanced_cross_validation.csv', index=False)
    param_analysis_df.to_csv('RF_balanced_parameter_analysis.csv', index=False)
    cv_detailed_df.to_csv('RF_balanced_cross_validation_detailed.csv', index=False)

    print(f"\nAll results saved to CSV files")

    # Feature importance analysis
    print("\n=== Optimized Model Feature Importance Analysis ===")
    feature_importances_opt = optimized_rf_model.feature_importances_

    # Create feature importance DataFrame
    importance_df_opt = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances_opt
    }).sort_values('Importance', ascending=False)

    print("Top 10 Most Important Features:")
    print(importance_df_opt.head(10).round(4))

    # Save feature importance
    importance_df_opt.to_csv('RF_balanced_optimized_feature_importance.csv', index=False)
    print(f"Feature importance saved to: RF_balanced_optimized_feature_importance.csv")

    print("\nOptimization process completed!")
    print(f"Total execution time: {total_time:.2f} seconds")
    print(f"Optimized model ready for deployment: {optimized_filename}")

except FileNotFoundError:
    print("Error: Saved RF_balanced model file 'rf_balanced_updated.pkl' not found")
    print("Please run the model creation and saving code first")
except Exception as e:
    print(f"Error during model loading or evaluation: {e}")
    import traceback
    traceback.print_exc()

print("\nAll evaluation and optimization completed!")
    'font.weight': 'bold',
    'axes.labelweight': 'bold',
    'axes.titleweight': 'bold',
    'axes.linewidth': 2.5,
    'font.size': 16,
    'axes.titlesize': 18,
    'axes.labelsize': 17,
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
    'legend.fontsize': 15,
    'grid.linewidth': 1.2,
    'lines.linewidth': 3.5,
    'figure.dpi': 1200,
    'savefig.dpi': 1200,
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'axes.unicode_minus': False
})

# Cell banner: title, scope line, and a 50-character rule.
print(
    "=== RF_balanced Model Hyperparameter Tuning and Performance Evaluation ===",
    "Includes Cross-Validation and Hyperparameter Optimization",
    "=" * 50,
    sep="\n",
)

# Result accumulators filled in by the tuning/evaluation steps that follow.
# Each name gets its own, independent empty list.
metrics_results, execution_times, cv_results, param_analysis = [], [], [], []

def is_missing_param_value(value):
    """Return True when *value* is a scalar missing marker (None or NaN).

    Non-scalar values (lists, tuples, arrays) are never treated as missing,
    which avoids an ambiguous element-wise result from ``pd.isna``.
    """
    return pd.api.types.is_scalar(value) and bool(pd.isna(value))


def format_param_value(value):
    """Return a display label for a hyperparameter value.

    Missing values are rendered as ``'None'`` (in this script that is the
    ``max_depth=None`` setting) and integer-valued floats, which appear when
    pandas upcasts a column, are rendered without the trailing ``.0``.
    """
    if is_missing_param_value(value):
        return 'None'
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


try:
    # Load the previously saved RF_balanced bundle.
    # NOTE: joblib.load unpickles arbitrary objects; only load files that were
    # produced by this project.
    print("Loading saved RF_balanced model...")
    load_start_time = time.time()
    model_info = joblib.load('rf_balanced_updated.pkl')
    load_time = time.time() - load_start_time

    # Unpack the model and the metadata stored alongside it
    rf_balanced_model = model_info['model']
    class_weights = model_info['class_weights']
    train_break_count = model_info['train_break_count']
    train_break_ratio = model_info['train_break_ratio']
    feature_names = model_info['feature_names']

    print(f"Model loaded successfully, time: {load_time:.2f} seconds")
    print(f"Class weights used: {class_weights}")
    print(f"Training set yarn break events: {train_break_count} (ratio: {train_break_ratio:.2%})")

    # 1. Cross-validation evaluation - Output all metrics
    print("\n=== Cross-Validation Evaluation ===")
    cv_start_time = time.time()

    # Use stratified K-fold cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Metric name -> scikit-learn scorer name
    scoring_metrics = {
        'accuracy': 'accuracy',
        'precision': 'precision',
        'recall': 'recall',
        'f1': 'f1',
        'roc_auc': 'roc_auc'
    }

    cv_scores = {}
    cv_detailed_results = []

    print("Performing 5-fold cross-validation, evaluating multiple metrics...")
    # cross_validate scores every metric on the same fitted fold model, so the
    # estimator is fitted once per fold instead of once per fold *per metric*.
    from sklearn.model_selection import cross_validate
    cv_output = cross_validate(rf_balanced_model, X_train, y_train,
                               cv=cv, scoring=scoring_metrics, n_jobs=-1)

    for metric_name in scoring_metrics:
        scores = cv_output[f'test_{metric_name}']
        cv_scores[metric_name] = scores

        # Store one record per fold (folds are numbered from 1)
        cv_detailed_results.extend(
            {'Fold': fold_number, 'Metric': metric_name, 'Score': score}
            for fold_number, score in enumerate(scores, start=1)
        )

        # Summary statistics for this metric
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        cv_results.append({
            'Metric': metric_name,
            'Mean_Score': mean_score,
            'Std_Score': std_score,
            'Scores': scores
        })

        print(f"  {metric_name}: {mean_score:.4f} ± {std_score:.4f}")

    cv_time = time.time() - cv_start_time
    print(f"Cross-validation completed, time: {cv_time:.2f} seconds")

    # Create cross-validation detailed results DataFrame
    cv_detailed_df = pd.DataFrame(cv_detailed_results)

    # Print detailed results for each fold
    print("\nCross-validation detailed results (5 folds):")
    pivot_cv = cv_detailed_df.pivot_table(index='Fold', columns='Metric', values='Score')
    print(pivot_cv.round(4))

    # Print overall statistics
    print("\nCross-validation overall statistics:")
    cv_df = pd.DataFrame(cv_results)
    print(cv_df[['Metric', 'Mean_Score', 'Std_Score']].round(4))

    # 2. Hyperparameter optimization
    print("\n=== Hyperparameter Optimization ===")
    param_search_start = time.time()

    # Candidate values sampled by the randomized search
    param_dist = {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # NOTE(review): neither this search nor the final model below receives
    # `class_weights`, so the "optimized RF_balanced" model is trained without
    # class weighting even though the weights are stored next to it. Confirm
    # whether class_weight=class_weights should be passed here.
    random_search = RandomizedSearchCV(
        RandomForestClassifier(random_state=42),
        param_distributions=param_dist,
        n_iter=50,  # Number of sampled parameter combinations
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
        scoring='f1',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    print("Starting randomized search parameter optimization...")
    random_search.fit(X_train, y_train)

    param_search_time = time.time() - param_search_start
    print(f"Parameter optimization completed, time: {param_search_time:.2f} seconds")

    # Output best parameters
    print("\nBest parameter combination:")
    best_params = random_search.best_params_
    for param, value in best_params.items():
        print(f"  {param}: {value}")
    print(f"Best cross-validation score (F1): {random_search.best_score_:.4f}")

    # 3. Parameter impact analysis
    print("\n=== Parameter Impact Analysis ===")

    # One row per sampled parameter combination
    results_df = pd.DataFrame(random_search.cv_results_)

    # Analyze impact of main parameters
    key_params = ['param_n_estimators', 'param_max_depth',
                  'param_min_samples_split', 'param_min_samples_leaf']

    for param in key_params:
        if param not in results_df.columns:
            continue

        # dropna=False keeps the max_depth=None candidates, which groupby
        # would otherwise discard as a missing key.
        param_data = (
            results_df.groupby(param, dropna=False)['mean_test_score']
            .agg(['mean', 'std', 'count'])
            .reset_index()
            .sort_values('mean', ascending=False)
        )
        # to_dict('records') keeps each column's own type; iterrows() would
        # upcast integer parameter values to floats.
        param_records = param_data.to_dict('records')

        print(f"\nImpact of {param} on performance:")
        for record in param_records[:10]:
            print(f"  {format_param_value(record[param])}: {record['mean']:.4f} ± {record['std']:.4f}")

        # Store parameter analysis results
        param_analysis.extend(
            {
                'Parameter': param.replace('param_', ''),
                'Value': None if is_missing_param_value(record[param]) else record[param],
                'Mean_Score': record['mean'],
                'Std_Score': record['std'],
                'Count': record['count']
            } for record in param_records
        )

    # 4. Train new model with best parameters
    print("\n=== Training New Model with Best Parameters ===")
    training_start = time.time()

    # Create new model with best parameters
    optimized_rf_model = RandomForestClassifier(
        **best_params,
        random_state=42,
        n_jobs=-1
    )

    # Train the model
    optimized_rf_model.fit(X_train, y_train)
    training_time = time.time() - training_start
    print(f"Model training completed, time: {training_time:.2f} seconds")

    # 5. Evaluate the optimized model
    print("\n=== Optimized Model Performance Evaluation ===")
    predict_start_time = time.time()

    # Make predictions on training and test sets
    y_train_pred_opt = optimized_rf_model.predict(X_train)
    y_test_pred_opt = optimized_rf_model.predict(X_test)

    # Probability of the positive class (column 1), used for ROC-AUC
    y_train_pred_proba_opt = optimized_rf_model.predict_proba(X_train)[:, 1]
    y_test_pred_proba_opt = optimized_rf_model.predict_proba(X_test)[:, 1]

    predict_time = time.time() - predict_start_time
    print(f"Prediction completed, time: {predict_time:.2f} seconds")

    # Calculate training set metrics
    train_accuracy_opt = accuracy_score(y_train, y_train_pred_opt)
    train_precision_opt = precision_score(y_train, y_train_pred_opt, zero_division=0)
    train_recall_opt = recall_score(y_train, y_train_pred_opt, zero_division=0)
    train_f1_opt = f1_score(y_train, y_train_pred_opt, zero_division=0)
    train_roc_auc_opt = roc_auc_score(y_train, y_train_pred_proba_opt)

    # Calculate test set metrics
    test_accuracy_opt = accuracy_score(y_test, y_test_pred_opt)
    test_precision_opt = precision_score(y_test, y_test_pred_opt, zero_division=0)
    test_recall_opt = recall_score(y_test, y_test_pred_opt, zero_division=0)
    test_f1_opt = f1_score(y_test, y_test_pred_opt, zero_division=0)
    test_roc_auc_opt = roc_auc_score(y_test, y_test_pred_proba_opt)

    # Report-style view of the same metrics, in display order
    train_scores_opt = {
        'Accuracy': train_accuracy_opt,
        'Precision': train_precision_opt,
        'Recall': train_recall_opt,
        'F1-Score': train_f1_opt,
        'ROC-AUC': train_roc_auc_opt
    }
    test_scores_opt = {
        'Accuracy': test_accuracy_opt,
        'Precision': test_precision_opt,
        'Recall': test_recall_opt,
        'F1-Score': test_f1_opt,
        'ROC-AUC': test_roc_auc_opt
    }

    # Calculate confusion matrices
    cm_train_opt = confusion_matrix(y_train, y_train_pred_opt)
    cm_test_opt = confusion_matrix(y_test, y_test_pred_opt)

    # Sum of the timed stages only (plotting and saving are not timed)
    total_time = load_time + cv_time + param_search_time + training_time + predict_time

    # 6. Visualize confusion matrices in one combined figure with enhanced styling
    print("\n=== Generating Combined Confusion Matrix Visualization ===")

    # Create a combined figure with two subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    class_labels = ['Normal', 'Yarn Break']
    # Axis labels use font size 20 (previously 18)
    for ax, matrix, subset_name in ((ax1, cm_train_opt, 'Training'),
                                    (ax2, cm_test_opt, 'Test')):
        sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_labels,
                    yticklabels=class_labels,
                    annot_kws={'size': 18, 'weight': 'bold'},
                    linewidths=2, linecolor='black', ax=ax)
        ax.set_title(f'{subset_name} Set Confusion Matrix\nOptimized RF_balanced Model',
                     fontsize=18, fontweight='bold', pad=20)
        ax.set_xlabel('Predicted Label', fontsize=20, fontweight='bold')
        ax.set_ylabel('True Label', fontsize=20, fontweight='bold')

        # Bold the borders of the subplot
        for spine in ax.spines.values():
            spine.set_linewidth(3)

    plt.suptitle('Optimized RF_balanced Model - Confusion Matrix Analysis',
                 fontsize=18, fontweight='bold', y=0.98)
    plt.tight_layout()
    plt.savefig('combined_confusion_matrix.png', dpi=300, bbox_inches='tight')
    print("Combined confusion matrix saved to: combined_confusion_matrix.png")

    # 7. Store optimized model results
    metrics_results.append({
        'Dataset': 'Optimized-Training Set',
        **train_scores_opt,
        'Parameters': str(best_params)
    })

    metrics_results.append({
        'Dataset': 'Optimized-Test Set',
        **test_scores_opt,
        'Parameters': str(best_params)
    })

    execution_times.append({
        'Model Loading Time': load_time,
        'Cross-Validation Time': cv_time,
        'Parameter Search Time': param_search_time,
        'Model Training Time': training_time,
        'Prediction Time': predict_time,
        'Total Time': total_time
    })

    # 8. Print optimized model results
    print("\nOptimized RF_balanced Model Evaluation Results:")
    for heading, subset_scores in (("Training Set Performance:", train_scores_opt),
                                   ("\nTest Set Performance:", test_scores_opt)):
        print(heading)
        for metric_label, metric_value in subset_scores.items():
            print(f"  {metric_label}: {metric_value:.4f}")

    print("\nConfusion Matrices:")
    print("Training Set Confusion Matrix:")
    print(cm_train_opt)
    print("Test Set Confusion Matrix:")
    print(cm_test_opt)

    print("\nDetailed Classification Report - Training Set:")
    print(classification_report(y_train, y_train_pred_opt, target_names=class_labels))

    print("\nDetailed Classification Report - Test Set:")
    print(classification_report(y_test, y_test_pred_opt, target_names=class_labels))

    # 9. Save the optimized model with comprehensive information
    print("\n=== Saving Optimized Model ===")
    optimized_model_info = {
        'model': optimized_rf_model,
        'best_params': best_params,
        'class_weights': class_weights,
        'train_break_count': train_break_count,
        'train_break_ratio': train_break_ratio,
        'feature_names': feature_names,
        'cv_results': cv_results,
        'param_analysis': param_analysis,
        'optimization_time': param_search_time,
        'training_metrics': {
            'accuracy': train_accuracy_opt,
            'precision': train_precision_opt,
            'recall': train_recall_opt,
            'f1_score': train_f1_opt,
            'roc_auc': train_roc_auc_opt
        },
        'test_metrics': {
            'accuracy': test_accuracy_opt,
            'precision': test_precision_opt,
            'recall': test_recall_opt,
            'f1_score': test_f1_opt,
            'roc_auc': test_roc_auc_opt
        },
        'confusion_matrices': {
            'train': cm_train_opt,
            'test': cm_test_opt
        },
        'timestamp': time.strftime("%Y-%m-%d %H:%M:%S"),
        'model_version': 'optimized_v1.0'
    }

    # Save the optimized model with timestamp in filename
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    optimized_filename = f'rf_balanced_optimized_{timestamp}.pkl'
    joblib.dump(optimized_model_info, optimized_filename)
    print(f"Optimized model saved to: {optimized_filename}")

    # Also update the main model file for easy access
    joblib.dump(optimized_model_info, 'rf_balanced_optimized_latest.pkl')
    print("Latest optimized model saved to: rf_balanced_optimized_latest.pkl")

    # 10. Parameter trend visualization (drawn without legends)
    print("\n=== Generating Parameter Trend Visualization ===")

    # Create parameter analysis DataFrame
    param_analysis_df = pd.DataFrame(param_analysis)

    # One subplot per parameter
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    axes = axes.ravel()

    # Color settings
    colors = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D']

    # Parameter display name mapping
    param_display_names = {
        'n_estimators': 'Number of Trees (n_estimators)',
        'max_depth': 'Maximum Depth (max_depth)',
        'min_samples_split': 'Minimum Samples Split (min_samples_split)',
        'min_samples_leaf': 'Minimum Samples Leaf (min_samples_leaf)'
    }

    for i, (param_name, display_name) in enumerate(param_display_names.items()):
        param_data = param_analysis_df[param_analysis_df['Parameter'] == param_name]

        if param_data.empty:
            continue

        param_data = param_data.copy()

        # Sort numerically. A missing value means max_depth=None (unlimited
        # depth), so it is placed after every finite depth. pandas stores the
        # None as NaN in this column, hence the isna-based check.
        param_data['Sort_Key'] = param_data['Value'].apply(
            lambda v: float('inf') if is_missing_param_value(v) else v
        )
        param_data = param_data.sort_values('Sort_Key')
        x_labels = [format_param_value(v) for v in param_data['Value']]

        trend_ax = axes[i]

        # Plot trend line
        x_positions = range(len(param_data))
        trend_ax.plot(x_positions, param_data['Mean_Score'],
                      'o-', linewidth=3, markersize=8, color=colors[i],
                      label='Mean F1-Score', markerfacecolor='white', markeredgewidth=2)

        # Shade +/- one standard deviation around the mean
        trend_ax.fill_between(x_positions,
                              param_data['Mean_Score'] - param_data['Std_Score'],
                              param_data['Mean_Score'] + param_data['Std_Score'],
                              alpha=0.2, color=colors[i])

        # Mark best value: positional index of the highest mean score
        best_idx = param_data['Mean_Score'].idxmax()
        best_x = param_data.index.get_loc(best_idx)
        best_score = param_data.loc[best_idx, 'Mean_Score']

        trend_ax.axvline(x=best_x, color='red', linestyle='--', alpha=0.8, linewidth=2)
        trend_ax.plot(best_x, best_score, 'o', markersize=10, color='red')

        # Set chart properties (axis labels use font size 20, previously 12)
        trend_ax.set_title(f'Impact of {display_name} on Model Performance',
                           fontsize=16, fontweight='bold', pad=20)
        trend_ax.set_xlabel('Parameter Value', fontsize=20, fontweight='bold')
        trend_ax.set_ylabel('F1-Score', fontsize=20, fontweight='bold')
        trend_ax.set_xticks(x_positions)
        trend_ax.set_xticklabels(x_labels, rotation=45, fontsize=14)
        # No legend is drawn: it overlapped the trend lines.
        trend_ax.grid(True, alpha=0.3)

        # Zoom the y-axis around the observed scores. The lower bound follows
        # the data (clamped to 0) so scores below 0.8 stay visible and the
        # axis can never end up inverted.
        y_min = max(0.0, param_data['Mean_Score'].min() - 0.05)
        y_max = min(1.0, param_data['Mean_Score'].max() + 0.05)
        trend_ax.set_ylim(y_min, y_max)

    plt.suptitle('Random Forest Parameter Impact on Model Performance Trend Analysis',
                 fontsize=18, fontweight='bold', y=0.98)
    plt.tight_layout()
    plt.savefig('parameter_trend_analysis_detailed.png', dpi=300, bbox_inches='tight')
    print("Detailed parameter trend analysis chart saved to: parameter_trend_analysis_detailed.png")

    # 11. Final results summary
    print("\n" + "=" * 80)
    print("RF_balanced Model Optimization Summary")
    print("=" * 80)

    # Create results DataFrame
    results_df_final = pd.DataFrame(metrics_results)
    print("\nModel Performance Summary:")
    print(results_df_final.round(4))

    # Create time statistics DataFrame
    time_df = pd.DataFrame(execution_times)
    print("\nTime Statistics:")
    print(time_df.round(2))

    # Save all results to CSV files
    results_df_final.to_csv('RF_balanced_optimized_performance.csv', index=False)
    time_df.to_csv('RF_balanced_optimized_timing.csv', index=False)
    cv_df.to_csv('RF_balanced_cross_validation.csv', index=False)
    param_analysis_df.to_csv('RF_balanced_parameter_analysis.csv', index=False)
    cv_detailed_df.to_csv('RF_balanced_cross_validation_detailed.csv', index=False)

    print("\nAll results saved to CSV files")

    # Feature importance analysis
    print("\n=== Optimized Model Feature Importance Analysis ===")
    feature_importances_opt = optimized_rf_model.feature_importances_

    # Create feature importance DataFrame, most important first
    importance_df_opt = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances_opt
    }).sort_values('Importance', ascending=False)

    print("Top 10 Most Important Features:")
    print(importance_df_opt.head(10).round(4))

    # Save feature importance
    importance_df_opt.to_csv('RF_balanced_optimized_feature_importance.csv', index=False)
    print("Feature importance saved to: RF_balanced_optimized_feature_importance.csv")

    print("\nOptimization process completed!")
    print(f"Total execution time: {total_time:.2f} seconds")
    print(f"Optimized model ready for deployment: {optimized_filename}")

except FileNotFoundError:
    print("Error: Saved RF_balanced model file 'rf_balanced_updated.pkl' not found")
    print("Please run the model creation and saving code first")
except Exception as e:
    # Top-level boundary of the cell: report the failure with its traceback
    print(f"Error during model loading or evaluation: {e}")
    import traceback
    traceback.print_exc()

print("\nAll evaluation and optimization completed!")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Data preparation: hard-coded feature labels (with mathtext subscripts) and
# their importance scores.
data = {
    'feature': [
        'V$_{1}$', 'P$_{1}$', 'R$_{3}$_380', 'R$_{3}$_375', 'R$_{6}$_600', 'R$_{2}$_9200',
        'R$_{4}$_300', 'R$_{2}$_9500', 'R$_{5}$_8.2', 'R$_{1}$_1300', 'R$_{2}$_9000',
        'R$_{3}$_443', 'R$_{5}$_9.1', 'R$_{1}$_2000', 'R$_{6}$_1100', 'R$_{5}$_11.2', 'R$_{3}$_365',
    ],
    'importance': [
        0.414, 0.391, 0.054, 0.032, 0.019, 0.018,
        0.015, 0.011, 0.010, 0.009, 0.008,
        0.006, 0.005, 0.004, 0.002, 0.001, 0.001,
    ],
}

# Rank the features from most to least important
df = pd.DataFrame(data).sort_values(by='importance', ascending=False)

# Plot style: start from matplotlib's defaults, then apply the overrides
plt.style.use('default')
style_overrides = {
    'font.family': 'Arial',
    'font.size': 20,
    'axes.labelsize': 26,
    'axes.titlesize': 26,
    'xtick.labelsize': 18,
    'ytick.labelsize': 18,
    'figure.dpi': 1200,
    'savefig.dpi': 1200,
    'figure.figsize': (12, 8),
    'axes.grid': True,
    'grid.linestyle': '--',
    'grid.alpha': 0.3,
    'grid.color': 'gray',
    # 'axes.facecolor' is not set here; the axes background is set
    # explicitly on the Axes object below.
}
plt.rcParams.update(style_overrides)

# Blue gradient, one shade per feature
shade_levels = np.linspace(0.3, 0.9, len(df))
colors = plt.cm.Blues(shade_levels)

# Create the figure
fig, ax = plt.subplots(figsize=(12, 8))

# Explicitly make the plotting area (inside the axes) white
ax.set_facecolor('white')

# Horizontal bars, passed in reversed order: barh draws its first entry at the
# bottom, so the top-ranked feature ends up at the top of the chart.
bars = ax.barh(
    df['feature'][::-1],
    df['importance'][::-1],
    color=colors[::-1],
    height=0.8,
    edgecolor='black',
    linewidth=2,  # thick bar outline
)

# Value label just to the right of each bar
for bar in bars:
    bar_length = bar.get_width()
    bar_center_y = bar.get_y() + bar.get_height() / 2
    ax.text(
        bar_length + 0.001,
        bar_center_y,
        f'{bar_length:.3f}',
        va='center',
        ha='left',
        fontsize=18,
        fontweight='bold',  # bold label text
        color='black',
    )

# Axis labels (no title is drawn)
ax.set_xlabel('Feature Importance Score', fontweight='bold', labelpad=24)
ax.set_ylabel('Features', fontweight='bold', labelpad=2)

# Leave 20% headroom on the x-axis so the value labels fit
ax.set_xlim(0, df['importance'].max() * 1.2)

# Hide the top and right borders
for hidden_side in ('right', 'top'):
    ax.spines[hidden_side].set_visible(False)

# Thick black bottom and left borders
for visible_side in ('bottom', 'left'):
    ax.spines[visible_side].set_linewidth(3)
    ax.spines[visible_side].set_color('black')

# Tick colour and thickness on both axes
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, colors='black', width=2, length=6)

# Italic y-axis tick labels
for tick_label in ax.get_yticklabels():
    tick_label.set_fontstyle('italic')

# Set explicit margins, then let tight_layout make the final adjustment
plt.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1)
plt.tight_layout()

# Save the image
plt.savefig('feature_importance_bold_clear.png', bbox_inches='tight', dpi=300)

# Show the figure
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
import time
import pandas as pd
from joblib import dump  # 用于保存模型

# Parallel result lists: the training loop appends one entry per model to each,
# so index i refers to the same model in every list.
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
roc_auc_scores = []
execution_times = []
model_names = []
best_params_list = []  # best hyper-parameters found for each model
feature_importances = []  # per-model feature importances (None if unavailable)

# Candidate classifiers as (short name, unfitted estimator) pairs.
models = [
    ('RF', RandomForestClassifier()),
    ('XGBoost', XGBClassifier()),
    ('LightGBM', LGBMClassifier())
]

# Hyper-parameter grids keyed by the short model name.
# The RF grid holds a single combination; XGBoost has 5*4*3 = 60 and
# LightGBM 5*4*2 = 40 combinations.
param_grids = {
    'RF': {'n_estimators': [10], 'max_depth': [None], 'class_weight': ['balanced']},
    'XGBoost': {'n_estimators': [10, 30, 50, 70, 100], 'learning_rate': [0.005, 0.01, 0.05, 0.1], 'scale_pos_weight': [1, 10, 100]},
    'LightGBM': {'n_estimators': [10, 30, 50, 70, 100], 'learning_rate': [0.005, 0.01, 0.05, 0.1], 'is_unbalance': [True, False]}
}

# Main loop: tune, evaluate and record every candidate classifier.
# X_train / y_train / X_test / y_test come from an earlier cell.
for name, classifier in models:
    start_time = time.time()

    # Hyper-parameter tuning: 5-fold grid search scored by ROC-AUC when a
    # non-empty grid exists for this model, otherwise a plain fit.
    if param_grids.get(name):
        grid_search = GridSearchCV(classifier, param_grid=param_grids[name], cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_  # best parameter combination
    else:
        best_model = classifier
        best_model.fit(X_train, y_train)
        best_params = {}  # nothing was tuned, so there are no best parameters

    # Predict on the test set; column 1 of predict_proba is used as the score.
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    # Classification metrics on the test set.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    roc_auc_scores.append(roc_auc)

    # Elapsed wall time; covers tuning/fitting, prediction and metric computation.
    execution_time = time.time() - start_time
    execution_times.append(execution_time)

    # Record the model name and its best parameters.
    model_names.append(name)
    best_params_list.append(best_params)

    # Feature importances: tree ensembles expose feature_importances_;
    # the coef_ branch is a fallback for linear models.
    if hasattr(best_model, 'feature_importances_'):
        importances = best_model.feature_importances_
        feature_importances.append(importances)
    elif hasattr(best_model, 'coef_'):
        importances = best_model.coef_[0]  # linear models: coefficients of the first class
        feature_importances.append(importances)
    else:
        feature_importances.append(None)

    # Persist the random-forest model to disk.
    if name == 'RF':
        dump(best_model, 'rf_model.joblib')

# Collect the per-model metrics into one table (one row per model).
results_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'Recall': recall_scores,
    'F1 Score': f1_scores,
    'ROC-AUC': roc_auc_scores,
    'Execution Time (s)': execution_times,
    'Best Parameters': best_params_list
})

print("All Models Performance:")
display(results_df)

# Print the feature importances of every model that exposed them.
print("\n=== Feature Importances ===")
for i, name in enumerate(model_names):
    if feature_importances[i] is not None:
        print(f"\nModel: {name}")
        importance_df = pd.DataFrame({
            'Feature': X_train.columns,
            'Importance': feature_importances[i]
        }).sort_values('Importance', ascending=False)
        display(importance_df.head(20))  # show the 20 most important features

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
import time
import pandas as pd
from joblib import dump, load  # 用于保存和加载模型

# Parallel result lists: the training loop appends one entry per model to each,
# so index i refers to the same model in every list.
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
roc_auc_scores = []
execution_times = []
model_names = []
best_params_list = []  # best hyper-parameters found for each model
feature_importances = []  # per-model feature importances (None if unavailable)

# Candidate classifiers as (short name, unfitted estimator) pairs.
models = [
    ('RF', RandomForestClassifier()),
    ('XGBoost', XGBClassifier()),
    ('LightGBM', LGBMClassifier())
]

# Hyper-parameter grids keyed by the short model name.
# The RF grid holds a single combination; XGBoost has 5*4*3 = 60 and
# LightGBM 5*4*2 = 40 combinations.
param_grids = {
    'RF': {'n_estimators': [10], 'max_depth': [None], 'class_weight': ['balanced']},
    'XGBoost': {'n_estimators': [10, 30, 50, 70, 100], 'learning_rate': [0.005, 0.01, 0.05, 0.1], 'scale_pos_weight': [1, 10, 100]},
    'LightGBM': {'n_estimators': [10, 30, 50, 70, 100], 'learning_rate': [0.005, 0.01, 0.05, 0.1], 'is_unbalance': [True, False]}
}

# Main loop: tune, evaluate and record every candidate classifier, and export
# the random-forest model together with its feature importances.
# X_train / y_train / X_test / y_test come from an earlier cell.
import os

# The RF importance CSV is written inside the loop, right after the grid
# search. DataFrame.to_csv raises OSError when the target directory does not
# exist, so create it before any training time is spent.
output_dir = 'D:/code/junma/600000/0424'
os.makedirs(output_dir, exist_ok=True)

for name, classifier in models:
    start_time = time.time()

    # Hyper-parameter tuning: 5-fold grid search scored by ROC-AUC when a
    # non-empty grid exists for this model, otherwise a plain fit.
    if param_grids.get(name):
        grid_search = GridSearchCV(classifier, param_grid=param_grids[name], cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_  # best parameter combination
    else:
        best_model = classifier
        best_model.fit(X_train, y_train)
        best_params = {}  # nothing was tuned, so there are no best parameters

    # Predict on the test set; column 1 of predict_proba is used as the score.
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    # Classification metrics on the test set.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    roc_auc_scores.append(roc_auc)

    # Elapsed wall time; covers tuning/fitting, prediction and metric computation.
    execution_time = time.time() - start_time
    execution_times.append(execution_time)

    # Record the model name and its best parameters.
    model_names.append(name)
    best_params_list.append(best_params)

    # Feature importances: tree ensembles expose feature_importances_;
    # the coef_ branch is a fallback for linear models.
    if hasattr(best_model, 'feature_importances_'):
        importances = best_model.feature_importances_
        feature_importances.append(importances)
    elif hasattr(best_model, 'coef_'):
        importances = best_model.coef_[0]  # linear models: coefficients of the first class
        feature_importances.append(importances)
    else:
        feature_importances.append(None)

    # Persist the random-forest model and its sorted feature importances.
    if name == 'RF':
        dump(best_model, 'rf_model.joblib')
        importance_df = pd.DataFrame({
            'Feature': X_train.columns,
            'Importance': feature_importances[-1]
        }).sort_values('Importance', ascending=False)
        importance_df.to_csv(f'{output_dir}/rf_feature_importance.csv', index=False)  # same path as before

# Collect the per-model metrics into one table (one row per model).
results_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'Recall': recall_scores,
    'F1 Score': f1_scores,
    'ROC-AUC': roc_auc_scores,
    'Execution Time (s)': execution_times,
    'Best Parameters': best_params_list
})

results_df.to_csv('D:/code/junma/600000/0424/model_performance.csv', index=False)  # export the performance table to CSV

print("All Models Performance:")
display(results_df)

# Print the 20 most important features of every model that exposed importances.
print("\n=== Feature Importances ===")
for i, name in enumerate(model_names):
    if feature_importances[i] is not None:
        print(f"\nModel: {name}")
        importance_df = pd.DataFrame({
            'Feature': X_train.columns,
            'Importance': feature_importances[i]
        }).sort_values('Importance', ascending=False)
        display(importance_df.head(20))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
import time
import pandas as pd
from joblib import dump, load  # 用于保存和加载模型
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting style for this cell's figures.
plt.style.use('default')
plt.rcParams.update({
    # The titles and axis labels drawn in this cell are Chinese, and Arial has
    # no CJK glyphs, so an Arial-only setting renders them as empty boxes.
    # Arial stays first (Latin text is unchanged); the following families are
    # per-glyph fallbacks (Matplotlib >= 3.6) for the Chinese characters.
    'font.family': ['Arial', 'Microsoft YaHei', 'SimHei'],
    'font.weight': 'bold',
    'axes.labelweight': 'bold',
    'axes.titleweight': 'bold',
    'axes.linewidth': 2.5,
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
    'figure.dpi': 300,
    'savefig.dpi': 300,
})

# Parallel result lists: one entry per evaluated model is appended to each,
# so index i refers to the same model in every list.
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
roc_auc_scores = []
execution_times = []
model_names = []
best_params_list = []  # best hyper-parameters found for each model
feature_importances = []  # per-model feature importances (None if unavailable)

# Candidate classifiers as (short name, unfitted estimator) pairs.
models = [
    ('RF', RandomForestClassifier()),
    ('XGBoost', XGBClassifier()),
    ('LightGBM', LGBMClassifier())
]

# Hyper-parameter grids keyed by the short model name.
# The RF grid holds a single combination; XGBoost has 5*4*3 = 60 and
# LightGBM 5*4*2 = 40 combinations.
param_grids = {
    'RF': {'n_estimators': [10], 'max_depth': [None], 'class_weight': ['balanced']},
    'XGBoost': {'n_estimators': [10, 30, 50, 70, 100], 'learning_rate': [0.005, 0.01, 0.05, 0.1], 'scale_pos_weight': [1, 10, 100]},
    'LightGBM': {'n_estimators': [10, 30, 50, 70, 100], 'learning_rate': [0.005, 0.01, 0.05, 0.1], 'is_unbalance': [True, False]}
}

# Try to load the previously optimized RF model. The file is read as a dict
# with a 'model' entry (the fitted estimator) and a 'best_params' entry.
# Only a missing file is handled; any other error propagates.
try:
    print("尝试加载优化后的RF模型...")
    optimized_model_info = load('rf_balanced_optimized_latest.pkl')
    optimized_rf_model = optimized_model_info['model']
    rf_best_params = optimized_model_info['best_params']
    print("优化后的RF模型加载成功！")
    print("最佳参数:", rf_best_params)

    # Evaluate the loaded model on the test set.
    # The timer below covers prediction and metrics only (no training), so this
    # model's execution time is not comparable with the models trained later.
    start_time = time.time()

    # Predict; column 1 of predict_proba is used as the score.
    y_pred_optimized = optimized_rf_model.predict(X_test)
    y_pred_proba_optimized = optimized_rf_model.predict_proba(X_test)[:, 1]

    # Classification metrics on the test set.
    accuracy_opt = accuracy_score(y_test, y_pred_optimized)
    precision_opt = precision_score(y_test, y_pred_optimized)
    recall_opt = recall_score(y_test, y_pred_optimized)
    f1_opt = f1_score(y_test, y_pred_optimized)
    roc_auc_opt = roc_auc_score(y_test, y_pred_proba_optimized)

    execution_time_opt = time.time() - start_time

    # Record the optimized model's results under the name 'RF_Optimized'.
    model_names.append('RF_Optimized')
    accuracy_scores.append(accuracy_opt)
    precision_scores.append(precision_opt)
    recall_scores.append(recall_opt)
    f1_scores.append(f1_opt)
    roc_auc_scores.append(roc_auc_opt)
    execution_times.append(execution_time_opt)
    best_params_list.append(rf_best_params)

    # Feature importances of the optimized model.
    feature_importances_opt = optimized_rf_model.feature_importances_
    feature_importances.append(feature_importances_opt)

    print("优化后的RF模型评估完成！")

except FileNotFoundError:
    # No saved model: the training loop will fit the standard RF instead.
    print("优化后的RF模型文件未找到，将使用标准RF模型...")
    optimized_rf_model = None

# Main loop: train and evaluate the remaining models.
# X_train / y_train / X_test / y_test come from an earlier cell.
for name, classifier in models:
    # Skip the standard RF when the optimized version was loaded above.
    if name == 'RF' and optimized_rf_model is not None:
        print("跳过标准RF模型训练，使用优化版本...")
        continue

    start_time = time.time()

    # Hyper-parameter tuning: 5-fold grid search scored by ROC-AUC when a
    # non-empty grid exists for this model, otherwise a plain fit.
    if param_grids.get(name):
        print(f"正在对{name}进行超参数调优...")
        grid_search = GridSearchCV(classifier, param_grid=param_grids[name], cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_  # best parameter combination
    else:
        best_model = classifier
        best_model.fit(X_train, y_train)
        best_params = {}  # nothing was tuned, so there are no best parameters

    # Predict on the test set; column 1 of predict_proba is used as the score.
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    # Classification metrics on the test set.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    roc_auc_scores.append(roc_auc)

    # Elapsed wall time; covers tuning/fitting, prediction and metric computation.
    execution_time = time.time() - start_time
    execution_times.append(execution_time)

    # Record the model name and its best parameters.
    model_names.append(name)
    best_params_list.append(best_params)

    # Feature importances: tree ensembles expose feature_importances_;
    # the coef_ branch is a fallback for linear models.
    if hasattr(best_model, 'feature_importances_'):
        importances = best_model.feature_importances_
        feature_importances.append(importances)
    elif hasattr(best_model, 'coef_'):
        importances = best_model.coef_[0]  # linear models: coefficients of the first class
        feature_importances.append(importances)
    else:
        feature_importances.append(None)

    print(f"{name}模型训练和评估完成！")

# Collect the per-model metrics into one table (one row per model), export it
# to CSV and display it.
import os

results_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'Recall': recall_scores,
    'F1 Score': f1_scores,
    'ROC-AUC': roc_auc_scores,
    'Execution Time (s)': execution_times,
    'Best Parameters': best_params_list
})

# DataFrame.to_csv raises OSError when the target directory is missing, which
# would discard the results of the (slow) training above; create it first.
output_dir = 'D:/code/junma/600000/0424'
os.makedirs(output_dir, exist_ok=True)
results_df.to_csv(f'{output_dir}/model_performance.csv', index=False)  # same path as before

print("\n所有模型性能比较:")
display(results_df)

# Feature-importance analysis and visualisation.
print("\n" + "="*80)
print("特征重要性分析")
print("="*80)

# For every model that exposed importances: show, export and plot them.
for i, name in enumerate(model_names):
    if feature_importances[i] is not None:
        print(f"\n--- {name} 模型特征重要性 ---")

        # Importance table sorted from most to least important.
        importance_df = pd.DataFrame({
            'Feature': X_train.columns,
            'Importance': feature_importances[i]
        }).sort_values('Importance', ascending=False)

        # Show the 20 most important features.
        print(f"前20个最重要特征:")
        display(importance_df.head(20))

        # Export the full importance table; the file name uses the lower-cased model name.
        csv_filename = f'D:/code/junma/600000/0424/{name.lower()}_feature_importance.csv'
        importance_df.to_csv(csv_filename, index=False)
        print(f"特征重要性已保存到: {csv_filename}")

        # Plot the 15 most important features.
        plt.figure(figsize=(12, 8))
        top_features = importance_df.head(15)

        # Horizontal bar chart, one bar per feature.
        # NOTE(review): the labels/titles below are Chinese - confirm the
        # configured font provides CJK glyphs.
        plt.barh(range(len(top_features)), top_features['Importance'],
                color='skyblue', edgecolor='black', linewidth=1.2)
        plt.yticks(range(len(top_features)), top_features['Feature'])
        plt.xlabel('特征重要性', fontweight='bold', fontsize=12)
        plt.title(f'{name}模型 - 前15个最重要特征', fontweight='bold', fontsize=14)
        plt.gca().invert_yaxis()  # put the most important feature at the top

        # Value label just to the right of each bar.
        for j, v in enumerate(top_features['Importance']):
            plt.text(v + 0.001, j, f'{v:.4f}', va='center', fontweight='bold')

        plt.tight_layout()

        # Save the figure next to the CSV.
        img_filename = f'D:/code/junma/600000/0424/{name.lower()}_feature_importance.png'
        plt.savefig(img_filename, dpi=300, bbox_inches='tight')
        print(f"特征重要性图已保存到: {img_filename}")
        plt.show()

# Cross-model comparison of feature importances (for models that expose them).
print("\n--- 所有模型特征重要性比较 ---")

# Map each model name to the set of its 10 most important features.
common_important_features = {}
for i, name in enumerate(model_names):
    if feature_importances[i] is not None:
        importance_df = pd.DataFrame({
            'Feature': X_train.columns,
            'Importance': feature_importances[i]
        }).sort_values('Importance', ascending=False)

        # Top-10 features of this model.
        top_features = importance_df.head(10)['Feature'].tolist()
        common_important_features[name] = set(top_features)

        print(f"{name} 前10重要特征: {top_features}")

# Features that appear in every model's top 10 (needs at least two models).
if len(common_important_features) > 1:
    common_features = set.intersection(*common_important_features.values())
    print(f"\n所有模型共同的重要特征: {list(common_features)}")

# Heat map comparing feature importances across models (needs at least two
# models with importances).
models_with_importance = [name for i, name in enumerate(model_names)
                         if feature_importances[i] is not None]

if len(models_with_importance) >= 2:
    print(f"\n创建特征重要性比较热图...")

    # Rows of the heat map: the 15 most important features of the FIRST model
    # that has importances; the other models are shown on those same features.
    first_model_idx = model_names.index(models_with_importance[0])
    top_features_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': feature_importances[first_model_idx]
    }).sort_values('Importance', ascending=False).head(15)
    top_features = top_features_df['Feature'].tolist()

    # Build the comparison table: one row per feature, one column per model.
    comparison_data = []
    for feature in top_features:
        row = {'Feature': feature}
        for model_name in models_with_importance:
            model_idx = model_names.index(model_name)
            # Position of the feature in X_train's column order, which is the
            # order of the stored importance arrays.
            feature_idx = list(X_train.columns).index(feature)
            importance_value = feature_importances[model_idx][feature_idx]
            row[model_name] = importance_value
        comparison_data.append(row)

    comparison_df = pd.DataFrame(comparison_data)
    comparison_df = comparison_df.set_index('Feature')

    # Draw the annotated heat map.
    # NOTE(review): importances from different libraries may be on different
    # scales, so colours may not be comparable across columns - confirm.
    plt.figure(figsize=(12, 10))
    sns.heatmap(comparison_df, annot=True, cmap='YlOrRd', fmt='.4f',
                linewidths=1, linecolor='black', cbar_kws={'label': '特征重要性'})
    plt.title('不同模型间特征重要性比较', fontweight='bold', fontsize=16, pad=20)
    plt.xlabel('模型', fontweight='bold', fontsize=12)
    plt.ylabel('特征', fontweight='bold', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Save the heat map.
    heatmap_filename = 'D:/code/junma/600000/0424/feature_importance_comparison_heatmap.png'
    plt.savefig(heatmap_filename, dpi=300, bbox_inches='tight')
    print(f"特征重要性比较热图已保存到: {heatmap_filename}")
    plt.show()

    # Save the comparison table (the feature names are kept as the index column).
    comparison_df.to_csv('D:/code/junma/600000/0424/feature_importance_comparison.csv')
    print("特征重要性比较数据已保存到CSV文件")

print("\n所有模型训练和特征重要性分析完成！")

In [None]:
# Load the optimized model and produce the final prediction report:
# per-class breakdowns, error analysis, probability statistics, CSV exports,
# a summary table and (optionally) a 2x2 diagnostic figure.
# X_test / y_test / pd / display come from earlier cells.
def format_rate(numerator, denominator):
    """Format numerator/denominator as 'x.xxxx (xx.xx%)', or 'N/A' if the denominator is 0."""
    if denominator <= 0:
        return "N/A"
    rate = numerator / denominator
    return f"{rate:.4f} ({rate:.2%})"


try:
    print("加载优化后的模型进行最终预测...")

    # The neighbouring cells only do `from joblib import dump, load`, which
    # does not bind the module name `joblib`; import it here so the call below
    # cannot fail with a NameError that the generic handler would swallow.
    import joblib

    # Load the saved artifact: a dict with 'model' and 'best_params' entries.
    optimized_model_info = joblib.load('rf_balanced_optimized_latest.pkl')
    final_model = optimized_model_info['model']
    best_params = optimized_model_info['best_params']

    print("优化模型加载成功！")
    print(f"使用的最佳参数: {best_params}")

    # Predict with the optimized model.
    y_final_pred = final_model.predict(X_test)
    y_final_pred_proba = final_model.predict_proba(X_test)[:, 1]  # score from column 1 (class 1)

    # Make both label vectors integer typed so they compare cleanly.
    final_y_pred = y_final_pred.astype(int)
    final_y_test = y_test.astype(int)

    # Full per-sample prediction table.
    results_full = pd.DataFrame({
        'Predicted_Value': final_y_pred,
        'True_Value': final_y_test,
        'Prediction_Probability': y_final_pred_proba,
        'Prediction_Correct': final_y_pred == final_y_test
    })

    # Human-readable prediction status.
    results_full['Prediction_Status'] = results_full['Prediction_Correct'].map({
        True: '正确预测',
        False: '错误预测'
    })

    # Human-readable class labels (0 = normal, 1 = yarn break).
    results_full['True_Label'] = results_full['True_Value'].map({0: '正常', 1: '断纱'})
    results_full['Predicted_Label'] = results_full['Predicted_Value'].map({0: '正常', 1: '断纱'})

    print("\n" + "="*80)
    print("优化模型预测结果总览")
    print("="*80)

    # Overall statistics.
    total_samples = len(results_full)
    correct_predictions = results_full['Prediction_Correct'].sum()
    overall_accuracy = correct_predictions / total_samples

    print(f"总样本数: {total_samples}")
    print(f"正确预测数: {correct_predictions}")
    print(f"总体准确率: {overall_accuracy:.4f} ({overall_accuracy:.2%})")

    # 1. Samples predicted as yarn break (predicted value = 1).
    print("\n" + "="*80)
    print("断纱预测结果分析 (预测值 = 1)")
    print("="*80)

    results_break = results_full[results_full['Predicted_Value'] == 1].copy()
    results_break['Difference'] = results_break['True_Value'] - results_break['Predicted_Value']

    break_total = len(results_break)
    break_correct = (results_break['True_Value'] == 1).sum()    # true positives
    break_incorrect = (results_break['True_Value'] == 0).sum()  # false positives
    # Share of predicted breaks that are real breaks (i.e. precision).
    break_accuracy = break_correct / break_total if break_total > 0 else 0

    print(f"预测为断纱的样本总数: {break_total}")
    print(f"其中实际为断纱(正确预测): {break_correct}")
    print(f"其中实际为正常(错误预测 - 误报): {break_incorrect}")
    print(f"断纱预测准确率: {break_accuracy:.4f} ({break_accuracy:.2%})")

    # Show the first and last five predicted-break samples.
    print(f"\n前5个断纱预测样本:")
    break_display_cols = ['Predicted_Label', 'True_Label', 'Prediction_Probability', 'Prediction_Status']
    display(results_break[break_display_cols].head())

    print(f"\n后5个断纱预测样本:")
    display(results_break[break_display_cols].tail())

    # 2. Samples predicted as normal (predicted value = 0).
    print("\n" + "="*80)
    print("正常预测结果分析 (预测值 = 0)")
    print("="*80)

    results_normal = results_full[results_full['Predicted_Value'] == 0].copy()
    results_normal['Difference'] = results_normal['True_Value'] - results_normal['Predicted_Value']

    normal_total = len(results_normal)
    normal_correct = (results_normal['True_Value'] == 0).sum()    # true negatives
    normal_incorrect = (results_normal['True_Value'] == 1).sum()  # false negatives
    # Share of predicted normals that are really normal.
    normal_accuracy = normal_correct / normal_total if normal_total > 0 else 0

    print(f"预测为正常的样本总数: {normal_total}")
    print(f"其中实际为正常(正确预测): {normal_correct}")
    print(f"其中实际为断纱(错误预测 - 漏报): {normal_incorrect}")
    print(f"正常预测准确率: {normal_accuracy:.4f} ({normal_accuracy:.2%})")

    # Show the first and last five predicted-normal samples.
    print(f"\n前5个正常预测样本:")
    normal_display_cols = ['Predicted_Label', 'True_Label', 'Prediction_Probability', 'Prediction_Status']
    display(results_normal[normal_display_cols].head())

    print(f"\n后5个正常预测样本:")
    display(results_normal[normal_display_cols].tail())

    # 3. Detailed error analysis.
    print("\n" + "="*80)
    print("详细错误分析")
    print("="*80)

    # False alarms: predicted 1 but actually 0.
    false_positives = results_break[results_break['True_Value'] == 0]
    print(f"误报数量 (预测断纱但实际正常): {len(false_positives)}")
    if len(false_positives) > 0:
        print("误报样本详情:")
        display(false_positives[['Prediction_Probability', 'True_Label', 'Predicted_Label']].head(10))

    # Misses: predicted 0 but actually 1.
    false_negatives = results_normal[results_normal['True_Value'] == 1]
    print(f"\n漏报数量 (预测正常但实际断纱): {len(false_negatives)}")
    if len(false_negatives) > 0:
        print("漏报样本详情:")
        display(false_negatives[['Prediction_Probability', 'True_Label', 'Predicted_Label']].head(10))

    # 4. Distribution of the predicted probabilities.
    # Prediction_Probability is always the class-1 score, so the "correct" and
    # "incorrect" groups below mix samples of both classes.
    print("\n" + "="*80)
    print("预测概率分布分析")
    print("="*80)

    correct_probabilities = results_full[results_full['Prediction_Correct'] == True]['Prediction_Probability']
    incorrect_probabilities = results_full[results_full['Prediction_Correct'] == False]['Prediction_Probability']

    print(f"正确预测的平均概率: {correct_probabilities.mean():.4f}")
    print(f"错误预测的平均概率: {incorrect_probabilities.mean():.4f}")
    print(f"正确预测的概率标准差: {correct_probabilities.std():.4f}")
    print(f"错误预测的概率标准差: {incorrect_probabilities.std():.4f}")

    # Probability statistics per true class.
    for true_class in [0, 1]:
        class_data = results_full[results_full['True_Value'] == true_class]
        class_name = '正常' if true_class == 0 else '断纱'
        print(f"\n{class_name}样本的预测概率统计:")
        print(f"  平均概率: {class_data['Prediction_Probability'].mean():.4f}")
        print(f"  概率中位数: {class_data['Prediction_Probability'].median():.4f}")
        print(f"  概率标准差: {class_data['Prediction_Probability'].std():.4f}")

    # 5. Save the detailed results (written to the current working directory).
    print("\n" + "="*80)
    print("保存预测结果")
    print("="*80)

    # Full table.
    results_full.to_csv('optimized_model_complete_predictions.csv', index=False)
    print("完整预测结果已保存至: optimized_model_complete_predictions.csv")

    # Predicted-break rows.
    results_break.to_csv('optimized_model_break_predictions.csv', index=False)
    print("断纱预测结果已保存至: optimized_model_break_predictions.csv")

    # Predicted-normal rows.
    results_normal.to_csv('optimized_model_normal_predictions.csv', index=False)
    print("正常预测结果已保存至: optimized_model_normal_predictions.csv")

    # Misclassified rows (false positives followed by false negatives).
    error_analysis = pd.concat([false_positives, false_negatives])
    if len(error_analysis) > 0:
        error_analysis.to_csv('optimized_model_error_analysis.csv', index=False)
        print("错误分析结果已保存至: optimized_model_error_analysis.csv")

    # 6. Summary report.
    print("\n" + "="*80)
    print("预测结果汇总报告")
    print("="*80)

    # Class totals by TRUE label, needed for the standard rate definitions:
    #   false positive rate = FP / (FP + TN)  (share of real normals flagged as breaks)
    #   false negative rate = FN / (FN + TP)  (share of real breaks that were missed)
    # Dividing by the predicted-class totals instead would report the false
    # discovery / false omission rates under the wrong names.
    actual_normal_total = break_incorrect + normal_correct
    actual_break_total = normal_incorrect + break_correct

    summary_report = {
        '总样本数': total_samples,
        '正确预测数': correct_predictions,
        '总体准确率': f"{overall_accuracy:.4f} ({overall_accuracy:.2%})",
        '断纱预测总数': break_total,
        '断纱正确预测数': break_correct,
        '断纱误报数': break_incorrect,
        '断纱预测准确率': f"{break_accuracy:.4f} ({break_accuracy:.2%})",
        '正常预测总数': normal_total,
        '正常正确预测数': normal_correct,
        '正常漏报数': normal_incorrect,
        '正常预测准确率': f"{normal_accuracy:.4f} ({normal_accuracy:.2%})",
        '误报率 (False Positive Rate)': format_rate(break_incorrect, actual_normal_total),
        '漏报率 (False Negative Rate)': format_rate(normal_incorrect, actual_break_total),
        '使用模型': 'RF_Optimized',
        '最佳参数': str(best_params)
    }

    summary_df = pd.DataFrame(list(summary_report.items()), columns=['指标', '值'])
    print("预测结果汇总:")
    display(summary_df)

    # Save the summary report.
    summary_df.to_csv('optimized_model_prediction_summary.csv', index=False)
    print("\n预测汇总报告已保存至: optimized_model_prediction_summary.csv")

    # 7. Optional visualisation; skipped when the plotting libraries are missing.
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        print("\n生成预测结果可视化...")

        # Plot style. The labels below are Chinese and Arial has no CJK glyphs,
        # so CJK-capable families are listed as per-glyph fallbacks
        # (Matplotlib >= 3.6); Arial stays first for Latin text.
        plt.style.use('default')
        plt.rcParams.update({
            'font.family': ['Arial', 'Microsoft YaHei', 'SimHei'],
            'font.weight': 'bold',
            'axes.labelweight': 'bold',
            'axes.titleweight': 'bold',
            'font.size': 10
        })

        # 2x2 grid of diagnostic plots.
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # Panel 1: histogram of all predicted probabilities.
        axes[0,0].hist(results_full['Prediction_Probability'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
        axes[0,0].set_xlabel('预测概率')
        axes[0,0].set_ylabel('频数')
        axes[0,0].set_title('预测概率分布')
        axes[0,0].grid(True, alpha=0.3)

        # Panel 2: probabilities of correct vs incorrect predictions.
        axes[0,1].hist(correct_probabilities, bins=30, alpha=0.7, label='正确预测', color='green')
        axes[0,1].hist(incorrect_probabilities, bins=30, alpha=0.7, label='错误预测', color='red')
        axes[0,1].set_xlabel('预测概率')
        axes[0,1].set_ylabel('频数')
        axes[0,1].set_title('正确vs错误预测的概率分布')
        axes[0,1].legend()
        axes[0,1].grid(True, alpha=0.3)

        # Panel 3: pie chart of the predicted classes.
        prediction_counts = results_full['Predicted_Label'].value_counts()
        axes[1,0].pie(prediction_counts.values, labels=prediction_counts.index, autopct='%1.1f%%', startangle=90)
        axes[1,0].set_title('预测类别分布')

        # Panel 4: overall and per-predicted-class accuracy bars.
        accuracy_data = [overall_accuracy, break_accuracy, normal_accuracy]
        accuracy_labels = ['总体准确率', '断纱预测准确率', '正常预测准确率']
        bars = axes[1,1].bar(accuracy_labels, accuracy_data, color=['blue', 'red', 'green'])
        axes[1,1].set_ylabel('准确率')
        axes[1,1].set_title('各类别预测准确率')
        axes[1,1].set_ylim(0, 1)

        # Value label above each bar.
        for bar, acc in zip(bars, accuracy_data):
            height = bar.get_height()
            axes[1,1].text(bar.get_x() + bar.get_width()/2., height + 0.01,
                          f'{acc:.3f}', ha='center', va='bottom', fontweight='bold')

        plt.tight_layout()
        plt.savefig('optimized_model_prediction_analysis.png', dpi=300, bbox_inches='tight')
        print("预测分析图已保存至: optimized_model_prediction_analysis.png")
        plt.show()

    except ImportError:
        print("Matplotlib/Seaborn 不可用，跳过可视化部分")

    print("\n优化模型预测分析完成！")

except FileNotFoundError:
    # The saved model does not exist yet.
    print("错误: 优化模型文件 'rf_balanced_optimized_latest.pkl' 未找到")
    print("请先运行模型优化代码")

except Exception as e:
    # Top-level boundary of the cell: report the failure with its traceback.
    print(f"预测过程中发生错误: {e}")
    import traceback
    traceback.print_exc()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
import time
import pandas as pd
from joblib import dump, load  # 用于保存和加载模型
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting style for this cell's figures.
plt.style.use('default')
plt.rcParams.update({
    # The titles and axis labels drawn in this cell are Chinese, and Arial has
    # no CJK glyphs, so an Arial-only setting renders them as empty boxes.
    # Arial stays first (Latin text is unchanged); the following families are
    # per-glyph fallbacks (Matplotlib >= 3.6) for the Chinese characters.
    'font.family': ['Arial', 'Microsoft YaHei', 'SimHei'],
    'font.weight': 'bold',
    'axes.labelweight': 'bold',
    'axes.titleweight': 'bold',
    'axes.linewidth': 2.5,
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
    'figure.dpi': 300,
    'savefig.dpi': 300,
})

# Parallel result lists: one entry per evaluated model is appended to each,
# so index i refers to the same model in every list.
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
roc_auc_scores = []
execution_times = []
model_names = []
best_params_list = []  # best hyper-parameters found for each model
feature_importances = []  # per-model feature importances (None if unavailable)

# Candidate classifiers as (short name, unfitted estimator) pairs.
models = [
    ('RF', RandomForestClassifier()),
    ('XGBoost', XGBClassifier()),
    ('LightGBM', LGBMClassifier())
]

# Hyper-parameter grids keyed by the short model name.
# The RF grid holds a single combination; XGBoost has 5*4*3 = 60 and
# LightGBM 5*4*2 = 40 combinations.
param_grids = {
    'RF': {'n_estimators': [10], 'max_depth': [None], 'class_weight': ['balanced']},
    'XGBoost': {'n_estimators': [10, 30, 50, 70, 100], 'learning_rate': [0.005, 0.01, 0.05, 0.1], 'scale_pos_weight': [1, 10, 100]},
    'LightGBM': {'n_estimators': [10, 30, 50, 70, 100], 'learning_rate': [0.005, 0.01, 0.05, 0.1], 'is_unbalance': [True, False]}
}

# Try to load the previously optimized RF model. The file is read as a dict
# with a 'model' entry (the fitted estimator) and a 'best_params' entry.
# Only a missing file is handled; any other error propagates.
try:
    print("尝试加载优化后的RF模型...")
    optimized_model_info = load('rf_balanced_optimized_latest.pkl')
    optimized_rf_model = optimized_model_info['model']
    rf_best_params = optimized_model_info['best_params']
    print("优化后的RF模型加载成功！")
    print("最佳参数:", rf_best_params)

    # Evaluate the loaded model on the test set.
    # The timer below covers prediction and metrics only (no training), so this
    # model's execution time is not comparable with the models trained later.
    start_time = time.time()

    # Predict; column 1 of predict_proba is used as the score.
    y_pred_optimized = optimized_rf_model.predict(X_test)
    y_pred_proba_optimized = optimized_rf_model.predict_proba(X_test)[:, 1]

    # Classification metrics on the test set.
    accuracy_opt = accuracy_score(y_test, y_pred_optimized)
    precision_opt = precision_score(y_test, y_pred_optimized)
    recall_opt = recall_score(y_test, y_pred_optimized)
    f1_opt = f1_score(y_test, y_pred_optimized)
    roc_auc_opt = roc_auc_score(y_test, y_pred_proba_optimized)

    execution_time_opt = time.time() - start_time

    # Record the optimized model's results under the name 'RF_Optimized'.
    model_names.append('RF_Optimized')
    accuracy_scores.append(accuracy_opt)
    precision_scores.append(precision_opt)
    recall_scores.append(recall_opt)
    f1_scores.append(f1_opt)
    roc_auc_scores.append(roc_auc_opt)
    execution_times.append(execution_time_opt)
    best_params_list.append(rf_best_params)

    # Feature importances of the optimized model.
    feature_importances_opt = optimized_rf_model.feature_importances_
    feature_importances.append(feature_importances_opt)

    print("优化后的RF模型评估完成！")

except FileNotFoundError:
    # No saved model: the training loop will fit the standard RF instead.
    print("优化后的RF模型文件未找到，将使用标准RF模型...")
    optimized_rf_model = None

# Main loop: train and evaluate the remaining models.
# X_train / y_train / X_test / y_test come from an earlier cell.
for name, classifier in models:
    # Skip the standard RF when the optimized version was loaded above.
    if name == 'RF' and optimized_rf_model is not None:
        print("跳过标准RF模型训练，使用优化版本...")
        continue

    start_time = time.time()

    # Hyper-parameter tuning: 5-fold grid search scored by ROC-AUC when a
    # non-empty grid exists for this model, otherwise a plain fit.
    if param_grids.get(name):
        print(f"正在对{name}进行超参数调优...")
        grid_search = GridSearchCV(classifier, param_grid=param_grids[name], cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_  # best parameter combination
    else:
        best_model = classifier
        best_model.fit(X_train, y_train)
        best_params = {}  # nothing was tuned, so there are no best parameters

    # Predict on the test set; column 1 of predict_proba is used as the score.
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    # Classification metrics on the test set.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    roc_auc_scores.append(roc_auc)

    # Elapsed wall time; covers tuning/fitting, prediction and metric computation.
    execution_time = time.time() - start_time
    execution_times.append(execution_time)

    # Record the model name and its best parameters.
    model_names.append(name)
    best_params_list.append(best_params)

    # Feature importances: tree ensembles expose feature_importances_;
    # the coef_ branch is a fallback for linear models.
    if hasattr(best_model, 'feature_importances_'):
        importances = best_model.feature_importances_
        feature_importances.append(importances)
    elif hasattr(best_model, 'coef_'):
        importances = best_model.coef_[0]  # linear models: coefficients of the first class
        feature_importances.append(importances)
    else:
        feature_importances.append(None)

    print(f"{name}模型训练和评估完成！")

# Collect the per-model metrics into one table (one row per model), export it
# to CSV and display it.
import os

results_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'Recall': recall_scores,
    'F1 Score': f1_scores,
    'ROC-AUC': roc_auc_scores,
    'Execution Time (s)': execution_times,
    'Best Parameters': best_params_list
})

# DataFrame.to_csv raises OSError when the target directory is missing, which
# would discard the results of the (slow) training above; create it first.
output_dir = 'D:/code/junma/600000/0424'
os.makedirs(output_dir, exist_ok=True)
results_df.to_csv(f'{output_dir}/model_performance.csv', index=False)  # same path as before

print("\n所有模型性能比较:")
display(results_df)

# 特征重要性分析和可视化
print("\n" + "="*80)
print("特征重要性分析")
print("="*80)

# 为每个模型创建特征重要性分析
for i, name in enumerate(model_names):
    if feature_importances[i] is not None:
        print(f"\n--- {name} 模型特征重要性 ---")

        # 创建特征重要性DataFrame
        importance_df = pd.DataFrame({
            'Feature': X_train.columns,
            'Importance': feature_importances[i]
        }).sort_values('Importance', ascending=False)

        # 显示前20个最重要的特征
        print(f"前20个最重要特征:")
        display(importance_df.head(20))

        # 保存特征重要性到CSV
        csv_filename = f'D:/code/junma/600000/0424/{name.lower()}_feature_importance.csv'
        importance_df.to_csv(csv_filename, index=False)
        print(f"特征重要性已保存到: {csv_filename}")

        # 创建特征重要性可视化
        plt.figure(figsize=(12, 8))
        top_features = importance_df.head(15)

        # 水平条形图
        plt.barh(range(len(top_features)), top_features['Importance'],
                color='skyblue', edgecolor='black', linewidth=1.2)
        plt.yticks(range(len(top_features)), top_features['Feature'])
        plt.xlabel('特征重要性', fontweight='bold', fontsize=12)
        plt.title(f'{name}模型 - 前15个最重要特征', fontweight='bold', fontsize=14)
        plt.gca().invert_yaxis()  # 最重要的特征在顶部

        # 添加数值标签
        for j, v in enumerate(top_features['Importance']):
            plt.text(v + 0.001, j, f'{v:.4f}', va='center', fontweight='bold')

        plt.tight_layout()

        # 保存图像
        img_filename = f'D:/code/junma/600000/0424/{name.lower()}_feature_importance.png'
        plt.savefig(img_filename, dpi=300, bbox_inches='tight')
        print(f"特征重要性图已保存到: {img_filename}")
        plt.show()

# Compare feature importances across all models that expose them
print("\n--- 所有模型特征重要性比较 ---")

# Collect, per model, the set of its 10 highest-ranked features
common_important_features = {}
for i, name in enumerate(model_names):
    if feature_importances[i] is not None:
        importance_df = pd.DataFrame({
            'Feature': X_train.columns,
            'Importance': feature_importances[i]
        }).sort_values('Importance', ascending=False)

        # Top-10 features of this model
        top_features = importance_df.head(10)['Feature'].tolist()
        common_important_features[name] = set(top_features)

        print(f"{name} 前10重要特征: {top_features}")

# Features that appear in every model's top 10 (only meaningful with 2+ models)
if len(common_important_features) > 1:
    common_features = set.intersection(*common_important_features.values())
    print(f"\n所有模型共同的重要特征: {list(common_features)}")

# Heatmap comparing importances across models (needs at least two models
# with importances available)
models_with_importance = [name for i, name in enumerate(model_names)
                         if feature_importances[i] is not None]

if len(models_with_importance) >= 2:
    print(f"\n创建特征重要性比较热图...")

    # Rows of the heatmap: the top 15 features according to the FIRST model
    # that has importances; other models are shown for those same features.
    first_model_idx = model_names.index(models_with_importance[0])
    top_features_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': feature_importances[first_model_idx]
    }).sort_values('Importance', ascending=False).head(15)
    top_features = top_features_df['Feature'].tolist()

    # Build one row per feature with one column per model
    comparison_data = []
    for feature in top_features:
        row = {'Feature': feature}
        for model_name in models_with_importance:
            model_idx = model_names.index(model_name)
            # position of the feature in X_train.columns == position in the importance vector
            feature_idx = list(X_train.columns).index(feature)
            importance_value = feature_importances[model_idx][feature_idx]
            row[model_name] = importance_value
        comparison_data.append(row)

    comparison_df = pd.DataFrame(comparison_data)
    comparison_df = comparison_df.set_index('Feature')

    # Draw the heatmap.
    # NOTE(review): raw importances are plotted on one colour scale; they are
    # not normalised per model here.
    plt.figure(figsize=(12, 10))
    sns.heatmap(comparison_df, annot=True, cmap='YlOrRd', fmt='.4f',
                linewidths=1, linecolor='black', cbar_kws={'label': '特征重要性'})
    plt.title('不同模型间特征重要性比较', fontweight='bold', fontsize=16, pad=20)
    plt.xlabel('模型', fontweight='bold', fontsize=12)
    plt.ylabel('特征', fontweight='bold', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Save the heatmap image
    heatmap_filename = 'D:/code/junma/600000/0424/feature_importance_comparison_heatmap.png'
    plt.savefig(heatmap_filename, dpi=300, bbox_inches='tight')
    print(f"特征重要性比较热图已保存到: {heatmap_filename}")
    plt.show()

    # Save the comparison table (index 'Feature' is written as the first column)
    comparison_df.to_csv('D:/code/junma/600000/0424/feature_importance_comparison.csv')
    print("特征重要性比较数据已保存到CSV文件")

print("\n所有模型训练和特征重要性分析完成！")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score, roc_auc_score, confusion_matrix)
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump

# Parallel lists: entry k of every list belongs to the k-th trained model
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
roc_auc_scores = []
execution_times = []
model_names = []
best_params_list = []
feature_importances = []
confusion_matrices = []  # one confusion matrix per model

# Classifiers to compare, as (name, estimator) pairs
models = [
    ('RF', RandomForestClassifier()),
    ('XGBoost', XGBClassifier()),
    ('LightGBM', LGBMClassifier())
]

# Hyper-parameter grids keyed by model name.
# The 'RF' grid holds a single value per parameter, so its grid search only
# cross-validates that one configuration.
param_grids = {
    'RF': {'n_estimators': [10], 'max_depth': [20], 'class_weight': ['balanced']},
    'XGBoost': {'n_estimators': [10, 30, 50, 70, 100],
                'learning_rate': [0.005, 0.01, 0.05, 0.1],
                'scale_pos_weight': [1, 10, 100]},
    'LightGBM': {'n_estimators': [10, 30, 50, 70, 100],
                 'learning_rate': [0.005, 0.01, 0.05, 0.1],
                 'is_unbalance': [True, False]}
}

# Main loop: tune (if a grid exists), evaluate on the test split, record metrics
for name, classifier in models:
    start_time = time.time()

    print(f"\n=== Training {name} ===")

    # Hyper-parameter tuning: 5-fold grid search scored by ROC-AUC on the training data
    if param_grids.get(name):
        grid_search = GridSearchCV(classifier, param_grid=param_grids[name],
                                  cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        print(f"Best parameters: {best_params}")
    else:
        # No grid for this model: fit it with its constructor defaults
        best_model = classifier
        best_model.fit(X_train, y_train)
        best_params = {}

    # Predictions on the test split: hard labels and the second probability column
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    # Classification metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    confusion_matrices.append(cm)

    # Store results; the elapsed time covers tuning, fitting, prediction and scoring
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    roc_auc_scores.append(roc_auc)
    execution_times.append(time.time() - start_time)
    model_names.append(name)
    best_params_list.append(best_params)

    # Feature importances: tree-style attribute first, then the first row of
    # linear coefficients, else None so the list stays aligned with model_names
    if hasattr(best_model, 'feature_importances_'):
        importances = best_model.feature_importances_
        feature_importances.append(importances)
    elif hasattr(best_model, 'coef_'):
        importances = best_model.coef_[0]
        feature_importances.append(importances)
    else:
        feature_importances.append(None)

    # Persist the fitted model to the working directory, e.g. 'rf_model.joblib'
    dump(best_model, f'{name.lower()}_model.joblib')

# Collect the per-model metrics gathered by the training loop into one table
results_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'Recall': recall_scores,
    'F1 Score': f1_scores,
    'ROC-AUC': roc_auc_scores,
    'Execution Time (s)': execution_times,
    'Best Parameters': best_params_list
})

# Show the comparison table
print("\n=== Model Performance Comparison ===")
display(results_df)

# Confusion matrices: one panel per evaluated model. The grid width follows
# the number of matrices instead of assuming exactly three models, so adding
# or removing an entry in `models` no longer breaks plt.subplot.
print("\n=== Confusion Matrices ===")
n_matrices = len(confusion_matrices)
if n_matrices:
    plt.figure(figsize=(5 * n_matrices, 5))
    for i, (name, cm) in enumerate(zip(model_names, confusion_matrices)):
        plt.subplot(1, n_matrices, i + 1)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['Predicted 0', 'Predicted 1'],
                    yticklabels=['Actual 0', 'Actual 1'])
        plt.title(f'{name} Confusion Matrix')
        plt.xlabel('Predicted label')
        plt.ylabel('True label')
    plt.tight_layout()
    plt.show()

# Feature importances for every model that exposes them
print("\n=== Feature Importances ===")
for i, name in enumerate(model_names):
    if feature_importances[i] is not None:
        print(f"\nModel: {name}")
        importance_df = pd.DataFrame({
            'Feature': X_train.columns,
            'Importance': feature_importances[i]
        }).sort_values('Importance', ascending=False)

        # Plot the 20 most important features. `hue` is tied to the
        # categorical axis (legend suppressed) because passing `palette`
        # without `hue` is deprecated in seaborn; this matches the barplot
        # call used in the cross-validation cell of this notebook.
        plt.figure(figsize=(10, 6))
        sns.barplot(x='Importance', y='Feature',
                    data=importance_df.head(20), palette='viridis',
                    hue='Feature', legend=False)
        plt.title(f'{name} - Top 20 Important Features')
        plt.show()

        display(importance_df.head(20))

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import make_scorer, mean_squared_error

# Seed NumPy's global generator for reproducibility
np.random.seed(42)

# 1. Cross-validation setup: shuffled K-fold with a fixed seed
n_folds = 20
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# 2. Storage: one row of importances per fold, one column per feature
feature_importances = np.zeros((n_folds, X_train.shape[1]))
fold_scores = []

# Explicit random-forest settings; identical for every fold, so defined once
rf_params = dict(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)

# 3. Fit on each training split and score on its held-out split
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Processing fold {fold + 1}/{n_folds}")

    # Positional split of features and target
    X_tr = X_train.iloc[train_idx]
    X_val = X_train.iloc[val_idx]
    y_tr = y_train.iloc[train_idx]
    y_val = y_train.iloc[val_idx]

    rf = RandomForestRegressor(**rf_params)
    rf.fit(X_tr, y_tr)

    # Importance vector of this fold
    feature_importances[fold] = rf.feature_importances_

    # Held-out RMSE of this fold
    val_pred = rf.predict(X_val)
    fold_score = np.sqrt(mean_squared_error(y_val, val_pred))
    fold_scores.append(fold_score)
    print(f"Fold {fold + 1} RMSE: {fold_score:.4f}")

# 4. Summary statistics across folds
mean_importance = np.mean(feature_importances, axis=0)
std_importance = np.std(feature_importances, axis=0)
cv_score = np.mean(fold_scores)
cv_std = np.std(fold_scores)

print(f"\nAverage CV RMSE: {cv_score:.4f} (±{cv_std:.4f})")

# 5. Build the feature-importance table.
# 'CV Rank' is the rank OF each feature (1 = highest mean importance).
# np.argsort returns, for each rank position, WHICH feature sits there, so
# it has to be inverted before it can be stored next to the features;
# using it directly attached unrelated numbers to each row.
n_features = len(mean_importance)
cv_rank = np.empty(n_features, dtype=int)
cv_rank[np.argsort(mean_importance)[::-1]] = np.arange(1, n_features + 1)

importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Mean Importance': mean_importance,
    'Std Importance': std_importance,
    'CV Rank': cv_rank
}).sort_values('Mean Importance', ascending=False)

# 6. Bar chart of the top-20 mean importances with standard-deviation error bars
plt.figure(figsize=(12, 8))
top_features = importance_df.head(20)
sns.barplot(x='Mean Importance', y='Feature',
            data=top_features,
            palette='viridis',
            hue='Feature',  # tie hue to the categorical axis to avoid the palette warning
            legend=False)   # no legend needed, the y axis already names the features
# Add the error bars manually; bar i of the plot corresponds to row i of top_features
for i, (_, row) in enumerate(top_features.iterrows()):
    plt.errorbar(x=row['Mean Importance'], y=i,
                 xerr=row['Std Importance'],
                 color='black', capsize=3)
plt.title(f'Top 20 Feature Importances with {n_folds}-Fold CV Standard Deviation\n(Random Forest)')
plt.xlabel('Mean Importance ± Std Dev')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# 7. Heatmap of the importances for every fold (features as rows, folds as columns)
plt.figure(figsize=(14, 20))
sns.heatmap(feature_importances.T,
            cmap='viridis',
            yticklabels=X_train.columns,
            xticklabels=[f'Fold {i+1}' for i in range(n_folds)])
plt.title('Feature Importances Across All CV Folds')
plt.xlabel('CV Fold')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# 8. Stability report
print("\n=== Feature Importance Stability Analysis ===")
print("Top 20 Most Important Features:")
display(importance_df.head(20))

# Features whose fold-to-fold standard deviation exceeds their mean importance
print("\nFeatures with Highest Variability (Std Dev > Mean):")
high_var_features = importance_df[importance_df['Std Importance'] > importance_df['Mean Importance']]
if not high_var_features.empty:
    display(high_var_features)
else:
    print("No features with std > mean importance")

# 9. Save the full table to the working directory
importance_df.to_csv('random_forest_feature_importance_cv_results.csv', index=False)

# 10. Distribution of the per-fold RMSE values
plt.figure(figsize=(20, 6))
sns.boxplot(x=fold_scores)
plt.title(f'RMSE Distribution Across {n_folds}-Fold CV\nMean: {cv_score:.4f} (±{cv_std:.4f})')
plt.xlabel('RMSE')
plt.show()

In [None]:

# 5. Analysis and Visualization ==========================================
# NOTE(review): experiment_df, fixed_results, fixed_combinations, param_ranges,
# simulate_breakage_rate, saltelli and sobol are not defined or imported in
# this cell; presumably they come from earlier cells (saltelli/sobol look like
# SALib modules) - confirm before running this cell on its own.
print("\nLHS Experiment Results:")
print(experiment_df.head())

# Basic statistics
print("\nBreakage Rate Statistics:")
print(experiment_df['breakage_rate'].describe())

# Plotting
plt.figure(figsize=(10, 6))
plt.hist(experiment_df['breakage_rate'], bins=20, edgecolor='k')
plt.title('Distribution of Predicted Breakage Rates')
plt.xlabel('Breakage Rate')
plt.ylabel('Frequency')
plt.show()

# Pairplot for selected parameters
plot_params = ['V_dw1', 'P_dw1', 'R_dw21', 'breakage_rate']
sns.pairplot(experiment_df[plot_params], diag_kind='kde')
plt.suptitle('Parameter Relationships with Breakage Rate', y=1.02)
plt.show()


# 4. Pareto-front analysis ================================================
# NOTE(review): no Pareto front is computed below; the code locates the single
# minimum-breakage row and draws a scatter plot around it.

# Find optimal solution (minimum breakage rate)
optimal_idx = experiment_df['breakage_rate'].idxmin()
optimal_params = experiment_df.loc[optimal_idx].to_dict()
# pop() removes the objective so optimal_params holds only the remaining columns
optimal_rate = optimal_params.pop('breakage_rate')

# Calculate gaps between fixed combinations and optimal solution
for i, result in enumerate(fixed_results):
    gap = result['breakage_rate'] - optimal_rate
    print(f"组合{i+1}与最优解的断纱率差距: {gap:.4f}")

# Plot Pareto front (visualizing two parameters)
plt.figure(figsize=(10, 6))
sns.scatterplot(data=experiment_df, x='R_dw22', y='breakage_rate', hue='R_dw21')
# Overlay each fixed combination as a star and the best sample as a red X
for i, combo in enumerate(fixed_combinations):
    plt.scatter(combo['R_dw22'], fixed_results[i]['breakage_rate'],
               s=200, marker='*', label=f'固定组合{i+1}')
plt.scatter(optimal_params['R_dw22'], optimal_rate,
           s=200, marker='X', c='red', label='最优解')
plt.title('帕累托前沿分析')
plt.xlabel('锭速(rpm)')
plt.ylabel('断纱率')
plt.legend()
plt.show()

# 5. Sobol sensitivity analysis ===========================================

# Define Sobol analysis problem
problem = {
    'num_vars': len(param_ranges),
    'names': list(param_ranges.keys()),
    'bounds': [param_ranges[name] for name in param_ranges.keys()]
}

# Generate samples
param_values = saltelli.sample(problem, 512)

# Run model to get outputs (one simulate_breakage_rate call per sampled row)
Y = np.array([simulate_breakage_rate(dict(zip(problem['names'], values)))
             for values in param_values])

# Perform Sobol analysis
Si = sobol.analyze(problem, Y)

# Visualize sensitivity results (the 'ST' entry of the analysis result)
plt.figure(figsize=(10, 6))
sns.barplot(x=Si['ST'], y=problem['names'])
plt.title('Sobol总效应指数（参数敏感性）')
plt.xlabel('敏感性指数')
plt.ylabel('参数')
plt.show()

# 6. Robustness assessment (Monte Carlo simulation) ========================

# The three sampled combinations with the lowest breakage rate
top3_combinations = experiment_df.nsmallest(3, 'breakage_rate')

# Noise-injection helper
def add_noise(value, param_name, noise_level=0.05):
    """Return *value* perturbed by Gaussian noise and clamped to its range.

    The noise has zero mean and a standard deviation of ``noise_level`` times
    the width of ``param_ranges[param_name]`` (a module-level mapping of
    parameter name to ``[lower, upper]``). The perturbed value is clamped so
    it never leaves ``[lower, upper]``.
    """
    bounds = param_ranges[param_name]
    lower = bounds[0]
    upper = bounds[1]
    sigma = noise_level * (upper - lower)
    perturbed = value + np.random.normal(0, sigma)
    return max(lower, min(upper, perturbed))

# Monte Carlo simulation for TOP3 combinations
n_simulations = 1000
# One list of simulated breakage rates per combination, keyed 0..2
robustness_results = {i: [] for i in range(3)}

for i, (_, combo) in enumerate(top3_combinations.iterrows()):
    for _ in range(n_simulations):
        # Perturb every parameter that has a declared range, then re-evaluate
        noisy_params = {param: add_noise(combo[param], param)
                       for param in param_ranges.keys()}
        rate = simulate_breakage_rate(noisy_params)
        robustness_results[i].append(rate)

# Visualize robustness results: one density curve per combination
plt.figure(figsize=(10, 6))
for i in range(3):
    sns.kdeplot(robustness_results[i], label=f'TOP{i+1}组合')
plt.title('蒙特卡洛稳健性评估（5%噪声）')
plt.xlabel('断纱率')
plt.ylabel('概率密度')
plt.legend()
plt.show()

# 7. Validation report ======================================================

print("\n=== 验证报告 ===")
print(f"\n最优参数组合: {optimal_params}")
print(f"最优断纱率: {optimal_rate:.4f}")

# Each fixed combination with its gap to the best sampled breakage rate
print("\n固定组合验证结果:")
for i, result in enumerate(fixed_results):
    print(f"组合{i+1}: {result['params']}")
    print(f"断纱率: {result['breakage_rate']:.4f} (与最优解差距: {result['breakage_rate']-optimal_rate:.4f})")

print("\nSobol敏感性分析结果:")
for name, st in zip(problem['names'], Si['ST']):
    print(f"{name}: {st:.4f}")

# Mean and standard deviation of the simulated rates per combination
print("\nTOP3组合稳健性评估:")
for i in range(3):
    mean_rate = np.mean(robustness_results[i])
    std_rate = np.std(robustness_results[i])
    print(f"TOP{i+1}组合 - 平均断纱率: {mean_rate:.4f}, 标准差: {std_rate:.4f}")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from joblib import dump, load
import time

plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font so Chinese labels render
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly with that font

# Load the persisted random-forest model from the working directory
best_rf_model = load('rf_model.joblib')

# Pair the model's importances with the training column names, highest first
feature_importances = best_rf_model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
}).sort_values('Importance', ascending=False)

# Print the importance table
print("\n=== Feature Importances for Random Forest ===")
display(feature_importance_df)

# Parameter ranges as [lower, upper]; adjust to the real process limits.
# NOTE(review): the meanings in the trailing comments (speed / fullness /
# radius) are the author's labels and are not verifiable from this cell.
param_ranges = {
    'V_dw1': [1000, 2000],  # speed 1 range
    'P_dw1': [0, 100],      # fullness 1 range
    'R_dw21': [100, 500],   # radius 1 range
    # add further parameter ranges here
}

# 5. Sensitivity analysis (based on the random-forest feature importances)
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importance_df['Importance'], y=feature_importance_df['Feature'])
plt.title('随机森林特征重要性（敏感性）')
plt.xlabel('重要性指数')
plt.ylabel('参数')
plt.show()

# 6. Stability assessment (Monte Carlo simulation)
# The three rows of experiment_df with the lowest breakage rate
top3_combinations = experiment_df.nsmallest(3, 'breakage_rate')

# Noise-injection helper
def add_noise(value, param_name, noise_level=0.05):
    """Return *value* perturbed by Gaussian noise and clamped to its range.

    The noise has zero mean and a standard deviation of ``noise_level`` times
    the width of ``param_ranges[param_name]`` (a module-level mapping of
    parameter name to ``[lower, upper]``). The perturbed value is clamped so
    it never leaves ``[lower, upper]``.
    """
    bounds = param_ranges[param_name]
    lower = bounds[0]
    upper = bounds[1]
    sigma = noise_level * (upper - lower)
    perturbed = value + np.random.normal(0, sigma)
    return max(lower, min(upper, perturbed))

# Monte Carlo simulation
n_simulations = 1000
# One list of predicted values per combination, keyed 0..2
robustness_results = {i: [] for i in range(3)}

for i, (_, combo) in enumerate(top3_combinations.iterrows()):
    for _ in range(n_simulations):
        # Perturb only the parameters that have a declared range
        noisy_params = {param: add_noise(combo[param], param)
                        for param in param_ranges}

        # Build the model input in training-column order. Features without an
        # entry in param_ranges keep the combination's own value; indexing
        # noisy_params with every X_train column raised KeyError whenever the
        # model had more features than param_ranges lists.
        input_data = np.array([
            noisy_params[col] if col in noisy_params else combo[col]
            for col in X_train.columns
        ]).reshape(1, -1)

        # Predict for the perturbed combination.
        # NOTE(review): if 'rf_model.joblib' is the file written by the
        # classifier-training cell, predict() returns a class label rather
        # than a rate - confirm which model is loaded.
        rate = best_rf_model.predict(input_data)[0]
        robustness_results[i].append(rate)

# Visualise the robustness results: one density curve per combination
plt.figure(figsize=(10, 6))
for i in range(3):
    sns.kdeplot(robustness_results[i], label=f'TOP{i+1}组合')
plt.title('蒙特卡洛稳健性评估（5%噪声）')
plt.xlabel('断纱率')
plt.ylabel('概率密度')
plt.legend()
plt.show()

# Validation report.
# NOTE(review): optimal_params, optimal_rate and fixed_results are not defined
# in this cell; presumably they are left over from an earlier cell - confirm.
print("\n=== 验证报告 ===")
print(f"\n最优参数组合: {optimal_params}")
print(f"最优断纱率: {optimal_rate:.4f}")

# Each fixed combination with its gap to the best breakage rate
print("\n固定组合验证结果:")
for i, result in enumerate(fixed_results):
    print(f"组合{i+1}: {result['params']}")
    print(f"断纱率: {result['breakage_rate']:.4f} (与最优解差距: {result['breakage_rate']-optimal_rate:.4f})")

# Importances in descending order (the frame was sorted when it was built)
print("\n随机森林敏感性分析结果:")
for name, imp in zip(feature_importance_df['Feature'], feature_importance_df['Importance']):
    print(f"{name}: {imp:.4f}")

# Mean and standard deviation of the simulated values per combination
print("\nTOP3组合稳健性评估:")
for i in range(3):
    mean_rate = np.mean(robustness_results[i])
    std_rate = np.std(robustness_results[i])
    print(f"TOP{i+1}组合 - 平均断纱率: {mean_rate:.4f}, 标准差: {std_rate:.4f}")

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>20 |</span></b> <b>Final Model Predictions and Comparison with True Prices</b></div>

In [None]:
# Bare expression: shows the object as the cell output when run in a notebook.
# NOTE(review): best_global_model is not defined in the visible cells; presumably set earlier - confirm.
best_global_model


In [None]:
# Print the model's hyper-parameters as returned by its get_params() method
print("Best global model hyperparameters:", best_global_model.get_params())

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from IPython.display import display  # 添加display函数

plt.rcParams['font.sans-serif'] = ['SimHei']  # 使用黑体
plt.rcParams['axes.unicode_minus'] = False  # 正确显示负号

# 1. Data preprocessing
def preprocess_data(df, dw_number=1, drop_source_col=True):
    """Build the feature matrix and binary yarn-break target for one spindle.

    The target ``is_break_dw{dw_number}`` is 1 where ``D_dw{dw_number}``
    equals 3 and 0 otherwise.

    Args:
        df: Raw data; must contain the column ``D_dw{dw_number}``. The frame
            passed in is not modified.
        dw_number: Spindle number used to build the column names.
        drop_source_col: When True (default) the ``D_dw{dw_number}`` column is
            removed from the features. The target is computed directly from
            it, so leaving it in lets a model read the label off the features.
            Pass False to keep the column.

    Returns:
        ``(X, y, target_col)``: feature frame, target series and target name.
    """
    target_col = f'is_break_dw{dw_number}'
    source_col = f'D_dw{dw_number}'

    # Vectorised break flag (rows that are missing or different from 3 become 0)
    is_break = (df[source_col] == 3).astype('int64')

    # Drop text / datetime columns; drop() returns a new frame, so adding the
    # target below leaves the caller's frame untouched
    non_numeric_cols = df.select_dtypes(include=['object', 'datetime']).columns
    df_clean = df.drop(columns=non_numeric_cols)
    df_clean[target_col] = is_break

    # Remove the label's source before any encoding so no copy of it survives
    if drop_source_col:
        df_clean = df_clean.drop(columns=[source_col], errors='ignore')

    # One-hot encode whatever is still not int / float / bool
    non_numeric = df_clean.select_dtypes(exclude=['int', 'float', 'bool']).columns
    if len(non_numeric) > 0:
        print(f"警告: 仍有非数值列 {list(non_numeric)}，将尝试自动转换")
        df_clean = pd.get_dummies(df_clean, columns=non_numeric)

    # Split features and target
    X = df_clean.drop(columns=[target_col])
    y = df_clean[target_col]

    return X, y, target_col

# 2. Model training
def train_best_model(X, y):
    """Fit an XGBoost classifier on an 80/20 split and print its test report.

    Args:
        X: Feature frame.
        y: Target series.

    Returns:
        The fitted ``XGBClassifier``.
    """
    # Hold out 20% of the rows for the evaluation report (fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    xgb_settings = {
        'n_estimators': 100,
        'max_depth': 5,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'logloss',
    }
    model = XGBClassifier(**xgb_settings)
    model.fit(X_train, y_train)

    # Evaluate on the held-out rows
    y_pred = model.predict(X_test)
    print("\n模型评估报告:")
    print(classification_report(y_test, y_pred))

    return model

# 3. Spindle-speed deviation threshold analysis
def analyze_speed_threshold(model, data, speed_col, reference_col, target_col):
    """Sweep the speed deviation and record the mean predicted break probability.

    For every deviation from -30% to +30% (0.5% steps) ``speed_col`` is set to
    ``reference_col * (1 + deviation/100)`` on every row, and the mean of the
    model's second probability column is recorded.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        data: Frame holding the model features plus ``target_col``.
        speed_col: Column that is overwritten for each scenario.
        reference_col: Column providing the reference speed.
        target_col: Target column, removed before predicting.

    Returns:
        ``(results_df, threshold)``: the full sweep, and the rows whose
        absolute step-to-step change exceeds the sweep's 90th percentile.
    """
    speed_changes = np.arange(-30, 30.5, 0.5)  # -30% .. +30%

    # Keep every numeric/boolean column. 'number' also covers integer widths
    # such as uint8 dummy columns, which the narrower ['int', 'float', 'bool']
    # filter would silently drop and so desynchronise from the trained
    # features. The target is removed once, outside the loop.
    features = data.select_dtypes(include=['number', 'bool']).drop(columns=[target_col])
    reference = features[reference_col]

    results = []
    for change in tqdm(speed_changes, desc="分析速度偏离"):
        modified_data = features.copy()
        modified_data[speed_col] = reference * (1 + change/100)

        # Mean predicted break probability over all rows
        proba = model.predict_proba(modified_data)[:, 1]
        breakage_rate = np.mean(proba)

        results.append({
            '速度变化(%)': change,
            '断纱率(%)': breakage_rate * 100
        })

    results_df = pd.DataFrame(results)

    # Flag steps where the break rate jumps (absolute first difference above
    # its own 90th percentile)
    results_df['斜率'] = results_df['断纱率(%)'].diff().abs()
    threshold = results_df[results_df['斜率'] > results_df['斜率'].quantile(0.9)]

    # Plot the sweep and mark the flagged deviations
    plt.figure(figsize=(10, 5))
    sns.lineplot(data=results_df, x='速度变化(%)', y='断纱率(%)')
    for t in threshold['速度变化(%)']:
        plt.axvline(x=t, color='red', linestyle='--', alpha=0.3)
    plt.title('锭速偏离对断纱率的影响')
    plt.show()

    return results_df, threshold

# 4. Fullness threshold analysis
def analyze_fullness_threshold(model, data, fullness_col, target_col):
    """Sweep the fullness level and record the mean predicted break probability.

    For every level from 0 to 100 (step 1) ``fullness_col`` is set to that
    constant on every row, and the mean of the model's second probability
    column is recorded.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        data: Frame holding the model features plus ``target_col``.
        fullness_col: Column that is overwritten for each scenario.
        target_col: Target column, removed before predicting.

    Returns:
        ``(results_df, threshold)``: the full sweep, and the rows whose
        absolute step-to-step change exceeds the sweep's 90th percentile.
    """
    fullness_levels = np.arange(0, 101, 1)  # 0 .. 100

    # Keep every numeric/boolean column. 'number' also covers integer widths
    # such as uint8 dummy columns, which the narrower ['int', 'float', 'bool']
    # filter would silently drop and so desynchronise from the trained
    # features. The target is removed once, outside the loop.
    features = data.select_dtypes(include=['number', 'bool']).drop(columns=[target_col])

    results = []
    for level in tqdm(fullness_levels, desc="分析满卷率"):
        modified_data = features.copy()
        modified_data[fullness_col] = level

        # Mean predicted break probability over all rows
        proba = model.predict_proba(modified_data)[:, 1]
        breakage_rate = np.mean(proba)

        results.append({
            '满卷率(%)': level,
            '断纱率(%)': breakage_rate * 100
        })

    results_df = pd.DataFrame(results)

    # Flag steps where the break rate jumps (absolute first difference above
    # its own 90th percentile)
    results_df['斜率'] = results_df['断纱率(%)'].diff().abs()
    threshold = results_df[results_df['斜率'] > results_df['斜率'].quantile(0.9)]

    # Plot the sweep and mark the flagged levels
    plt.figure(figsize=(10, 5))
    sns.lineplot(data=results_df, x='满卷率(%)', y='断纱率(%)')
    for t in threshold['满卷率(%)']:
        plt.axvline(x=t, color='red', linestyle='--', alpha=0.3)
    plt.title('满卷率对断纱率的影响')
    plt.show()

    return results_df, threshold

# 5. Best speed deviations per fullness level
def analyze_optimal_speed_range(model, data, speed_col, reference_col, fullness_col, target_col):
    """Grid-sweep fullness and speed deviation and pick the best deviations.

    Every combination of fullness level (10, 20, ..., 100) and speed deviation
    (-10% .. +10%, 0.5% steps) is applied to all rows and the mean of the
    model's second probability column is recorded.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        data: Frame holding the model features plus ``target_col``.
        speed_col: Column overwritten with ``reference_col * (1 + deviation/100)``.
        reference_col: Column providing the reference speed.
        fullness_col: Column overwritten with the fullness level.
        target_col: Target column, removed before predicting.

    Returns:
        ``(results_df, optimal_ranges)``: the full grid, and for each fullness
        level its three rows with the lowest predicted break rate.
    """
    speed_changes = np.arange(-10, 10.5, 0.5)  # -10% .. +10%
    fullness_levels = np.arange(10, 101, 10)   # 10, 20, ..., 100

    # Keep every numeric/boolean column. 'number' also covers integer widths
    # such as uint8 dummy columns, which the narrower ['int', 'float', 'bool']
    # filter would silently drop and so desynchronise from the trained
    # features. The target is removed once, outside the loops.
    features = data.select_dtypes(include=['number', 'bool']).drop(columns=[target_col])
    reference = features[reference_col]

    results = []
    for level in tqdm(fullness_levels, desc="分析满卷率"):
        for change in speed_changes:
            modified_data = features.copy()
            modified_data[speed_col] = reference * (1 + change/100)
            modified_data[fullness_col] = level

            # Mean predicted break probability over all rows
            proba = model.predict_proba(modified_data)[:, 1]
            breakage_rate = np.mean(proba)

            results.append({
                '满卷率(%)': level,
                '速度变化(%)': change,
                '断纱率(%)': breakage_rate * 100
            })

    results_df = pd.DataFrame(results)

    # Three lowest break rates per fullness level. Sorting and taking the
    # group-wise head keeps the fullness column in the result regardless of
    # how groupby.apply treats grouping columns (apply + reset_index(drop=True)
    # loses it once apply excludes them). The sort is stable, so ties keep
    # their original order, like nsmallest.
    optimal_ranges = (
        results_df
        .sort_values(['满卷率(%)', '断纱率(%)'])
        .groupby('满卷率(%)')
        .head(3)
        .reset_index(drop=True)
    )

    # One line per fullness level, reference speed marked at 0%
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=results_df, x='速度变化(%)', y='断纱率(%)', hue='满卷率(%)')
    plt.title('不同满卷率下速度偏离对断纱率的影响')
    plt.axvline(x=0, color='red', linestyle='--', label='标准速度')
    plt.legend(title='满卷率(%)')
    plt.show()

    return results_df, optimal_ranges

# Entry point: preprocess, train, run the three sweeps, save the tables
if __name__ == "__main__":
    # Load the data (replace with the actual data file)
    df = pd.read_csv('D:/code/junma/600000/0424/2.csv')

    # Configuration (adjust to the real column names)
    dw_number = 1             # spindle number
    speed_col = 'V_dw1'       # actual speed column
    reference_col = 'R_dw22'  # reference speed column
    fullness_col = 'P_dw1'    # fullness column

    # 1. Preprocessing
    print("正在进行数据预处理...")
    X, y, target_col = preprocess_data(df, dw_number=dw_number)

    # 2. Training
    print("\n训练模型中...")
    best_model = train_best_model(X, y)

    # Features and target side by side; every analysis receives the same frame
    analysis_frame = X.join(y)

    # 3. Speed-deviation threshold
    print("\n分析锭速偏离阈值...")
    speed_results, speed_threshold = analyze_speed_threshold(
        best_model, analysis_frame, speed_col, reference_col, target_col
    )
    print("\n锭速偏离安全阈值:")
    display(speed_threshold)

    # 4. Fullness threshold
    print("\n分析满卷率安全阈值...")
    fullness_results, fullness_threshold = analyze_fullness_threshold(
        best_model, analysis_frame, fullness_col, target_col
    )
    print("\n满卷率安全阈值:")
    display(fullness_threshold)

    # 5. Best speed range per fullness level
    print("\n分析不同满卷率下的最佳锭速范围...")
    range_results, optimal_ranges = analyze_optimal_speed_range(
        best_model, analysis_frame, speed_col, reference_col, fullness_col, target_col
    )
    print("\n各满卷率下的最佳速度范围:")
    display(optimal_ranges)

    # Save all result tables to the working directory
    for table, csv_path in (
        (speed_results, 'speed_deviation_results.csv'),
        (fullness_results, 'fullness_threshold_results.csv'),
        (range_results, 'optimal_speed_ranges.csv'),
        (optimal_ranges, 'recommended_speed_ranges.csv'),
    ):
        table.to_csv(csv_path, index=False)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.linear_model import LogisticRegression

# 设置中文字体和负号显示
plt.rcParams['font.sans-serif'] = ['SimHei']  # 使用黑体
plt.rcParams['axes.unicode_minus'] = False  # 正确显示负号

# 1. Data preprocessing
def preprocess_data(df, dw_number=1):
    """Add break flag, speed deviation, bins and equipment columns to *df*.

    Columns are added to the frame that is passed in, and the same frame is
    returned.

    Args:
        df: Raw data with ``D_dw{n}``, ``V_dw{n}``, ``P_dw{n}``, ``R_dw22``,
            ``name`` and ``subsystem`` columns.
        dw_number: Spindle number ``n`` used to build the column names.

    Returns:
        ``(df, full_bins, speed_bins)``: the augmented frame and the bin edges
        of the fullness and speed-deviation binning.
    """
    # Break flag: 1 where the D_dw column equals 3, else 0
    target_col = f'is_break_dw{dw_number}'
    df[target_col] = (df[f'D_dw{dw_number}'] == 3).astype('int64')

    # Speed deviation: percentage difference between the spindle speed and
    # the median of the R_dw22 column
    speed_col = f'V_dw{dw_number}'
    reference_speed = df['R_dw22'].median()
    df['速度偏移率(%)'] = ((df[speed_col] - reference_speed) / reference_speed) * 100

    # Cut fullness and speed deviation into 10 equal-width bins, keeping the edges
    df['满卷率分箱'], full_bins = pd.cut(df[f'P_dw{dw_number}'], bins=10, retbins=True)
    df['速度偏移率分箱'], speed_bins = pd.cut(df['速度偏移率(%)'], bins=10, retbins=True)

    # Equipment name = text before the first '-' of `name`; equipment side =
    # second '-'-separated part of `subsystem`. The vectorised string accessor
    # yields a missing value for missing entries or entries without the
    # expected part, instead of raising like a per-row split()[i].
    df['设备名称'] = df['name'].str.split('-').str[0]
    df['设备面'] = df['subsystem'].str.split('-').str[1]

    return df, full_bins, speed_bins

# 2. Single-factor analysis
def single_factor_analysis(df, factor_col, target_col, bins=None, factor_name=None):
    """Plot and return the break rate for each bin of one binned factor.

    Args:
        df: Frame with the interval-valued ``factor_col`` (e.g. produced by
            ``pd.cut``) and the 0/1 ``target_col``.
        factor_col: Name of the binned factor column.
        target_col: Name of the break-flag column.
        bins: Unused; kept so existing calls remain valid.
        factor_name: Display name for the title and x label (defaults to
            ``factor_col``).

    Returns:
        Frame with one row per bin: the bin, its break rate, its sample count
        and a formatted range label.
    """
    # Break rate and sample count per bin. observed=False is stated explicitly
    # so empty bins stay in the table on every pandas version (the implicit
    # default is deprecated and differs between releases).
    factor_break_rate = (
        df.groupby(factor_col, observed=False)[target_col]
        .agg(['mean', 'count'])
        .reset_index()
    )
    factor_break_rate.rename(columns={'mean': '断纱率', 'count': '样本数'}, inplace=True)

    # Human-readable range label for each bin
    factor_break_rate['区间范围'] = factor_break_rate[factor_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}")

    # Bar chart; hue is tied to the x variable (legend off) because passing
    # `palette` without `hue` is deprecated in seaborn
    plt.figure(figsize=(12, 6))
    ax = sns.barplot(data=factor_break_rate, x='区间范围', y='断纱率',
                     hue='区间范围', palette='viridis', legend=False)

    # Sample-count label above each bar; empty bins have no rate, so no label
    rates = factor_break_rate['断纱率']
    counts = factor_break_rate['样本数']
    for pos, (rate, count) in enumerate(zip(rates, counts)):
        if pd.notna(rate):
            ax.text(pos, rate+0.01, f"n={count}", ha='center', fontsize=9)

    # Mark the bin with the highest break rate (skipped when no bin has data)
    if rates.notna().any():
        max_idx = rates.idxmax()
        max_row = factor_break_rate.loc[max_idx]
        plt.axvline(x=max_idx, color='red', linestyle='--', alpha=0.5)
        plt.text(max_idx, max_row['断纱率']+0.05,
                 f"最高断纱率: {max_row['断纱率']:.2%}\n区间: {max_row['区间范围']}",
                 ha='center', color='red')

    plt.title(f'{factor_name or factor_col} 对断纱率的影响')
    plt.xlabel(factor_name or factor_col)
    plt.ylabel('断纱率')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    return factor_break_rate

# 3. Two-factor interaction analysis
def interaction_analysis(df, factor1_col, factor2_col, target_col, factor1_name=None, factor2_name=None):
    """Heatmap of the break rate per pair of bins, plus a chi-square test.

    Args:
        df: Frame with two interval-valued factor columns (e.g. produced by
            ``pd.cut``) and the 0/1 ``target_col``.
        factor1_col: Binned factor shown on the heatmap rows.
        factor2_col: Binned factor shown on the heatmap columns.
        target_col: Name of the break-flag column.
        factor1_name: Display name of factor 1 (defaults to the column name).
        factor2_name: Display name of factor 2 (defaults to the column name).

    Returns:
        ``(interaction_break_rate, p)``: one row per bin pair with break rate,
        sample count and range labels, and the chi-square p-value (``None``
        when the test could not be run).
    """
    def _range_label(interval):
        # "left-right" with two decimals, used for every label below
        return f"{interval.left:.2f}-{interval.right:.2f}"

    # Break rate and sample count per bin pair. observed=False is stated
    # explicitly so unobserved pairs stay in the table on every pandas version
    # (the implicit default is deprecated and differs between releases).
    interaction_break_rate = (
        df.groupby([factor1_col, factor2_col], observed=False)[target_col]
        .agg(['mean', 'count'])
        .reset_index()
    )
    interaction_break_rate.rename(columns={'mean': '断纱率', 'count': '样本数'}, inplace=True)

    # Range labels for both factors
    interaction_break_rate['factor1_range'] = interaction_break_rate[factor1_col].apply(_range_label)
    interaction_break_rate['factor2_range'] = interaction_break_rate[factor2_col].apply(_range_label)

    # Heatmap of the break rate
    pivot_table = interaction_break_rate.pivot(index='factor1_range', columns='factor2_range', values='断纱率')
    plt.figure(figsize=(12, 8))
    ax = sns.heatmap(pivot_table, annot=True, fmt=".2%", cmap="YlGnBu",
                     annot_kws={"size": 9}, cbar_kws={'label': '断纱率'})

    # Report the bin pair with the highest break rate in the title
    max_idx = interaction_break_rate['断纱率'].idxmax()
    max_row = interaction_break_rate.loc[max_idx]
    plt.title(f'{factor1_name or factor1_col} × {factor2_name or factor2_col} 对断纱率的影响\n'
              f"断纱率最高的组合: {max_row['factor1_range']} × {max_row['factor2_range']} = {max_row['断纱率']:.2%}",
              pad=20)

    plt.xlabel(factor2_name or factor2_col)
    plt.ylabel(factor1_name or factor1_col)
    plt.tight_layout()
    plt.show()

    # Chi-square test on the table of break COUNTS per bin pair (sum of the
    # 0/1 flag), built from the raw rows rather than from the averages.
    # NOTE(review): the table holds break counts only; rows without a break
    # do not enter the test - confirm this is the intended hypothesis.
    contingency_table = pd.crosstab(
        df[factor1_col].apply(_range_label),
        df[factor2_col].apply(_range_label),
        values=df[target_col],
        aggfunc='sum'
    )

    # Zero cells make the chi-square approximation unreliable; warn and add
    # 0.5 to every cell as a continuity-style correction
    if (contingency_table == 0).any().any():
        print("警告: 交叉表中存在零频数，卡方检验可能不准确")
        contingency_table = contingency_table + 0.5

    try:
        chi2, p, _, _ = chi2_contingency(contingency_table.fillna(0))
        print(f"卡方检验结果: p值 = {p:.4f}")
    except ValueError as e:
        print(f"无法执行卡方检验: {str(e)}")
        p = None

    return interaction_break_rate, p

# Entry point: preprocess, run single- and two-factor analyses, save the tables
if __name__ == "__main__":
    # Load the data (replace with the actual data path)
    df = pd.read_csv('D:/code/junma/600000/0424/2.csv')

    # Configuration (adjust to the real column names)
    dw_number = 1  # spindle number
    break_flag_col = f'is_break_dw{dw_number}'

    # 1. Preprocessing
    print("正在进行数据预处理...")
    df_processed, full_bins, speed_bins = preprocess_data(df, dw_number=dw_number)

    # Show the bin edges
    print("\n满卷率分箱边界:", full_bins)
    print("速度偏移率分箱边界:", speed_bins)

    # 2. Single-factor analyses
    print("\n单因素分析...")
    fullness_break_rate = single_factor_analysis(
        df_processed, '满卷率分箱', break_flag_col, factor_name='满卷率'
    )
    speed_deviation_break_rate = single_factor_analysis(
        df_processed, '速度偏移率分箱', break_flag_col, factor_name='速度偏移率(%)'
    )

    # 3. Two-factor interaction analysis
    print("\n双因素交互分析...")
    interaction_break_rate, p_value = interaction_analysis(
        df_processed,
        '满卷率分箱',
        '速度偏移率分箱',
        break_flag_col,
        factor1_name='满卷率',
        factor2_name='速度偏移率(%)',
    )

    # Save the result tables to the working directory
    for table, csv_path in (
        (fullness_break_rate, 'fullness_break_rate.csv'),
        (speed_deviation_break_rate, 'speed_deviation_break_rate.csv'),
        (interaction_break_rate, 'interaction_break_rate.csv'),
    ):
        table.to_csv(csv_path, index=False)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.linear_model import LogisticRegression

# Font setup: SimHei so Chinese labels render, and ASCII minus signs on the axes.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# 1. Data Preprocessing
def preprocess_data(df, dw_number=1, *, reference_col='R_dw22', n_bins=10):
    """Add the break flag, speed deviation, bins and equipment columns for one spindle.

    The frame is modified in place (new columns are added) and also returned.

    Args:
        df: Raw records. Must contain ``D_dw{dw_number}``, ``V_dw{dw_number}``,
            ``P_dw{dw_number}``, ``reference_col``, ``name`` and ``subsystem``.
        dw_number: Spindle number used to pick the per-spindle columns.
        reference_col: Column whose median over all rows is the reference speed.
        n_bins: Number of equal-width bins used for both factors.

    Returns:
        ``(df, full_bins, speed_bins)`` where the last two are the bin edges
        returned by ``pd.cut`` for fullness and speed deviation.
    """
    # A status value of 3 is flagged as a break; every other value (NaN included) is 0.
    target_col = f'is_break_dw{dw_number}'
    df[target_col] = (df[f'D_dw{dw_number}'] == 3).astype(int)

    # Percentage deviation of the spindle speed from the reference median.
    speed_col = f'V_dw{dw_number}'
    reference_speed = df[reference_col].median()
    df['Speed_Deviation(%)'] = ((df[speed_col] - reference_speed) / reference_speed) * 100

    # Equal-width binning; retbins=True also hands back the edges.
    df['Fullness_Bin'], full_bins = pd.cut(df[f'P_dw{dw_number}'], bins=n_bins, retbins=True)
    df['Speed_Deviation_Bin'], speed_bins = pd.cut(df['Speed_Deviation(%)'], bins=n_bins, retbins=True)

    # Text before the first '-' of `name` (e.g. NX78) and the second '-'-separated
    # part of `subsystem` (e.g. R103). The vectorized accessor yields NaN for
    # missing values or values without a '-' instead of raising.
    df['Equipment_Name'] = df['name'].str.split('-').str[0]
    df['Equipment_Side'] = df['subsystem'].str.split('-').str[1]

    return df, full_bins, speed_bins

# 2. Single Factor Analysis
def single_factor_analysis(df, factor_col, target_col, bins=None, factor_name=None):
    """Plot and return the break rate per bin of a single factor.

    Args:
        df: Frame holding the binned factor and the target column.
        factor_col: Column of interval bins to group by.
        target_col: Column averaged per bin (expected to be a 0/1 break flag).
        bins: Accepted for interface compatibility; not used.
        factor_name: Display name for the title and x label (defaults to factor_col).

    Returns:
        DataFrame with one row per bin and the columns ``Break_Rate``,
        ``Sample_Count`` and ``Bin_Range``.
    """
    # observed=False keeps empty bins of a categorical factor in the table
    # (their rate is NaN), independent of the pandas default.
    factor_break_rate = df.groupby(factor_col, observed=False)[target_col].agg(['mean', 'count']).reset_index()
    factor_break_rate.rename(columns={'mean': 'Break_Rate', 'count': 'Sample_Count'}, inplace=True)

    # Human-readable "left-right" label for every interval.
    factor_break_rate['Bin_Range'] = factor_break_rate[factor_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}")

    plt.figure(figsize=(12, 6))
    ax = sns.barplot(data=factor_break_rate, x='Bin_Range', y='Break_Rate', palette='viridis')

    # Sample-count annotation above every bar. Empty bins have a NaN rate, so
    # they are annotated at the baseline instead of at a non-finite position.
    for i, row in factor_break_rate.iterrows():
        rate = row['Break_Rate']
        bar_top = 0.0 if pd.isna(rate) else rate
        ax.text(i, bar_top + 0.01, f"n={row['Sample_Count']}", ha='center', fontsize=9)

    # Mark the bin with the highest break rate; skipped when no bin has data.
    if factor_break_rate['Break_Rate'].notna().any():
        max_idx = factor_break_rate['Break_Rate'].idxmax()
        max_row = factor_break_rate.loc[max_idx]
        plt.axvline(x=max_idx, color='red', linestyle='--', alpha=0.5)
        plt.text(max_idx, max_row['Break_Rate']+0.05,
                 f"Highest Break Rate: {max_row['Break_Rate']:.2%}\nRange: {max_row['Bin_Range']}",
                 ha='center', color='red')

    # Bold title and axis labels.
    plt.title(f'Effect of {factor_name or factor_col} on Yarn Break Rate', fontweight='bold', pad=20)
    plt.xlabel(factor_name or factor_col, fontweight='bold', fontsize=12)
    plt.ylabel('Break Rate', fontweight='bold', fontsize=12)

    # Rotated x tick labels so the range strings do not overlap.
    plt.xticks(rotation=45, fontsize=10)
    plt.yticks(fontsize=10)

    # Thicker axes frame.
    for spine in ax.spines.values():
        spine.set_linewidth(1.5)

    plt.tight_layout()
    plt.show()

    return factor_break_rate

# 3. Interaction Analysis
def interaction_analysis(df, factor1_col, factor2_col, target_col, factor1_name=None, factor2_name=None):
    """Plot a break-rate heatmap for two binned factors and chi-square test the break counts.

    Args:
        df: Frame holding both binned factors and the target column.
        factor1_col: Interval-bin column shown on the heatmap rows.
        factor2_col: Interval-bin column shown on the heatmap columns.
        target_col: Column averaged/summed per bin pair (expected to be a 0/1 break flag).
        factor1_name: Display name for factor 1 (defaults to factor1_col).
        factor2_name: Display name for factor 2 (defaults to factor2_col).

    Returns:
        ``(interaction_break_rate, p)``: the per-pair table (``Break_Rate``,
        ``Sample_Count``, ``factor1_range``, ``factor2_range``) and the
        chi-square p-value, or ``None`` when the test could not be run.
    """
    # observed=False keeps empty bin pairs of categorical factors in the table
    # (their rate is NaN), independent of the pandas default.
    interaction_break_rate = df.groupby([factor1_col, factor2_col], observed=False)[target_col].agg(['mean', 'count']).reset_index()
    interaction_break_rate.rename(columns={'mean': 'Break_Rate', 'count': 'Sample_Count'}, inplace=True)

    # Human-readable "left-right" labels for both factors.
    interaction_break_rate['factor1_range'] = interaction_break_rate[factor1_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}")
    interaction_break_rate['factor2_range'] = interaction_break_rate[factor2_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}")

    # Heatmap: factor 1 on the rows, factor 2 on the columns.
    pivot_table = interaction_break_rate.pivot(index='factor1_range', columns='factor2_range', values='Break_Rate')
    plt.figure(figsize=(12, 8))
    ax = sns.heatmap(pivot_table, annot=True, fmt=".2%", cmap="YlGnBu",
                     annot_kws={"size": 9, "weight": 'bold'},
                     cbar_kws={'label': 'Break Rate'})

    # The title names the bin pair with the highest break rate.
    max_idx = interaction_break_rate['Break_Rate'].idxmax()
    max_row = interaction_break_rate.loc[max_idx]
    plt.title(f'Interaction Effect of {factor1_name or factor1_col} × {factor2_name or factor2_col} on Yarn Break Rate\n'
              f"Highest Break Rate: {max_row['factor1_range']} × {max_row['factor2_range']} = {max_row['Break_Rate']:.2%}",
              fontweight='bold', pad=20)

    # Bold axis labels.
    plt.xlabel(factor2_name or factor2_col, fontweight='bold', fontsize=12)
    plt.ylabel(factor1_name or factor1_col, fontweight='bold', fontsize=12)

    # Tick label size and rotation.
    ax.set_xticklabels(ax.get_xticklabels(), fontsize=10, rotation=45)
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=10, rotation=0)

    # Bold colorbar label.
    cbar = ax.collections[0].colorbar
    cbar.ax.set_ylabel('Break Rate', fontweight='bold', fontsize=12)

    plt.tight_layout()
    plt.show()

    # Chi-square test on the summed break flags per bin pair. Pairs without any
    # sample come back as NaN; they are turned into zeros first so that the
    # zero-frequency check below also covers them.
    contingency_table = pd.crosstab(
        df[factor1_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}"),
        df[factor2_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}"),
        values=df[target_col],
        aggfunc='sum'
    ).fillna(0)

    if (contingency_table == 0).any().any():
        print("Warning: Zero frequencies detected in contingency table, chi-square test may be inaccurate")
        # Add 0.5 to every cell so no cell stays at zero.
        contingency_table = contingency_table + 0.5

    try:
        _, p, _, _ = chi2_contingency(contingency_table)
        print(f"Chi-square test result: p-value = {p:.4f}")
    except ValueError as e:
        print(f"Unable to perform chi-square test: {str(e)}")
        p = None

    return interaction_break_rate, p

# Main program
if __name__ == "__main__":
    # Read the raw records; point this path at your own CSV export.
    df = pd.read_csv('D:/code/junma/600000/0424/2.csv')

    # Spindle number; adjust it to match the column names in the data.
    dw_number = 1

    # Step 1: preprocessing.
    print("Performing data preprocessing...")
    df_processed, full_bins, speed_bins = preprocess_data(df, dw_number=dw_number)

    # Show the bin edges returned by the preprocessing step.
    print("\nFullness rate bin boundaries:", full_bins)
    print("Speed deviation rate bin boundaries:", speed_bins)

    # Step 2: one break-rate table and chart per factor.
    print("\nSingle factor analysis...")
    fullness_break_rate = single_factor_analysis(
        df_processed, 'Fullness_Bin', f'is_break_dw{dw_number}', factor_name='Fullness Rate')
    speed_deviation_break_rate = single_factor_analysis(
        df_processed, 'Speed_Deviation_Bin', f'is_break_dw{dw_number}', factor_name='Speed Deviation(%)')

    # Step 3: both factors together.
    print("\nTwo-factor interaction analysis...")
    interaction_break_rate, p_value = interaction_analysis(
        df_processed, 'Fullness_Bin', 'Speed_Deviation_Bin', f'is_break_dw{dw_number}',
        factor1_name='Fullness Rate', factor2_name='Speed Deviation(%)')

    # Persist the three result tables next to the notebook.
    fullness_break_rate.to_csv('fullness_break_rate.csv', index=False)
    speed_deviation_break_rate.to_csv('speed_deviation_break_rate.csv', index=False)
    interaction_break_rate.to_csv('interaction_break_rate.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

# Academic-style matplotlib defaults, applied on top of the stock 'default' style.
plt.style.use('default')
plt.rcParams.update({
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'font.weight': 'bold',        # bold text throughout
    'axes.labelweight': 'bold',
    'axes.titleweight': 'bold',
    'axes.linewidth': 2.5,        # axes frame width
    'font.size': 18,              # base font size
    'axes.titlesize': 16,         # title size
    'axes.labelsize': 20,         # axis label size
    'xtick.labelsize': 18,        # tick label sizes
    'ytick.labelsize': 18,
    'legend.fontsize': 13,
    'grid.linewidth': 1.2,        # grid line width
    'lines.linewidth': 3.5,       # data line width
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'font.family': 'Arial',       # Latin font for the figures
    'axes.unicode_minus': False,
})


def interaction_analysis(df, factor1_col, factor2_col, target_col, factor1_name=None, factor2_name=None):
    """Draw and save an academic-style break-rate heatmap and chi-square test the break counts.

    The figure is written to ``interaction_heatmap.png`` (300 dpi) and shown.

    Args:
        df: Frame holding both binned factors and the target column.
        factor1_col: Interval-bin column shown on the heatmap rows.
        factor2_col: Interval-bin column shown on the heatmap columns.
        target_col: Column averaged/summed per bin pair (expected to be a 0/1 break flag).
        factor1_name: Display name for factor 1 (defaults to factor1_col).
        factor2_name: Display name for factor 2 (defaults to factor2_col).

    Returns:
        ``(interaction_break_rate, p)``: the per-pair table (``Break_Rate``,
        ``Sample_Count``, ``factor1_range``, ``factor2_range``) and the
        chi-square p-value, or ``None`` when the test could not be run.
    """
    # observed=False keeps empty bin pairs of categorical factors in the table
    # (their rate is NaN), independent of the pandas default.
    interaction_break_rate = df.groupby([factor1_col, factor2_col], observed=False)[target_col].agg(['mean', 'count']).reset_index()
    interaction_break_rate.rename(columns={'mean': 'Break_Rate', 'count': 'Sample_Count'}, inplace=True)

    # Human-readable "left-right" labels for both factors.
    interaction_break_rate['factor1_range'] = interaction_break_rate[factor1_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}")
    interaction_break_rate['factor2_range'] = interaction_break_rate[factor2_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}")

    # Factor 1 on the rows, factor 2 on the columns.
    pivot_table = interaction_break_rate.pivot(index='factor1_range', columns='factor2_range', values='Break_Rate')

    fig, ax = plt.subplots(figsize=(10, 7), facecolor='white')

    # Heatmap with thin black cell borders.
    sns.heatmap(pivot_table, annot=True, fmt=".2%", cmap="YlGnBu",
                annot_kws={"size": 13, "weight": 'bold'},
                cbar_kws={'label': 'Break Rate', 'shrink': 0.8}, ax=ax,
                linewidths=0.5, linecolor='black')

    # Bold axis labels.
    plt.xlabel(factor2_name or factor2_col, fontweight='bold', fontsize=19, labelpad=10)
    plt.ylabel(factor1_name or factor1_col, fontweight='bold', fontsize=19, labelpad=10)

    # Tick label size and rotation.
    ax.set_xticklabels(ax.get_xticklabels(), fontsize=18, rotation=45, ha='right')
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=18, rotation=0)

    # Bold colorbar label.
    cbar = ax.collections[0].colorbar
    cbar.ax.set_ylabel('Break Rate', fontweight='bold', fontsize=15)

    # Thick black axes frame.
    # NOTE(review): seaborn's heatmap hides the axes spines, so this styling
    # probably stays invisible unless the spines are re-enabled — confirm.
    for spine in ax.spines.values():
        spine.set_linewidth(5)
        spine.set_color('black')

    # Tick colour and size per axis.
    ax.tick_params(axis='x', colors='black', width=2, length=6, labelsize=18)
    ax.tick_params(axis='y', colors='black', width=2, length=6, labelsize=18)

    # Bold tick labels.
    for tick in ax.get_xticklabels():
        tick.set_fontweight('bold')
    for tick in ax.get_yticklabels():
        tick.set_fontweight('bold')

    # Applied last, so these major-tick settings (width 1.5, label size 12)
    # take precedence over the per-axis values set above.
    ax.tick_params(axis='both', which='major',
                   width=1.5, length=6,
                   labelsize=12, colors='black', pad=5)

    # Dashed grid lines on both axes.
    ax.grid(True, which='both', linestyle='--', linewidth=0.8, color='gray', alpha=0.6)

    # White figure and axes background.
    fig.patch.set_facecolor('white')
    ax.set_facecolor('white')

    # Final margins.
    plt.subplots_adjust(left=0.12, right=0.95, top=0.88, bottom=0.15)

    # Save the figure before showing it.
    plt.savefig('interaction_heatmap.png', bbox_inches='tight', dpi=300)
    plt.show()

    # Chi-square test on the summed break flags per bin pair. Pairs without any
    # sample come back as NaN; they are turned into zeros first so that the
    # zero-frequency check below also covers them.
    contingency_table = pd.crosstab(
        df[factor1_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}"),
        df[factor2_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}"),
        values=df[target_col],
        aggfunc='sum'
    ).fillna(0)

    if (contingency_table == 0).any().any():
        print("Warning: Zero frequencies detected in contingency table, chi-square test may be inaccurate")
        # Add 0.5 to every cell so no cell stays at zero.
        contingency_table = contingency_table + 0.5

    try:
        _, p, _, _ = chi2_contingency(contingency_table)
        print(f"Chi-square test result: p-value = {p:.4f}")
    except ValueError as e:
        print(f"Unable to perform chi-square test: {str(e)}")
        p = None

    return interaction_break_rate, p

# Main program
if __name__ == "__main__":
    # Read the raw records; point this path at your own CSV export.
    df = pd.read_csv('D:/code/junma/600000/0424/2.csv')

    # Spindle number; adjust it to match the column names in the data.
    dw_number = 1

    # Step 1: preprocessing (preprocess_data is not defined in this cell and
    # must already exist in the session).
    print("Performing data preprocessing...")
    df_processed, full_bins, speed_bins = preprocess_data(df, dw_number=dw_number)

    # Step 3: both factors together.
    print("\nTwo-factor interaction analysis...")
    interaction_break_rate, p_value = interaction_analysis(
        df_processed, 'Fullness_Bin', 'Speed_Deviation_Bin', f'is_break_dw{dw_number}',
        factor1_name='Fullness Rate', factor2_name='Speed Deviation(%)')

    # Persist the result table next to the notebook.
    interaction_break_rate.to_csv('interaction_break_rate.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

# Academic-style matplotlib defaults, applied on top of the stock 'default' style.
plt.style.use('default')
plt.rcParams.update({
    'figure.dpi': 1200,
    'savefig.dpi': 1200,
    'font.weight': 'bold',        # bold text throughout
    'axes.labelweight': 'bold',
    'axes.titleweight': 'bold',
    'axes.linewidth': 2.5,        # axes frame width
    'font.size': 18,              # base font size
    'axes.titlesize': 16,         # title size
    'axes.labelsize': 20,         # axis label size
    'xtick.labelsize': 18,        # tick label sizes
    'ytick.labelsize': 18,
    'legend.fontsize': 13,
    'grid.linewidth': 1.2,        # grid line width
    'lines.linewidth': 3.5,       # data line width
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'font.family': 'Arial',       # Latin font for the figures
    'axes.unicode_minus': False,
})


def interaction_analysis(df, factor1_col, factor2_col, target_col, factor1_name=None, factor2_name=None):
    """Draw and save an academic-style break-rate heatmap and chi-square test the break counts.

    The figure is written to ``interaction_heatmap.png`` (1200 dpi) and shown.

    Args:
        df: Frame holding both binned factors and the target column.
        factor1_col: Interval-bin column shown on the heatmap rows.
        factor2_col: Interval-bin column shown on the heatmap columns.
        target_col: Column averaged/summed per bin pair (expected to be a 0/1 break flag).
        factor1_name: Display name for factor 1 (defaults to factor1_col).
        factor2_name: Display name for factor 2 (defaults to factor2_col).

    Returns:
        ``(interaction_break_rate, p)``: the per-pair table (``Break_Rate``,
        ``Sample_Count``, ``factor1_range``, ``factor2_range``) and the
        chi-square p-value, or ``None`` when the test could not be run.
    """
    # observed=False keeps empty bin pairs of categorical factors in the table
    # (their rate is NaN), independent of the pandas default.
    interaction_break_rate = df.groupby([factor1_col, factor2_col], observed=False)[target_col].agg(['mean', 'count']).reset_index()
    interaction_break_rate.rename(columns={'mean': 'Break_Rate', 'count': 'Sample_Count'}, inplace=True)

    # Human-readable "left-right" labels for both factors.
    interaction_break_rate['factor1_range'] = interaction_break_rate[factor1_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}")
    interaction_break_rate['factor2_range'] = interaction_break_rate[factor2_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}")

    # Factor 1 on the rows, factor 2 on the columns.
    pivot_table = interaction_break_rate.pivot(index='factor1_range', columns='factor2_range', values='Break_Rate')

    fig, ax = plt.subplots(figsize=(10, 7), facecolor='white')

    # Heatmap with thin black cell borders.
    sns.heatmap(pivot_table, annot=True, fmt=".2%", cmap="YlGnBu",
                annot_kws={"size": 13, "weight": 'bold'},
                cbar_kws={'label': 'Break Rate', 'shrink': 0.8}, ax=ax,
                linewidths=0.5, linecolor='black')

    # Bold axis labels.
    plt.xlabel(factor2_name or factor2_col, fontweight='bold', fontsize=19, labelpad=10)
    plt.ylabel(factor1_name or factor1_col, fontweight='bold', fontsize=19, labelpad=10)

    # Tick label size and rotation.
    ax.set_xticklabels(ax.get_xticklabels(), fontsize=18, rotation=45, ha='right')
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=18, rotation=0)

    # Bold colorbar label.
    cbar = ax.collections[0].colorbar
    cbar.ax.set_ylabel('Break Rate', fontweight='bold', fontsize=15)

    # Black axes frame, width 2.
    for spine in ax.spines.values():
        spine.set_linewidth(2)
        spine.set_color('black')

    # Make all four sides of the frame visible, top and right included.
    ax.spines['top'].set_visible(True)
    ax.spines['right'].set_visible(True)
    ax.spines['bottom'].set_visible(True)
    ax.spines['left'].set_visible(True)

    # Tick colour and size per axis.
    ax.tick_params(axis='x', colors='black', width=2, length=6, labelsize=18)
    ax.tick_params(axis='y', colors='black', width=2, length=6, labelsize=18)

    # Bold tick labels.
    for tick in ax.get_xticklabels():
        tick.set_fontweight('bold')
    for tick in ax.get_yticklabels():
        tick.set_fontweight('bold')

    # Applied last, so these major-tick settings (width 1.5, label size 12)
    # take precedence over the per-axis values set above.
    ax.tick_params(axis='both', which='major',
                   width=1.5, length=6,
                   labelsize=12, colors='black', pad=5)

    # Dashed grid lines on both axes.
    ax.grid(True, which='both', linestyle='--', linewidth=0.8, color='gray', alpha=0.6)

    # White figure and axes background.
    fig.patch.set_facecolor('white')
    ax.set_facecolor('white')

    # Final margins.
    plt.subplots_adjust(left=0.12, right=0.95, top=0.88, bottom=0.15)

    # Save the figure before showing it.
    plt.savefig('interaction_heatmap.png', bbox_inches='tight', dpi=1200)
    plt.show()

    # Chi-square test on the summed break flags per bin pair. Pairs without any
    # sample come back as NaN; they are turned into zeros first so that the
    # zero-frequency check below also covers them.
    contingency_table = pd.crosstab(
        df[factor1_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}"),
        df[factor2_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}"),
        values=df[target_col],
        aggfunc='sum'
    ).fillna(0)

    if (contingency_table == 0).any().any():
        print("Warning: Zero frequencies detected in contingency table, chi-square test may be inaccurate")
        # Add 0.5 to every cell so no cell stays at zero.
        contingency_table = contingency_table + 0.5

    try:
        _, p, _, _ = chi2_contingency(contingency_table)
        print(f"Chi-square test result: p-value = {p:.4f}")
    except ValueError as e:
        print(f"Unable to perform chi-square test: {str(e)}")
        p = None

    return interaction_break_rate, p

# Main program
if __name__ == "__main__":
    # Read the raw records; point this path at your own CSV export.
    df = pd.read_csv('D:/code/junma/600000/0424/2.csv')

    # Spindle number; adjust it to match the column names in the data.
    dw_number = 1

    # Step 1: preprocessing (preprocess_data is not defined in this cell and
    # must already exist in the session).
    print("Performing data preprocessing...")
    df_processed, full_bins, speed_bins = preprocess_data(df, dw_number=dw_number)

    # Step 3: both factors together.
    print("\nTwo-factor interaction analysis...")
    interaction_break_rate, p_value = interaction_analysis(
        df_processed, 'Fullness_Bin', 'Speed_Deviation_Bin', f'is_break_dw{dw_number}',
        factor1_name='Fullness Rate', factor2_name='Speed Deviation(%)')

    # Persist the result table next to the notebook.
    interaction_break_rate.to_csv('interaction_break_rate.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import ticker

# Global matplotlib style for this cell, applied on top of the stock 'default' style.
plt.style.use('default')

# Resolution, backgrounds and font.
plt.rcParams.update({
    'figure.dpi': 1200,
    'savefig.dpi': 1200,
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'savefig.facecolor': 'white',
    'font.family': 'Arial',
    'font.weight': 'bold',
    'axes.unicode_minus': False,
})

# Axes frame and tick marks.
plt.rcParams.update({
    'axes.edgecolor': 'black',
    'axes.linewidth': 4,
    'xtick.color': 'black',
    'ytick.color': 'black',
    'xtick.major.width': 4,
    'ytick.major.width': 4,
    'xtick.major.size': 16,
    'ytick.major.size': 16,
})

# Text weights and sizes.
plt.rcParams.update({
    'axes.labelweight': 'bold',
    'axes.titleweight': 'bold',
    'axes.labelsize': 30,
    'axes.titlesize': 28,
    'xtick.labelsize': 22,
    'ytick.labelsize': 22,
    'legend.fontsize': 20,
    'legend.title_fontsize': 22,
})

# 1. 数据预处理函数
def preprocess_data(df, dw_number=1, *, reference_col='R_dw22', n_bins=10):
    """Add the break flag, speed deviation and fullness bins for one spindle.

    The frame is modified in place (new columns are added) and also returned.

    Args:
        df: Raw records. Must contain ``D_dw{dw_number}``, ``V_dw{dw_number}``,
            ``P_dw{dw_number}`` and ``reference_col``.
        dw_number: Spindle number used to pick the per-spindle columns.
        reference_col: Column whose median over all rows is the reference speed.
        n_bins: Number of equal-width fullness bins.

    Returns:
        ``(df, full_bins)`` where ``full_bins`` holds the bin edges returned
        by ``pd.cut``.
    """
    # A status value of 3 is flagged as a break; every other value (NaN included) is 0.
    target_col = f'is_break_dw{dw_number}'
    df[target_col] = (df[f'D_dw{dw_number}'] == 3).astype(int)

    # Percentage deviation of the spindle speed from the reference median.
    speed_col = f'V_dw{dw_number}'
    reference_speed = df[reference_col].median()
    df['Speed_Deviation(%)'] = ((df[speed_col] - reference_speed) / reference_speed) * 100

    # Equal-width binning; retbins=True also hands back the edges.
    df['Fullness_Bin'], full_bins = pd.cut(df[f'P_dw{dw_number}'], bins=n_bins, retbins=True)

    return df, full_bins

# 2. 单因素分析函数（使用更明亮的配色）
def single_factor_analysis(df, factor_col, target_col, bins=None, factor_name=None):
    """Plot, save and return the break rate per bin of a single factor.

    The figure is written to ``yarn_break_rate_analysis_professional.png``
    (1200 dpi) and shown. The seaborn style/context set here are global and
    stay active after the call.

    Args:
        df: Frame holding the binned factor and the target column.
        factor_col: Column of interval bins to group by.
        target_col: Column averaged per bin (expected to be a 0/1 break flag).
        bins: Accepted for interface compatibility; not used.
        factor_name: Display name for the x label (defaults to factor_col).

    Returns:
        DataFrame with one row per bin and the columns ``Break_Rate``,
        ``Sample_Count`` and ``Bin_Range``.
    """
    # observed=False keeps empty bins of a categorical factor in the table
    # (their rate is NaN), independent of the pandas default.
    factor_break_rate = df.groupby(factor_col, observed=False)[target_col].agg(['mean', 'count']).reset_index()
    factor_break_rate.rename(columns={'mean': 'Break_Rate', 'count': 'Sample_Count'}, inplace=True)

    # Human-readable "left-right" label per interval ("NA" for a missing edge).
    factor_break_rate['Bin_Range'] = factor_break_rate[factor_col].apply(
        lambda x: f"{x.left:.2f}-{x.right:.2f}" if pd.notna(x.left) else "NA"
    )

    plt.figure(figsize=(12, 6))

    sns.set_style("whitegrid", {
        'axes.edgecolor': '0.2',
        'grid.color': '0.85',
        'grid.linestyle': ':',
        'axes.linewidth': 2.5
    })
    sns.set_context("paper", font_scale=1.0, rc={
        "font.size": 16,
        "axes.titlesize": 18,
        "axes.labelsize": 24,
        "xtick.labelsize": 18,
        "ytick.labelsize": 18,
        "legend.fontsize": 18,
        "font.family": "Arial",
        "mathtext.default": "regular",
        "mathtext.fontset": "custom",
        "mathtext.it": "Arial:italic",
        "mathtext.rm": "Arial",
        "mathtext.sf": "Arial",
        "mathtext.tt": "Arial",
    })

    # One viridis colour per bin.
    custom_palette = sns.color_palette("viridis", n_colors=len(factor_break_rate))

    ax = sns.barplot(data=factor_break_rate, x='Bin_Range', y='Break_Rate',
                     palette=custom_palette, saturation=0.9, width=0.85,
                     edgecolor='black', linewidth=2.5)

    # 15% headroom above the tallest bar; fixed 0.15 when there is no positive rate.
    max_break_rate = factor_break_rate['Break_Rate'].max()
    y_max = max_break_rate * 1.15 if max_break_rate > 0 else 0.15
    plt.ylim(0, y_max)

    # Mark the bin with the highest break rate; skipped when no bin has data.
    if factor_break_rate['Break_Rate'].notna().any():
        max_idx = factor_break_rate['Break_Rate'].idxmax()
        max_row = factor_break_rate.loc[max_idx]

        plt.axvline(x=max_idx, color='#E63946', linestyle='--', alpha=1, linewidth=3, ymax=1)

        plt.text(max_idx, y_max * 0.8,
                 f"Highest: {max_row['Break_Rate']:.2%}\n({max_row['Bin_Range']})",
                 ha='center', color='#E63946', fontsize=15, fontweight='bold',
                 bbox=dict(facecolor='white', alpha=1, edgecolor='#E63946',
                          boxstyle='round,pad=1.0', linewidth=2.5))

    plt.xlabel(factor_name or factor_col,
               fontweight='bold',
               fontsize=22,
               labelpad=12)
    plt.ylabel('Break Rate',
               fontweight='bold',
               fontsize=22,
               labelpad=12)

    # Rates are fractions of 1, shown as percentages on the y axis.
    ax.yaxis.set_major_formatter(ticker.PercentFormatter(1.0))

    ax.tick_params(axis='both', which='major',
                   width=2, length=8,
                   labelsize=20, pad=8)
    ax.tick_params(axis='both', which='minor', width=2, length=5)

    # Set after tick_params, so these label sizes (18) are the ones in effect.
    plt.xticks(rotation=45, fontsize=18, fontweight='bold',
               ha='right', rotation_mode='anchor')
    plt.yticks(fontsize=18, fontweight='bold')

    plt.margins(x=0.12)

    for spine in ax.spines.values():
        spine.set_linewidth(2.5)
        spine.set_color('black')

    # Horizontal grid lines only.
    ax.yaxis.grid(True, linestyle=':', alpha=0.5, linewidth=1.2)
    ax.xaxis.grid(False)

    ax.set_facecolor('#f8f9fa')

    # Faint grey outline around the plotting area, drawn behind everything else.
    plt.gca().add_patch(plt.Rectangle((0, 0), 1, 1, transform=ax.transAxes,
                                     fill=False, edgecolor='gray', linewidth=2.5,
                                     alpha=0.3, zorder=-1))

    # subplots_adjust runs after tight_layout, so its margins are the final ones.
    plt.tight_layout(pad=2.5)
    plt.subplots_adjust(left=0.12, right=0.95, top=0.9, bottom=0.15)

    plt.savefig('yarn_break_rate_analysis_professional.png',
                dpi=1200,
                bbox_inches='tight',
                pad_inches=0.7,
                transparent=False,
                facecolor='white')
    plt.show()

    return factor_break_rate

# 主程序
if __name__ == "__main__":
    # Read the raw records from the hard-coded CSV path.
    df = pd.read_csv('D:/code/junma/600000/0424/2.csv')

    # Spindle number used for the per-spindle column names.
    dw_number = 1

    # Step 1: preprocessing (this cell's version returns the fullness bin edges only).
    print("Performing data preprocessing...")
    df_processed, full_bins = preprocess_data(df, dw_number=dw_number)
    print("\nFullness rate bin boundaries:", full_bins)

    # Step 2: break rate per fullness bin.
    print("\nSingle factor analysis...")
    fullness_break_rate = single_factor_analysis(
        df_processed, 'Fullness_Bin', f'is_break_dw{dw_number}', factor_name='Fullness Rate')

    # Persist the result table next to the notebook.
    fullness_break_rate.to_csv('fullness_break_rate.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.linear_model import LogisticRegression

# Font setup: SimHei so Chinese labels render, and ASCII minus signs on the axes.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# 1. 数据预处理
def preprocess_data(df, dw_number=1, *, reference_col='R_dw22', n_bins=10):
    """Add the break flag, speed deviation, bins and equipment columns for one spindle.

    The frame is modified in place (new columns are added) and also returned.

    Args:
        df: Raw records. Must contain ``D_dw{dw_number}``, ``V_dw{dw_number}``,
            ``P_dw{dw_number}``, ``reference_col``, ``name`` and ``subsystem``.
        dw_number: Spindle number used to pick the per-spindle columns.
        reference_col: Column whose median over all rows is the reference speed.
        n_bins: Number of equal-width bins used for both factors.

    Returns:
        ``(df, full_bins, speed_bins)`` where the last two are the bin edges
        returned by ``pd.cut`` for fullness and speed deviation.
    """
    # A status value of 3 is flagged as a break; every other value (NaN included) is 0.
    target_col = f'is_break_dw{dw_number}'
    df[target_col] = (df[f'D_dw{dw_number}'] == 3).astype(int)

    # Percentage deviation of the spindle speed from the median of reference_col.
    speed_col = f'V_dw{dw_number}'
    reference_speed = df[reference_col].median()
    df['速度偏移率(%)'] = ((df[speed_col] - reference_speed) / reference_speed) * 100

    # Equal-width binning; retbins=True also hands back the edges.
    df['满卷率分箱'], full_bins = pd.cut(df[f'P_dw{dw_number}'], bins=n_bins, retbins=True)
    df['速度偏移率分箱'], speed_bins = pd.cut(df['速度偏移率(%)'], bins=n_bins, retbins=True)

    # Text before the first '-' of `name` (e.g. NX78) and the second '-'-separated
    # part of `subsystem` (e.g. R103). The vectorized accessor yields NaN for
    # missing values or values without a '-' instead of raising.
    df['设备名称'] = df['name'].str.split('-').str[0]
    df['设备面'] = df['subsystem'].str.split('-').str[1]

    return df, full_bins, speed_bins

# 2. 单因素分析
def single_factor_analysis(df, factor_col, target_col, bins=None, factor_name=None):
    """Plot and return the break rate per bin of a single factor (Chinese labels).

    Args:
        df: Frame holding the binned factor and the target column.
        factor_col: Column of interval bins to group by.
        target_col: Column averaged per bin (expected to be a 0/1 break flag).
        bins: Accepted for interface compatibility; not used.
        factor_name: Display name for the title and x label (defaults to factor_col).

    Returns:
        DataFrame with one row per bin and the columns ``断纱率`` (break rate),
        ``样本数`` (sample count) and ``区间范围`` (bin range label).
    """
    # observed=False keeps empty bins of a categorical factor in the table
    # (their rate is NaN), independent of the pandas default.
    factor_break_rate = df.groupby(factor_col, observed=False)[target_col].agg(['mean', 'count']).reset_index()
    factor_break_rate.rename(columns={'mean': '断纱率', 'count': '样本数'}, inplace=True)

    # Human-readable "left-right" label for every interval.
    factor_break_rate['区间范围'] = factor_break_rate[factor_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}")

    # Bar chart of the break rate per bin.
    plt.figure(figsize=(12, 6))
    ax = sns.barplot(data=factor_break_rate, x='区间范围', y='断纱率', palette='viridis')

    # Sample-count annotation above every bar. Empty bins have a NaN rate, so
    # they are annotated at the baseline instead of at a non-finite position.
    for i, row in factor_break_rate.iterrows():
        rate = row['断纱率']
        bar_top = 0.0 if pd.isna(rate) else rate
        ax.text(i, bar_top + 0.01, f"n={row['样本数']}", ha='center', fontsize=9)

    # Mark the bin with the highest break rate; skipped when no bin has data.
    if factor_break_rate['断纱率'].notna().any():
        max_idx = factor_break_rate['断纱率'].idxmax()
        max_row = factor_break_rate.loc[max_idx]
        plt.axvline(x=max_idx, color='red', linestyle='--', alpha=0.5)
        plt.text(max_idx, max_row['断纱率']+0.05,
                 f"最高断纱率: {max_row['断纱率']:.2%}\n区间: {max_row['区间范围']}",
                 ha='center', color='red')

    plt.title(f'{factor_name or factor_col} 对断纱率的影响')
    plt.xlabel(factor_name or factor_col)
    plt.ylabel('断纱率')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    return factor_break_rate

# 3. 双因素交互分析
def interaction_analysis(df, factor1_col, factor2_col, target_col, factor1_name=None, factor2_name=None):
    """Plot a break-rate heatmap for two binned factors and chi-square test the break counts.

    Args:
        df: Frame holding both binned factors and the target column.
        factor1_col: Interval-bin column shown on the heatmap rows.
        factor2_col: Interval-bin column shown on the heatmap columns.
        target_col: Column averaged/summed per bin pair (expected to be a 0/1 break flag).
        factor1_name: Display name for factor 1 (defaults to factor1_col).
        factor2_name: Display name for factor 2 (defaults to factor2_col).

    Returns:
        ``(interaction_break_rate, p)``: the per-pair table (``断纱率`` break
        rate, ``样本数`` sample count, ``factor1_range``, ``factor2_range``)
        and the chi-square p-value, or ``None`` when the test could not be run.
    """
    # observed=False keeps empty bin pairs of categorical factors in the table
    # (their rate is NaN), independent of the pandas default.
    interaction_break_rate = df.groupby([factor1_col, factor2_col], observed=False)[target_col].agg(['mean', 'count']).reset_index()
    interaction_break_rate.rename(columns={'mean': '断纱率', 'count': '样本数'}, inplace=True)

    # Human-readable "left-right" labels for both factors.
    interaction_break_rate['factor1_range'] = interaction_break_rate[factor1_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}")
    interaction_break_rate['factor2_range'] = interaction_break_rate[factor2_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}")

    # Heatmap: factor 1 on the rows, factor 2 on the columns.
    pivot_table = interaction_break_rate.pivot(index='factor1_range', columns='factor2_range', values='断纱率')
    plt.figure(figsize=(12, 8))
    ax = sns.heatmap(pivot_table, annot=True, fmt=".2%", cmap="YlGnBu",
                     annot_kws={"size": 9}, cbar_kws={'label': '断纱率'})

    # The title names the bin pair with the highest break rate.
    max_idx = interaction_break_rate['断纱率'].idxmax()
    max_row = interaction_break_rate.loc[max_idx]
    plt.title(f'{factor1_name or factor1_col} × {factor2_name or factor2_col} 对断纱率的影响\n'
              f"断纱率最高的组合: {max_row['factor1_range']} × {max_row['factor2_range']} = {max_row['断纱率']:.2%}",
              pad=20)

    plt.xlabel(factor2_name or factor2_col)
    plt.ylabel(factor1_name or factor1_col)
    plt.tight_layout()
    plt.show()

    # Chi-square test on the summed break flags per bin pair (counts, not mean
    # rates, since the test expects frequencies). Pairs without any sample come
    # back as NaN; they are turned into zeros first so that the zero-frequency
    # check below also covers them.
    contingency_table = pd.crosstab(
        df[factor1_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}"),
        df[factor2_col].apply(lambda x: f"{x.left:.2f}-{x.right:.2f}"),
        values=df[target_col],
        aggfunc='sum'
    ).fillna(0)

    if (contingency_table == 0).any().any():
        print("警告: 交叉表中存在零频数，卡方检验可能不准确")
        # Add 0.5 to every cell so no cell stays at zero.
        contingency_table = contingency_table + 0.5

    try:
        _, p, _, _ = chi2_contingency(contingency_table)
        print(f"卡方检验结果: p值 = {p:.4f}")
    except ValueError as e:
        print(f"无法执行卡方检验: {str(e)}")
        p = None

    return interaction_break_rate, p

# 主程序
if __name__ == "__main__":
    # Read the raw records; point this path at your own CSV export.
    df = pd.read_csv('D:/code/junma/600000/0424/2.csv')

    # Spindle number; adjust it to match the column names in the data.
    dw_number = 1

    # Step 1: preprocessing.
    print("正在进行数据预处理...")
    df_processed, full_bins, speed_bins = preprocess_data(df, dw_number=dw_number)

    # Show the bin edges returned by the preprocessing step.
    print("\n满卷率分箱边界:", full_bins)
    print("速度偏移率分箱边界:", speed_bins)

    # Step 2: one break-rate table and chart per factor.
    print("\n单因素分析...")
    fullness_break_rate = single_factor_analysis(
        df_processed, '满卷率分箱', f'is_break_dw{dw_number}', factor_name='满卷率')
    speed_deviation_break_rate = single_factor_analysis(
        df_processed, '速度偏移率分箱', f'is_break_dw{dw_number}', factor_name='速度偏移率(%)')

    # Step 3: both factors together.
    print("\n双因素交互分析...")
    interaction_break_rate, p_value = interaction_analysis(
        df_processed, '满卷率分箱', '速度偏移率分箱', f'is_break_dw{dw_number}',
        factor1_name='满卷率', factor2_name='速度偏移率(%)')

    # Persist the three result tables next to the notebook.
    fullness_break_rate.to_csv('fullness_break_rate.csv', index=False)
    speed_deviation_break_rate.to_csv('speed_deviation_break_rate.csv', index=False)
    interaction_break_rate.to_csv('interaction_break_rate.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from IPython.display import display

# Font setup: SimHei so Chinese labels render, and ASCII minus signs on the axes.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

def analyze_yarn_breakage(df, target_spindle='NX16-L102'):
    """
    Analyse how one spindle's yarn-break rate relates to its fullness value
    and its speed deviation, one factor at a time and jointly.

    The function draws two bar charts (break rate per fullness bin and per
    speed-deviation bin), a heatmap of the joint break rate, and prints the
    p-value of a chi-square test.

    Parameters:
        df: raw data. Must contain the columns 'subsystem', 'V_dw1'..'V_dw24'
            and the 'V_dw<n>', 'P_dw<n>', 'D_dw<n>' columns of the target
            spindle. The frame is read only; no columns are added to it.
        target_spindle: value matched against the 'subsystem' column
            (e.g. 'NX16-L102'); the spindle number <n> is parsed from it.

    Returns:
        DataFrame with one row per (fullness_bin, speed_deviation_bin) pair
        and the columns total_samples, break_count and break_rate.

    Raises:
        ValueError: if no row of ``df`` has ``subsystem == target_spindle``.
    """
    # The joint aggregation counts a status value of 3 as a break; the
    # single-factor rates and the chi-square test below use the same rule so
    # that every "break rate" in this function means the same thing.
    break_code = 3

    # 1. Preprocessing: resolve the target spindle's columns.
    # The number is whatever follows the first three characters of the last
    # '-'-separated token, e.g. 'NX16-L102' -> 'L102' -> 2.
    # NOTE(review): an earlier comment claimed 'L102' -> 102, which is not what
    # this slice yields; confirm the naming scheme (a token such as 'L112'
    # would also resolve to 2).
    target_spindle_num = int(target_spindle.split('-')[-1][3:])
    speed_col = f'V_dw{target_spindle_num}'
    fullness_col = f'P_dw{target_spindle_num}'
    break_col = f'D_dw{target_spindle_num}'

    # Reference speed = row-wise median over spindles 1..24. Computed as
    # standalone Series so the caller's DataFrame is not mutated.
    all_speed_cols = [f'V_dw{i}' for i in range(1, 25)]
    reference_speed = df[all_speed_cols].median(axis=1)
    speed_deviation_pct = ((df[speed_col] - reference_speed) / reference_speed) * 100

    # Keep only the rows that belong to the target spindle.
    is_target = df['subsystem'] == target_spindle
    if not is_target.any():
        raise ValueError(f"No rows found for spindle {target_spindle!r}")
    analysis_df = df.loc[is_target, [break_col, fullness_col]].copy()
    analysis_df.columns = ['break_status', 'fullness']
    # Positional assignment: both sides come from the same row mask.
    analysis_df['speed_deviation'] = speed_deviation_pct[is_target].to_numpy()
    analysis_df['is_break'] = analysis_df['break_status'] == break_code

    # 2. Single-factor analysis: fullness vs break rate.
    # include_lowest=True keeps values that sit exactly on the first edge
    # (fullness == 0), which pd.cut would otherwise turn into NaN.
    analysis_df['fullness_bin'] = pd.cut(
        analysis_df['fullness'],
        bins=np.linspace(0, 100, 11),
        labels=[f"{i}%-{i+10}%" for i in range(0, 100, 10)],
        include_lowest=True
    )
    # observed=False keeps empty bins in the result regardless of the pandas
    # version's default for categorical groupers.
    fullness_break_rate = analysis_df.groupby('fullness_bin', observed=False)['is_break'].mean()

    # 3. Single-factor analysis: speed deviation vs break rate.
    analysis_df['speed_deviation_bin'] = pd.cut(
        analysis_df['speed_deviation'],
        bins=np.linspace(-20, 20, 9),
        labels=[f"{i}% to {i+5}%" for i in range(-20, 20, 5)],
        include_lowest=True
    )
    speed_break_rate = analysis_df.groupby('speed_deviation_bin', observed=False)['is_break'].mean()

    # 4. Two-factor analysis: per-cell sample count, break count, break rate.
    joint_analysis = analysis_df.groupby(
        ['fullness_bin', 'speed_deviation_bin'], observed=False
    ).agg(
        total_samples=('break_status', 'count'),
        break_count=('break_status', lambda x: (x == break_code).sum()),
        break_rate=('break_status', lambda x: (x == break_code).mean())
    ).reset_index()

    # Drop cells with too few samples to avoid noisy rates in the heatmap.
    valid_groups = joint_analysis[joint_analysis['total_samples'] >= 5]
    heatmap_data = valid_groups.pivot(
        index='fullness_bin',
        columns='speed_deviation_bin',
        values='break_rate'
    )

    # 5. Chi-square test.
    # chi2_contingency expects observed frequencies, so the table holds the
    # number of break / non-break samples in each observed bin combination
    # (not mean values). It tests whether the break rate differs between bin
    # combinations. Only observed combinations and outcomes appear in the
    # table, so no all-zero row or column can reach the test.
    binned = analysis_df.dropna(subset=['fullness_bin', 'speed_deviation_bin'])
    cell_id = binned['fullness_bin'].astype(str) + ' | ' + binned['speed_deviation_bin'].astype(str)
    contingency_table = pd.crosstab(cell_id, binned['is_break'])
    if min(contingency_table.shape) < 2:
        # Fewer than two bin combinations or only one outcome: test undefined.
        p_value = np.nan
        print("卡方检验跳过: 分箱组合或断纱类别不足两类")
    else:
        _, p_value, _, _ = chi2_contingency(contingency_table)
        print(f"卡方检验 p-value = {p_value:.4f} (交互作用{'显著' if p_value < 0.05 else '不显著'})")

    # 6. Visualisation
    plt.figure(figsize=(16, 6))

    # Single factor: fullness vs break rate
    plt.subplot(1, 2, 1)
    fullness_break_rate.plot(kind='bar', color='skyblue')
    plt.title("满卷率 vs 断纱率", fontsize=12)
    plt.xlabel("满卷率区间")
    plt.ylabel("断纱率")
    plt.xticks(rotation=45)

    # Single factor: speed deviation vs break rate
    plt.subplot(1, 2, 2)
    speed_break_rate.plot(kind='bar', color='salmon')
    plt.title("速度偏移率 vs 断纱率", fontsize=12)
    plt.xlabel("速度偏移率区间 (%)")
    plt.ylabel("断纱率")
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

    # Two factors: heatmap (skipped when no cell passed the sample-size filter,
    # because an empty matrix cannot be drawn).
    if heatmap_data.empty:
        print("有效样本不足, 跳过热力图")
    else:
        plt.figure(figsize=(12, 8))
        sns.heatmap(
            heatmap_data * 100,
            annot=True,
            fmt=".1f",
            cmap="YlOrRd",
            cbar_kws={'label': '断纱率 (%)'},
            annot_kws={"size": 10}
        )
        plt.title(
            f"锭子 {target_spindle} 断纱率分析\n满卷率 × 速度偏移率交互作用",
            fontsize=14
        )
        plt.xlabel("速度偏移率区间 (%)", fontsize=12)
        plt.ylabel("满卷率区间", fontsize=12)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    return joint_analysis

# Main program: validate the input columns, run the breakage analysis for one
# spindle and save the joint statistics.
if __name__ == "__main__":
    df = pd.read_csv('D:/code/junma/600000/0424/second_final_processed.csv')

    # Check up front that the columns analyze_yarn_breakage reads exist:
    # 'subsystem', the break-status columns (D_dw*), the speed columns
    # (V_dw1..24, used for the reference-speed median) and the fullness
    # columns (P_dw*). Without the V/P checks a missing column only surfaces
    # later as a bare KeyError inside the analysis.
    required_cols = ['subsystem'] + [
        f'{prefix}_dw{i}' for prefix in ('D', 'V', 'P') for i in range(1, 25)
    ]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"缺少必要列: {missing_cols}")

    print("开始分析断纱率...")
    breakage_stats = analyze_yarn_breakage(df, target_spindle='NX16-L102')

    # Save the results
    breakage_stats.to_csv('breakage_analysis_results.csv', index=False)
    print("\n分析完成！结果已保存至 breakage_analysis_results.csv")

In [None]:
# Use `best_global_model` as the final prediction model.
final_model = best_global_model

# Predict on the test features; the predictions and the test targets are also
# exposed under the final_* names used by the following cells.
final_y_pred = y_final_pred = final_model.predict(X_test)
final_y_test = y_test

In [None]:
# Put the predicted and true values side by side, then add the residual
# (true value minus predicted value) as a third column.
results = pd.DataFrame({'Predicted Value': final_y_pred, 'True Value': final_y_test})
results['Difference'] = results['True Value'].sub(results['Predicted Value'])

# Preview the first five and the last five rows of the table.
print("First 5 rows:", results.head(), sep="\n")
print("\nLast 5 rows:", results.tail(), sep="\n")

In [None]:
# Final prediction model
final_model = best_global_model

# Make predictions on the test set using the final model
y_final_pred = final_model.predict(X_test)
final_y_pred = y_final_pred
final_y_test = y_test

# Create a DataFrame with the predicted values and true values
results = pd.DataFrame({'Predicted Value': final_y_pred, 'True Value': final_y_test})

# Keep only the rows predicted as 1 (yarn break). `.copy()` makes this an
# independent frame, so adding a column below does not assign onto a slice of
# `results` (which triggers pandas' SettingWithCopyWarning).
# NOTE(review): the exact `== 1` comparison assumes predict() returns class
# labels; with a model that outputs continuous values few or no rows would
# match — confirm what `best_global_model` is.
results_break = results[results['Predicted Value'] == 1].copy()

# Difference between the true and the predicted value for each kept row
results_break['Difference'] = results_break['True Value'] - results_break['Predicted Value']

# Display the first 5 rows of the filtered results
print("First 5 rows (Predicted Break = 1):")
print(results_break.head())

# Display the last 5 rows of the filtered results
print("\nLast 5 rows (Predicted Break = 1):")
print(results_break.tail())

# Save the filtered results to a CSV file
results_break.to_csv('predicted_break_results.csv', index=False)

In [None]:
# Load the optimised model, predict on the test set and produce a full report:
# overall accuracy, per-predicted-class breakdown, error analysis, probability
# statistics, CSV exports, a summary table and (optionally) diagnostic plots.
try:
    # Imported here so the cell does not depend on an earlier cell having
    # brought joblib into the notebook namespace.
    import joblib

    print("加载优化后的模型进行最终预测...")

    # Load the optimised model bundle (a dict with 'model' and 'best_params').
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files you produced yourself.
    optimized_model_info = joblib.load('rf_balanced_optimized_latest.pkl')
    final_model = optimized_model_info['model']
    best_params = optimized_model_info['best_params']

    print("优化模型加载成功！")
    print(f"使用的最佳参数: {best_params}")

    # Predict labels and the probability of class 1 (second predict_proba column)
    y_final_pred = final_model.predict(X_test)
    y_final_pred_proba = final_model.predict_proba(X_test)[:, 1]

    # Make sure both label vectors share the same integer dtype
    final_y_pred = y_final_pred.astype(int)
    final_y_test = y_test.astype(int)

    # Full prediction table: one row per test sample
    results_full = pd.DataFrame({
        'Predicted_Value': final_y_pred,
        'True_Value': final_y_test,
        'Prediction_Probability': y_final_pred_proba,
        'Prediction_Correct': final_y_pred == final_y_test
    })

    # Human-readable prediction status
    results_full['Prediction_Status'] = results_full['Prediction_Correct'].map({
        True: '正确预测',
        False: '错误预测'
    })

    # Human-readable class labels (0 = normal, 1 = yarn break)
    results_full['True_Label'] = results_full['True_Value'].map({0: '正常', 1: '断纱'})
    results_full['Predicted_Label'] = results_full['Predicted_Value'].map({0: '正常', 1: '断纱'})

    print("\n" + "="*80)
    print("优化模型预测结果总览")
    print("="*80)

    # Overall statistics
    total_samples = len(results_full)
    correct_predictions = results_full['Prediction_Correct'].sum()
    overall_accuracy = correct_predictions / total_samples

    print(f"总样本数: {total_samples}")
    print(f"正确预测数: {correct_predictions}")
    print(f"总体准确率: {overall_accuracy:.4f} ({overall_accuracy:.2%})")

    # 1. Samples predicted as break (predicted value = 1)
    print("\n" + "="*80)
    print("断纱预测结果分析 (预测值 = 1)")
    print("="*80)

    results_break = results_full[results_full['Predicted_Value'] == 1].copy()
    results_break['Difference'] = results_break['True_Value'] - results_break['Predicted_Value']

    break_total = len(results_break)
    break_correct = (results_break['True_Value'] == 1).sum()    # true positives
    break_incorrect = (results_break['True_Value'] == 0).sum()  # false positives
    # Share of break predictions that are right (i.e. precision of class 1)
    break_accuracy = break_correct / break_total if break_total > 0 else 0

    print(f"预测为断纱的样本总数: {break_total}")
    print(f"其中实际为断纱(正确预测): {break_correct}")
    print(f"其中实际为正常(错误预测 - 误报): {break_incorrect}")
    print(f"断纱预测准确率: {break_accuracy:.4f} ({break_accuracy:.2%})")

    # Show the first and last five break predictions
    print("\n前5个断纱预测样本:")
    break_display_cols = ['Predicted_Label', 'True_Label', 'Prediction_Probability', 'Prediction_Status']
    display(results_break[break_display_cols].head())

    print("\n后5个断纱预测样本:")
    display(results_break[break_display_cols].tail())

    # 2. Samples predicted as normal (predicted value = 0)
    print("\n" + "="*80)
    print("正常预测结果分析 (预测值 = 0)")
    print("="*80)

    results_normal = results_full[results_full['Predicted_Value'] == 0].copy()
    results_normal['Difference'] = results_normal['True_Value'] - results_normal['Predicted_Value']

    normal_total = len(results_normal)
    normal_correct = (results_normal['True_Value'] == 0).sum()    # true negatives
    normal_incorrect = (results_normal['True_Value'] == 1).sum()  # false negatives
    normal_accuracy = normal_correct / normal_total if normal_total > 0 else 0

    print(f"预测为正常的样本总数: {normal_total}")
    print(f"其中实际为正常(正确预测): {normal_correct}")
    print(f"其中实际为断纱(错误预测 - 漏报): {normal_incorrect}")
    print(f"正常预测准确率: {normal_accuracy:.4f} ({normal_accuracy:.2%})")

    # Show the first and last five normal predictions
    print("\n前5个正常预测样本:")
    normal_display_cols = ['Predicted_Label', 'True_Label', 'Prediction_Probability', 'Prediction_Status']
    display(results_normal[normal_display_cols].head())

    print("\n后5个正常预测样本:")
    display(results_normal[normal_display_cols].tail())

    # 3. Detailed error analysis
    print("\n" + "="*80)
    print("详细错误分析")
    print("="*80)

    # False alarms: predicted 1 but actually 0
    false_positives = results_break[results_break['True_Value'] == 0]
    print(f"误报数量 (预测断纱但实际正常): {len(false_positives)}")
    if len(false_positives) > 0:
        print("误报样本详情:")
        display(false_positives[['Prediction_Probability', 'True_Label', 'Predicted_Label']].head(10))

    # Misses: predicted 0 but actually 1
    false_negatives = results_normal[results_normal['True_Value'] == 1]
    print(f"\n漏报数量 (预测正常但实际断纱): {len(false_negatives)}")
    if len(false_negatives) > 0:
        print("漏报样本详情:")
        display(false_negatives[['Prediction_Probability', 'True_Label', 'Predicted_Label']].head(10))

    # 4. Distribution of the predicted probability
    print("\n" + "="*80)
    print("预测概率分布分析")
    print("="*80)

    # Class-1 probability of correctly vs incorrectly predicted samples
    # ('Prediction_Correct' is already boolean, so it is used as the mask directly)
    correct_probabilities = results_full.loc[results_full['Prediction_Correct'], 'Prediction_Probability']
    incorrect_probabilities = results_full.loc[~results_full['Prediction_Correct'], 'Prediction_Probability']

    print(f"正确预测的平均概率: {correct_probabilities.mean():.4f}")
    print(f"错误预测的平均概率: {incorrect_probabilities.mean():.4f}")
    print(f"正确预测的概率标准差: {correct_probabilities.std():.4f}")
    print(f"错误预测的概率标准差: {incorrect_probabilities.std():.4f}")

    # Probability statistics per true class
    for true_class in [0, 1]:
        class_data = results_full[results_full['True_Value'] == true_class]
        class_name = '正常' if true_class == 0 else '断纱'
        print(f"\n{class_name}样本的预测概率统计:")
        print(f"  平均概率: {class_data['Prediction_Probability'].mean():.4f}")
        print(f"  概率中位数: {class_data['Prediction_Probability'].median():.4f}")
        print(f"  概率标准差: {class_data['Prediction_Probability'].std():.4f}")

    # 5. Save the detailed results
    print("\n" + "="*80)
    print("保存预测结果")
    print("="*80)

    # Full table
    results_full.to_csv('optimized_model_complete_predictions.csv', index=False)
    print("完整预测结果已保存至: optimized_model_complete_predictions.csv")

    # Break predictions
    results_break.to_csv('optimized_model_break_predictions.csv', index=False)
    print("断纱预测结果已保存至: optimized_model_break_predictions.csv")

    # Normal predictions
    results_normal.to_csv('optimized_model_normal_predictions.csv', index=False)
    print("正常预测结果已保存至: optimized_model_normal_predictions.csv")

    # Misclassified samples (written only when there are any)
    error_analysis = pd.concat([false_positives, false_negatives])
    if len(error_analysis) > 0:
        error_analysis.to_csv('optimized_model_error_analysis.csv', index=False)
        print("错误分析结果已保存至: optimized_model_error_analysis.csv")

    # 6. Summary report
    print("\n" + "="*80)
    print("预测结果汇总报告")
    print("="*80)

    # Denominators of the two error rates: the rates are taken over the
    # samples that are ACTUALLY normal / break, not over the samples that were
    # predicted as break / normal.
    actual_normal_total = normal_correct + break_incorrect   # TN + FP
    actual_break_total = break_correct + normal_incorrect    # TP + FN

    summary_report = {
        '总样本数': total_samples,
        '正确预测数': correct_predictions,
        '总体准确率': f"{overall_accuracy:.4f} ({overall_accuracy:.2%})",
        '断纱预测总数': break_total,
        '断纱正确预测数': break_correct,
        '断纱误报数': break_incorrect,
        '断纱预测准确率': f"{break_accuracy:.4f} ({break_accuracy:.2%})",
        '正常预测总数': normal_total,
        '正常正确预测数': normal_correct,
        '正常漏报数': normal_incorrect,
        '正常预测准确率': f"{normal_accuracy:.4f} ({normal_accuracy:.2%})",
        # False positive rate = FP / (FP + TN)
        '误报率 (False Positive Rate)': f"{break_incorrect/actual_normal_total:.4f} ({break_incorrect/actual_normal_total:.2%})" if actual_normal_total > 0 else "N/A",
        # False negative rate = FN / (FN + TP)
        '漏报率 (False Negative Rate)': f"{normal_incorrect/actual_break_total:.4f} ({normal_incorrect/actual_break_total:.2%})" if actual_break_total > 0 else "N/A",
        '使用模型': 'RF_Optimized',
        '最佳参数': str(best_params)
    }

    summary_df = pd.DataFrame(list(summary_report.items()), columns=['指标', '值'])
    print("预测结果汇总:")
    display(summary_df)

    # Save the summary report
    summary_df.to_csv('optimized_model_prediction_summary.csv', index=False)
    print("\n预测汇总报告已保存至: optimized_model_prediction_summary.csv")

    # 7. Visualise the prediction results (optional)
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        print("\n生成预测结果可视化...")

        # Plot style. The labels below are Chinese, so a CJK-capable font
        # (SimHei) is listed first; Arial alone has no glyphs for them.
        plt.style.use('default')
        plt.rcParams.update({
            'font.family': 'sans-serif',
            'font.sans-serif': ['SimHei', 'Arial'],
            'axes.unicode_minus': False,
            'font.weight': 'bold',
            'axes.labelweight': 'bold',
            'axes.titleweight': 'bold',
            'font.size': 10
        })

        # 2x2 grid of diagnostic plots
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # 1. Histogram of the predicted probability
        axes[0,0].hist(results_full['Prediction_Probability'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
        axes[0,0].set_xlabel('预测概率')
        axes[0,0].set_ylabel('频数')
        axes[0,0].set_title('预测概率分布')
        axes[0,0].grid(True, alpha=0.3)

        # 2. Probability histograms of correct vs incorrect predictions
        axes[0,1].hist(correct_probabilities, bins=30, alpha=0.7, label='正确预测', color='green')
        axes[0,1].hist(incorrect_probabilities, bins=30, alpha=0.7, label='错误预测', color='red')
        axes[0,1].set_xlabel('预测概率')
        axes[0,1].set_ylabel('频数')
        axes[0,1].set_title('正确vs错误预测的概率分布')
        axes[0,1].legend()
        axes[0,1].grid(True, alpha=0.3)

        # 3. Pie chart of the predicted classes
        prediction_counts = results_full['Predicted_Label'].value_counts()
        axes[1,0].pie(prediction_counts.values, labels=prediction_counts.index, autopct='%1.1f%%', startangle=90)
        axes[1,0].set_title('预测类别分布')

        # 4. Bar chart of the three accuracy figures
        accuracy_data = [overall_accuracy, break_accuracy, normal_accuracy]
        accuracy_labels = ['总体准确率', '断纱预测准确率', '正常预测准确率']
        bars = axes[1,1].bar(accuracy_labels, accuracy_data, color=['blue', 'red', 'green'])
        axes[1,1].set_ylabel('准确率')
        axes[1,1].set_title('各类别预测准确率')
        axes[1,1].set_ylim(0, 1)

        # Write the numeric value just above each bar
        for bar, acc in zip(bars, accuracy_data):
            height = bar.get_height()
            axes[1,1].text(bar.get_x() + bar.get_width()/2., height + 0.01,
                          f'{acc:.3f}', ha='center', va='bottom', fontweight='bold')

        plt.tight_layout()
        plt.savefig('optimized_model_prediction_analysis.png', dpi=300, bbox_inches='tight')
        print("预测分析图已保存至: optimized_model_prediction_analysis.png")
        plt.show()

    except ImportError:
        print("Matplotlib/Seaborn 不可用，跳过可视化部分")

    print("\n优化模型预测分析完成！")

except FileNotFoundError:
    print("错误: 优化模型文件 'rf_balanced_optimized_latest.pkl' 未找到")
    print("请先运行模型优化代码")

except Exception as e:
    # Top-level boundary of the cell: report the failure with its traceback
    # instead of aborting the notebook run.
    print(f"预测过程中发生错误: {e}")
    import traceback
    traceback.print_exc()

In [None]:
# Save the final model to disk with joblib.
# NOTE(review): the output file is named 'catboost_model.joblib', but
# `final_model` is whatever a previous cell last assigned to that name and may
# not be a CatBoost model — confirm before relying on the file name.

from joblib import dump

dump(final_model, 'catboost_model.joblib')

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import joblib

def preprocess_data(df):
    """
    Split a raw DataFrame into features and target and classify the features.

    The target is the 'broken_status' column. Features are the int64/float64
    columns, the object/category columns and every existing spindle column
    named d_dw<i>, v_dw<i> or p_dw<i> (i = 1..100). Each feature is listed
    exactly once, even when a spindle column is also picked up by its dtype.

    Parameters:
        df: input DataFrame; must contain a 'broken_status' column.

    Returns:
        (X, y, categorical_features, numerical_features) where X is
        ``df`` restricted to the feature columns, y is the target Series, and
        the two lists partition X's columns. A feature is categorical when it
        has object/category dtype or its name starts with 'd_dw'.
    """
    target_col = 'broken_status'
    y = df[target_col]

    # 1. Columns selected by dtype. The target is excluded from both groups so
    #    it can never leak into the features, whatever its dtype.
    num_cols = [col for col in df.select_dtypes(include=['int64', 'float64']).columns
                if col != target_col]
    cat_cols = [col for col in df.select_dtypes(include=['object', 'category']).columns
                if col != target_col]

    # 2. Spindle columns that actually exist in the frame.
    spindle_cols = [
        f"{prefix}{i}"
        for prefix in ('d_dw', 'v_dw', 'p_dw')
        for i in range(1, 101)
        if f"{prefix}{i}" in df.columns
    ]

    # 3. Merge the three groups. Spindle columns usually already appear in
    #    num_cols or cat_cols, so duplicates are removed while keeping the
    #    first-seen order; otherwise X would contain repeated columns.
    feature_cols = list(dict.fromkeys(num_cols + cat_cols + spindle_cols))

    # 4. Classify: dtype-categorical columns plus the d_dw* status columns are
    #    categorical, everything else is numerical.
    cat_lookup = set(cat_cols)
    categorical_features = [col for col in feature_cols
                            if col in cat_lookup or col.startswith('d_dw')]
    categorical_lookup = set(categorical_features)
    numerical_features = [col for col in feature_cols if col not in categorical_lookup]

    # Feature frame
    X = df[feature_cols]

    return X, y, categorical_features, numerical_features

def build_preprocessor(categorical_features, numerical_features):
    """
    Build the ColumnTransformer that prepares the features for the model.

    Parameters:
        categorical_features: names of the columns to impute with their most
            frequent value and then one-hot encode.
        numerical_features: names of the columns to impute with their median
            and then standardise.

    Returns:
        An unfitted ColumnTransformer. A group whose list is empty is left
        out; columns not named in either list are dropped.
    """
    # Numerical pipeline: median imputation, then standard scaling
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Categorical pipeline: most-frequent imputation, then one-hot encoding.
    # No fill_value is passed: SimpleImputer only uses it with
    # strategy='constant', so it had no effect here and suggested a 'missing'
    # category that was never created.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))  # unseen categories encode as all zeros
    ])

    # Register only the groups that actually have columns
    transformers = []
    if numerical_features:
        transformers.append(('num', numeric_transformer, numerical_features))
    if categorical_features:
        transformers.append(('cat', categorical_transformer, categorical_features))

    preprocessor = ColumnTransformer(
        transformers=transformers,
        remainder='drop'  # ignore columns that no transformer handles
    )

    return preprocessor

def grid_search_tuning(X, y, preprocessor, param_grid=None, cv=5):
    """
    Tune a preprocessing + RandomForestRegressor pipeline with a grid search.

    Parameters:
        X: training features.
        y: training target.
        preprocessor: transformer used as the first pipeline step.
        param_grid: optional GridSearchCV parameter grid; keys must address
            the 'regressor' step (e.g. 'regressor__max_depth'). When None the
            default grid below is used.
        cv: number of cross-validation folds (default 5).

    Returns:
        The fitted GridSearchCV object (scored by negative mean squared error).
    """
    # Pipeline: preprocessing followed by the random-forest regressor
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42))
    ])

    if param_grid is None:
        # Default grid: 3 * 4 * 3 * 3 * 3 * 2 = 648 combinations, i.e. 3240
        # fits with 5 folds. Pass a smaller param_grid when that is too slow.
        param_grid = {
            'regressor__n_estimators': [50, 100, 200],
            'regressor__max_depth': [None, 10, 20, 30],
            'regressor__min_samples_split': [2, 5, 10],
            'regressor__min_samples_leaf': [1, 2, 4],
            'regressor__max_features': ['sqrt', 'log2', 0.5],
            'regressor__bootstrap': [True, False]
        }

    # Grid-search object
    grid_search = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=cv,  # k-fold cross-validation
        n_jobs=-1,  # use all CPU cores
        verbose=2,
        scoring='neg_mean_squared_error'
    )

    print("开始网格搜索调优...")
    grid_search.fit(X, y)

    return grid_search

def main():
    """
    Run the whole workflow: load the CSV, build features, tune the
    random-forest pipeline on a training split, report test-set MSE and R²,
    show the 20 most important features and save the best pipeline to disk.
    """
    # Load the data - replace with your actual data path
    raw_frame = pd.read_csv("D:/code/junma/600000/0424/1.csv")

    # Feature/target extraction and feature typing
    features, target, categorical_features, numerical_features = preprocess_data(raw_frame)
    print("数值特征:", numerical_features)
    print("分类特征:", categorical_features)

    preprocessor = build_preprocessor(categorical_features, numerical_features)

    # Hold out 20% of the rows for the final evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Hyper-parameter search on the training split only
    search = grid_search_tuning(X_train, y_train, preprocessor)

    print("\n最佳参数组合:")
    print(search.best_params_)

    # Evaluate the refitted best pipeline on the held-out rows
    tuned_pipeline = search.best_estimator_
    predictions = tuned_pipeline.predict(X_test)
    test_mse, test_r2 = mean_squared_error(y_test, predictions), r2_score(y_test, predictions)

    print(f"\n测试集MSE: {test_mse:.4f}")
    print(f"测试集R²: {test_r2:.4f}")

    # Feature importances (best effort: any failure is reported, not raised)
    try:
        steps = tuned_pipeline.named_steps
        importance_table = pd.DataFrame({
            'Feature': steps['preprocessor'].get_feature_names_out(),
            'Importance': steps['regressor'].feature_importances_
        })
        top_features = importance_table.sort_values('Importance', ascending=False).head(20)

        print("\n最重要的20个特征:")
        print(top_features)

        # Horizontal bar chart with the most important feature at the top
        plt.figure(figsize=(10, 6))
        plt.barh(top_features['Feature'], top_features['Importance'])
        plt.xlabel('Importance')
        plt.title('Top 20 Feature Importances')
        plt.gca().invert_yaxis()
        plt.show()
    except Exception as exc:
        print(f"\n无法获取特征重要性: {exc}")

    # Persist the best pipeline
    joblib.dump(tuned_pipeline, 'random_forest_regressor.pkl')
    print("\n模型已保存为 'random_forest_regressor.pkl'")

# Run the workflow only when this code is executed as the main module.
if __name__ == "__main__":
    main()

# <div style="padding: 30px; color:white; margin:10; font-size:150%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>21 |</span></b> <b>Conclusion</b></div>

<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <style>
        body {
            font-family: 'Verdana', sans-serif;
            background-color: #e6f7ff; /* Light blue background */
            color: #333333; /* Dark grey text */
            margin: 0;
            padding: 0;
            overflow-x: hidden; /* Prevent horizontal scrolling */
        }
        .container {
            max-width: 100%; /* Full width */
            margin: 40px auto;
            padding: 30px;
            border: 2px solid #008080; /* Teal border */
            border-radius: 15px;
            background-color: rgba(230, 247, 255, 0.6); /* Slightly transparent light blue background */
        }
        p {
            font-size: 18px;
            line-height: 1.6;
            margin: 20px 0;
        }
    </style>
</head>
<body>
    <div class="container">
        <p>In this study, we delved deep into optimizing biogas production in U.S. livestock farms using various machine learning models. The dataset encompassed various livestock types such as cattle, dairy cows, poultry, and swine.</p>
        <p>💡 <strong>Key Findings:</strong><br>
        Among all models, the LightGBM model emerged as the most proficient, boasting an R<sup>2</sup> score of 0.893, indicating that it can explain 89.3% of the variability in biogas production. Notably, it achieved the lowest RMSE of 114,838.761, underscoring its accuracy. The CatBoost model also showcased a commendable performance with an R<sup>2</sup> score of 0.852, although its execution time was relatively higher at 58.845 seconds.</p>
        <p>🚀 <strong>Implications:</strong><br>
        These findings pave the way for:</p>
        <ul>
            <li>Optimized Farming Strategies: Farmers can optimize livestock types for maximum biogas production, increasing profitability.</li>
            <li>Investment Decisions: The insights serve as a tool for energy companies and investors to allocate resources efficiently for biogas production.</li>
            <li>Sustainable Energy Goals: Policymakers can use this approach for scalable, sustainable energy solutions, contributing to environmental conservation.</li>
        </ul>
        <p>⏱ <strong>Future Work:</strong><br>
        While the models, especially LightGBM, displayed promising outcomes, there's scope for further refinement. Future studies might involve more advanced machine learning techniques or additional features to enhance accuracy and reliability.</p>
    </div>
</body>
</html>


<div style="text-align: center; padding: 60px; background: url('https://source.unsplash.com/800x600?energy') no-repeat center/cover; border: 5px solid #FFEB3B; border-radius: 35px; box-shadow: 0 15px 25px rgba(0, 0, 0, 0.2);">
    <p style="font-size: 26px; margin-bottom: 35px; color: #000000; font-family: 'Arial', sans-serif; font-weight: bold; text-transform: uppercase; letter-spacing: 3px; text-shadow: 4px 4px 8px rgba(0, 0, 0, 0.5); animation: slide 3s infinite alternate;">
        Click for the Next Analysis => U.S. Farm Biogas ML Prediction (Dairy Cow Farms)
    </p>
    <a href="https://www.kaggle.com/code/mehmetisik/u-s-farm-biogas-ml-prediction-dairy-cow-farms" target="_blank" style="text-decoration: none; display: inline-block; padding: 15px 30px; font-size: 24px; color: #673AB7; background-color: #FFFFFF; border-radius: 50px; transition: transform 0.3s ease; box-shadow: 0 6px 12px rgba(0, 0, 0, 0.2);">👉</a>
    <style>
        @keyframes pulsate {
            0% { transform: scale(1); }
            50% { transform: scale(1.08); }
            100% { transform: scale(1); }
        }
        @keyframes slide {
            0% { transform: translateX(-10px); }
            100% { transform: translateX(10px); }
        }
        a:hover {
            transform: translateY(-5px);
            box-shadow: 0 8px 16px rgba(0, 0, 0, 0.3);
        }
    </style>
</div>