### 1.洗床数据清洗与整合

In [20]:
import pandas as pd
import re
import calendar
import os
from time import sleep
from tqdm import tqdm

# Build RegenerationRawInfo.csv: one row per whitespace-separated text
# fragment found in the regeneration cells of every daily sheet.

# Start from a clean output file so that appended rows never pile up on top
# of a previous run.
if os.path.exists('RegenerationRawInfo.csv'):
    os.remove('RegenerationRawInfo.csv')

# Column positions (header=None, so labels equal positions) that are read
# from the selected sheet row.
valid_columns = [2, 5, 8]
years = [2019, 2020, 2021, 2022]

# Write the header line once; data rows are appended below without a header.
regeneration_df = pd.DataFrame({'date': [], 'regeneration': []})
regeneration_df.to_csv("RegenerationRawInfo.csv", mode='a', index=False)

# (date, fragment) pairs collected in memory and appended in a single write,
# instead of reopening the CSV once per fragment.
raw_rows = []

for year in tqdm(years, desc="year"):
    for month in tqdm(range(1, 13, 1), desc="month"):
        file_name = '干熄焦生产记录台账/{0}/干熄焦生产记录台账（{1}月）.xlsx'.format(year, month)

        # calendar.monthrange returns (weekday of the first day, number of
        # days in the month); only the day count is needed here.
        days_in_month = calendar.monthrange(int(year), int(month))[1]

        # The row holding the regeneration cells sits at a different position
        # in sheets up to and including 2020-09 than in later sheets. The
        # choice depends only on (year, month), so it is made once per file.
        if year < 2020 or (year == 2020 and month <= 9):
            valid_index1, valid_index2 = 25, 26
        else:
            valid_index1, valid_index2 = 27, 28

        # Open the workbook once per month rather than once per day.
        try:
            workbook = pd.ExcelFile(file_name)
        except FileNotFoundError as file_error:
            # A missing month is reported and skipped.
            print(file_error)
            continue

        with workbook:
            # One sheet per day, named "<day>日", from day 1 to month end.
            for day in range(1, days_in_month + 1):
                regeneration_date = workbook.parse(sheet_name="{}日".format(day), header=None)

                # Keep only the cells that hold regeneration notes, as text.
                regeneration_cell = regeneration_date.iloc[
                    valid_index1:valid_index2, valid_columns
                ].astype(str)

                date_label = "{0}-{1}-{2}".format(year, month, day)

                for valid_column in valid_columns:
                    # Split the cell text on runs of whitespace; each piece
                    # becomes its own output row.
                    regeneration_split_cell = regeneration_cell[valid_column].str.split(
                        r'\s+', expand=True
                    )
                    if regeneration_split_cell.empty:
                        # The sheet is shorter than the expected row index.
                        continue
                    # The row slice selects at most one row, so the first row
                    # holds every fragment of this cell.
                    for fragment in regeneration_split_cell.iloc[0]:
                        raw_rows.append((date_label, str(fragment)))

if raw_rows:
    pd.DataFrame(raw_rows).to_csv(
        "RegenerationRawInfo.csv", mode='a', header=False, index=False
    )

# Drop empty rows. The CSV round trip turns blank fragments and the literal
# text "nan" into missing values, which dropna then removes.
regeneration_raw_info = pd.read_csv("RegenerationRawInfo.csv")
regeneration_raw_info.dropna(axis='rows', inplace=True)
# drop=True avoids inserting the old index as a new column.
regeneration_raw_info.reset_index(drop=True, inplace=True)

regeneration_raw_info.to_csv("RegenerationRawInfo.csv", mode='w', index=False)

year:   0%|          | 0/4 [00:00<?, ?it/s]
month:   0%|          | 0/12 [00:00<?, ?it/s][A
month:   8%|▊         | 1/12 [00:04<00:45,  4.15s/it][A
month:  17%|█▋        | 2/12 [00:08<00:40,  4.01s/it][A
month:  25%|██▌       | 3/12 [00:13<00:42,  4.67s/it][A
month:  33%|███▎      | 4/12 [00:18<00:39,  4.91s/it][A
month:  42%|████▏     | 5/12 [00:24<00:35,  5.09s/it][A
month:  50%|█████     | 6/12 [00:29<00:30,  5.09s/it][A
month:  58%|█████▊    | 7/12 [00:34<00:25,  5.20s/it][A
month:  67%|██████▋   | 8/12 [00:40<00:20,  5.23s/it][A
month:  75%|███████▌  | 9/12 [00:44<00:15,  5.05s/it][A
month:  83%|████████▎ | 10/12 [00:49<00:10,  5.02s/it][A
month:  92%|█████████▏| 11/12 [00:54<00:04,  4.90s/it][A
month: 100%|██████████| 12/12 [00:59<00:00,  4.96s/it][A
year:  25%|██▌       | 1/4 [00:59<02:58, 59.51s/it]
month:   0%|          | 0/12 [00:00<?, ?it/s][A
month:   8%|▊         | 1/12 [00:05<01:02,  5.72s/it][A
month:  17%|█▋        | 2/12 [00:10<00:52,  5.26s/it][A
month

[Errno 2] No such file or directory: '干熄焦生产记录台账/2022/干熄焦生产记录台账（12月）.xlsx'





### 2.洗床数据分割

In [131]:
# Split every raw regeneration note into a time part and a resin-bed part and
# store the result in RegenerationSplitInfo.csv.
rg_info = pd.read_csv("RegenerationRawInfo.csv")

# The converted result is intentionally discarded: this call only raises if a
# date string cannot be parsed, and rg_info.date itself stays text.
pd.to_datetime(rg_info.date)

# Split on the separators that appear between the time and the bed name.
rg_split = rg_info['regeneration'].str.split(r' ?分? ?再生|，|进|置换', regex=True, expand=True)

rg_split['time'] = rg_split[0]
rg_split['cation_resin'] = rg_split[1]

rg_split_draft = pd.concat([rg_info.date, rg_split['time'], rg_split['cation_resin']], axis=1)

# regex=True makes this a substring replacement. Without it, DataFrame.replace
# only swaps cells whose entire value equals "：", so full-width colons
# embedded in times such as "8：30" were left untouched.
rg_split_draft.replace("：", ":", regex=True, inplace=True)
rg_split_draft.to_csv("RegenerationSplitInfo.csv", index=False)

In [132]:
# Reload the split table (columns: date, time, cation_resin) from disk.
rg_draft = pd.read_csv("RegenerationSplitInfo.csv")

1#阳

In [133]:
# Records whose third column (the resin-bed text) mentions "1#阳床".
# NOTE(review): the cells for beds 2 and 3 match on '2#阳' / '3#阳' and also
# normalise '.' and '::' - confirm the stricter handling here is intentional.
bed_1_rows = rg_draft.iloc[:, 2].str.contains(r'1#阳床', regex=True, na=False)
cation_resin_1 = rg_draft.loc[bed_1_rows]
cation_resin_1 = cation_resin_1[cation_resin_1['cation_resin'].notnull()].copy()

# Normalise the various hour/minute separators to an ASCII colon.
for separator_pattern in (r'：', r';', r'点'):
    cation_resin_1.replace(separator_pattern, ":", regex=True, inplace=True)

# Combine the date and time text into one timestamp per record.
wash_times_1 = pd.to_datetime(cation_resin_1['date'] + " " + cation_resin_1['time'])
cation_resin_1['time'] = wash_times_1

# Interval between consecutive regenerations, in row order.
cation_resin_1['time_diff'] = wash_times_1.diff()

cation_resin_1.to_csv("cation_resin_1.csv", index=False)

2#阳

In [134]:
# Records whose third column (the resin-bed text) mentions "2#阳".
bed_2_rows = rg_draft.iloc[:, 2].str.contains(r'2#阳', regex=True, na=False)
cation_resin_2 = rg_draft.loc[bed_2_rows]
cation_resin_2 = cation_resin_2[cation_resin_2['cation_resin'].notnull()].copy()

# Normalise the various hour/minute separators to an ASCII colon. The final
# pattern collapses the doubled colons the earlier replacements can create,
# so the order of the patterns matters.
for separator_pattern in (r'：', r';', r'点', r'\.', r'::'):
    cation_resin_2.replace(separator_pattern, ":", regex=True, inplace=True)

# Combine the date and time text into one timestamp per record.
wash_times_2 = pd.to_datetime(cation_resin_2['date'] + " " + cation_resin_2['time'])
cation_resin_2['time'] = wash_times_2

# Interval between consecutive regenerations, in row order.
cation_resin_2['time_diff'] = wash_times_2.diff()

cation_resin_2.to_csv("cation_resin_2.csv", index=False)

3#阳

In [135]:
# Records whose third column (the resin-bed text) mentions "3#阳".
bed_3_rows = rg_draft.iloc[:, 2].str.contains(r'3#阳', regex=True, na=False)
cation_resin_3 = rg_draft.loc[bed_3_rows]
cation_resin_3 = cation_resin_3[cation_resin_3['cation_resin'].notnull()].copy()

# Normalise the various hour/minute separators to an ASCII colon. The final
# pattern collapses the doubled colons the earlier replacements can create,
# so the order of the patterns matters.
for separator_pattern in (r'：', r';', r'点', r'\.', r'::'):
    cation_resin_3.replace(separator_pattern, ":", regex=True, inplace=True)

# Combine the date and time text into one timestamp per record.
wash_times_3 = pd.to_datetime(cation_resin_3['date'] + " " + cation_resin_3['time'])
cation_resin_3['time'] = wash_times_3

# Interval between consecutive regenerations, in row order.
cation_resin_3['time_diff'] = wash_times_3.diff()

cation_resin_3.to_csv("cation_resin_3.csv", index=False)

### 3.异常数据处理

2021-01-14 中间间隔143天40小时系1#阳床树脂泄漏维护，故剔除该行。

In [136]:
# Remove the single bed-1 record with the largest interval to its predecessor.
# The remaining intervals are not recomputed afterwards.
longest_gap_label = cation_resin_1['time_diff'].idxmax()
cation_resin_1.drop(index=[longest_gap_label], inplace=True)

#### 1#阳床异常数据处理：
1.删除使用时长不小于60h或不大于6h的数据，即仅保留 6h < 间隔 < 60h 的记录。（床体维护）

2.大于50h的数据乘以0.9备用系数。（床体备用）

In [137]:
# Express the regeneration intervals of all three beds in hours, rounded to
# two decimals. The timedelta column is overwritten with floats, so this cell
# can only be run once per freshly built frame.
for resin_frame in (cation_resin_1, cation_resin_2, cation_resin_3):
    elapsed_seconds = resin_frame['time_diff'].dt.total_seconds()
    resin_frame['time_diff'] = (elapsed_seconds / 60 / 60).round(2)

In [138]:
# Keep only bed-1 intervals strictly between 6 h and 60 h.
gap_hours_1 = cation_resin_1['time_diff']
authentic_cs_1 = cation_resin_1.loc[(gap_hours_1 < 60) & (gap_hours_1 > 6)]

# Work on an independent copy of the interval column.
authentic_cs_1_td = authentic_cs_1['time_diff'].copy()

# Intervals above 50 h are scaled by the 0.9 standby factor.
standby_rows_1 = authentic_cs_1['time_diff'] > 50
authentic_cs_1_td.loc[standby_rows_1] = authentic_cs_1_td.loc[standby_rows_1] * 0.9

authentic_cs_1_td.to_csv("authentic_cs_1.csv", index=False)

In [139]:
# Summary statistics of the cleaned bed-1 intervals (hours).
authentic_cs_1_td.describe()

count    772.000000
mean      31.271142
std       11.804803
min        6.500000
25%       21.500000
50%       31.600000
75%       40.980000
max       53.892000
Name: time_diff, dtype: float64

In [140]:
# Keep only bed-2 intervals strictly between 6 h and 60 h.
gap_hours_2 = cation_resin_2['time_diff']
authentic_cs_2 = cation_resin_2.loc[(gap_hours_2 < 60) & (gap_hours_2 > 6)]

# Work on an independent copy of the interval column.
authentic_cs_2_td = authentic_cs_2['time_diff'].copy()

# Intervals above 50 h are scaled by the 0.9 standby factor.
standby_rows_2 = authentic_cs_2['time_diff'] > 50
authentic_cs_2_td.loc[standby_rows_2] = authentic_cs_2_td.loc[standby_rows_2] * 0.9

authentic_cs_2_td.to_csv("authentic_cs_2.csv", index=False)

In [141]:
# Summary statistics of the cleaned bed-2 intervals (hours).
authentic_cs_2_td.describe()

count    809.000000
mean      31.815304
std       10.725194
min        7.000000
25%       23.500000
50%       31.970000
75%       39.800000
max       53.640000
Name: time_diff, dtype: float64

In [142]:
# Keep only bed-3 intervals strictly between 6 h and 60 h.
gap_hours_3 = cation_resin_3['time_diff']
authentic_cs_3 = cation_resin_3.loc[(gap_hours_3 < 60) & (gap_hours_3 > 6)]

# Work on an independent copy of the interval column.
authentic_cs_3_td = authentic_cs_3['time_diff'].copy()

# Intervals above 50 h are scaled by the 0.9 standby factor.
standby_rows_3 = authentic_cs_3['time_diff'] > 50
authentic_cs_3_td.loc[standby_rows_3] = authentic_cs_3_td.loc[standby_rows_3] * 0.9

authentic_cs_3_td.to_csv("authentic_cs_3.csv", index=False)

In [93]:
# Summary statistics of the cleaned bed-3 intervals (hours).
authentic_cs_3_td.describe()

count    588.000000
mean      34.404155
std       11.569159
min        6.080000
25%       25.327500
50%       34.050000
75%       45.450000
max       53.847000
Name: time_diff, dtype: float64