In [127]:
import os
import re
import pathlib
import logging
import logging.config

import pandas as pd

import my_logger

# Apply the logging configuration dict exposed by the local my_logger module,
# then create the logger used by every cell below.
logging.config.dictConfig(my_logger.dic_log_cfg)
logger = logging.getLogger(__name__)

In [128]:
import gen_data

# Generate the runtable CSV files consumed by the cells below.
# NOTE(review): gen_data is a project-local module; judging from the logged
# file and row counts the arguments look like (total rows, rows per file,
# output folder) - confirm against gen_data itself.
# gen_data.gen_data(2000, 2000, "Test_Stage_firstlot")
# gen_data.gen_data(1200, 1200, "Test_Stage_exist")
gen_data.gen_data(1000, 500, "Test_Stage/result")
# The folder must be spelled exactly like f"{STAGE}_{CODE_NAME}/result"
# ("Test_Stage_Test_Cover/result"); a differently-cased name only works on
# case-insensitive filesystems and makes _check_dir fail elsewhere.
gen_data.gen_data(1500, 500, "Test_Stage_Test_Cover/result")

In [129]:
# Settings
# STAGE and CODE_NAME are combined into the two result folders that are
# compared: "<STAGE>/result" and "<STAGE>_<CODE_NAME>/result".
STAGE = "Test_Stage"
LAYER = "Test_Layer"
CODE_NAME = "Test_Cover"

# CSV files read from the current working directory.
TRAINING_DATA = "training_data.csv"
TESTING_DATA = "testing_data.csv"
# Name of the timestamp column in the testing data.
_TIME = "_time"
# Cut-off timestamp: testing rows strictly before it form the "before" set,
# all remaining rows form the "after" set.
DATE = "2024/07/01 00:00:00"

# Columns that together identify one runtable entry; used as merge keys.
fix_cols = ["Part", "Tool", "Prev1Tool"]
# Value columns compared between tables (the names contain a literal newline).
soc_part_cols = ["value\n1", "value\n2", "value\n3"]

# Per-column scale factors and spec limits. Both lists are zipped with
# soc_part_cols, so only their first three entries are used in this notebook.
lst_soc_ratio = [1, 1, 150, 150, 150, 150, 15, 15, 15, 15]
lst_soc_spec = [2, 2, 0.015, 0.015, 0.015, 0.015, 0.15, 0.15, 0.15, 0.15]

In [130]:
_concat = lambda x: "_".join(x)

def _check_dir(stage: str, code_name: str):
    old_dir_name = f'{stage}/result'
    new_dir_name = f'{stage}_{code_name}/result'
    if not pathlib.Path(old_dir_name).is_dir():
        raise LookupError(f'Folder {old_dir_name} not found')
    if not pathlib.Path(new_dir_name).is_dir():
        raise LookupError(f'Folder {new_dir_name} not found')
    logger.debug("Check Folder OK")

def _check_runtable(stage: str, code_name: str):
    old_run_name = f'{stage}/result/runtable_0.csv'
    new_run_name = f'{stage}_{code_name}/result/runtable_0.csv'
    if not pathlib.Path(old_run_name).is_file():
        raise LookupError(f'File {old_run_name} not found')
    if not pathlib.Path(new_run_name).is_file():
        raise LookupError(f'File {new_run_name} not found')
    logger.debug("Check Runtable OK")
    
def _check_file(training_data: str, testint_data: str):
    if not pathlib.Path(training_data).is_file():
        raise LookupError(f'File {training_data} not found')
    if not pathlib.Path(testint_data).is_file():
        raise LookupError(f'File {testint_data} not found')
    logger.debug("Check File OK")

def check_file(param: dict):
    """Run every input check: result folders, first runtables, data files.

    Args:
        param: Settings dict with the keys ``STAGE``, ``CODE_NAME``,
            ``TRAINING_DATA`` and ``TESTING_DATA``.

    Raises:
        LookupError: Propagated from the first check that fails.
    """
    stage, code_name = param["STAGE"], param["CODE_NAME"]
    for folder_check in (_check_dir, _check_runtable):
        folder_check(stage, code_name)
    _check_file(param["TRAINING_DATA"], param["TESTING_DATA"])

In [131]:
def _concat_csv(lst_file: list, dir_name: str) ->pd.DataFrame:
    df_res = pd.DataFrame()
    for _file in lst_file:
        df_file = pd.read_csv(f'{dir_name}/{_file}')
        df_res = pd.concat([df_res, df_file])
    logger.info(f'Total {df_res.shape[0]} runtable in {dir_name}')
    return df_res

def _get_file_list(dir_name: str, file_type: str) -> list:
    chk_run = re.compile("runtable_\d+.csv")
    lst_file = [str(_file.name) for _file in list(pathlib.Path(dir_name).glob(f'*.{file_type}'))]
    lst_res = [_file for _file in lst_file if chk_run.match(_file)]
    logger.info(f'Found {len(lst_res)} csv in {dir_name}')
    return lst_res

def concat_run(param: dict):
    """Combine the runtable CSVs of both result folders and save the results.

    The combined tables are written to ``_data/<STAGE>_run.csv`` and
    ``_data/<STAGE>_<CODE_NAME>_run.csv`` (the ``_data`` folder is created
    when missing).

    Args:
        param: Settings dict with the keys ``STAGE`` and ``CODE_NAME``.

    Returns:
        Tuple ``(df_old_run, df_new_run)`` of the combined runtables.
    """
    os.makedirs("_data", exist_ok=True)
    stage = param["STAGE"]
    code_name = param["CODE_NAME"]
    run_dirs = (f'{stage}/result', f'{stage}_{code_name}/result')

    # List both folders before reading either one, as the log order relies on it.
    file_lists = [_get_file_list(run_dir, "csv") for run_dir in run_dirs]
    df_old_run, df_new_run = [
        _concat_csv(files, run_dir) for files, run_dir in zip(file_lists, run_dirs)
    ]

    df_old_run.to_csv(f'_data/{stage}_run.csv', index=False)
    df_new_run.to_csv(f'_data/{stage}_{code_name}_run.csv', index=False)
    return df_old_run, df_new_run

In [132]:
def cal_run_diff(df_old_run, df_new_run, soc_part_cols, lst_soc_ratio, key_cols=None):
    """Compare the value columns of two runtables on their common keys.

    Both tables are inner-merged on the key columns. For every value column a
    ``<col>_diff`` (old minus new) and a ``<col>_mae`` (absolute difference)
    column is added, and the mean absolute error is logged both raw and
    multiplied by the column's ratio.

    Args:
        df_old_run: Baseline runtable.
        df_new_run: Runtable to compare against the baseline.
        soc_part_cols: Value columns present in both tables.
        lst_soc_ratio: Scale factors, paired with ``soc_part_cols`` by
            position; surplus entries of the longer list are ignored.
        key_cols: Merge keys. Defaults to the module-level ``fix_cols``.

    Returns:
        The merged DataFrame including the added ``_diff`` / ``_mae`` columns
        (previously the function computed them and returned nothing).
    """
    suffixes = ("_old", "_new")
    if key_cols is None:
        key_cols = fix_cols
    df_diff = pd.merge(df_old_run, df_new_run, how='inner', on=list(key_cols), suffixes=suffixes)

    for col, _ratio in zip(soc_part_cols, lst_soc_ratio):
        # Column names may contain newlines; flatten them for the log line.
        _col = col.replace("\n", " ")
        df_diff[f'{col}_diff'] = df_diff[col + suffixes[0]] - df_diff[col + suffixes[1]]
        df_diff[f'{col}_mae'] = df_diff[f'{col}_diff'].abs()
        _mae = df_diff[f'{col}_mae'].mean()
        logger.debug(f'{_col} MAE: {_mae:.6f}, in nm: {_mae*_ratio:.6f}')
    return df_diff
        
def validation_ocap(df, lst_soc_spec):
    """Tell whether one row exceeds the spec on any value column.

    For every name in the module-level ``soc_part_cols`` the row's
    ``<col>_Rn`` and ``<col>_ML`` values are compared against the spec at the
    same position in ``lst_soc_spec``.

    Args:
        df: A single row supporting ``df[key]`` lookup (e.g. a Series passed
            by ``DataFrame.apply(..., axis=1)``).
        lst_soc_spec: Spec limits paired with ``soc_part_cols`` by position.

    Returns:
        True when at least one absolute difference is strictly greater than
        its spec, otherwise False.
    """
    exceeded = [
        bool(abs(df[f'{col}_Rn'] - df[f'{col}_ML']) > spec)
        for col, spec in zip(soc_part_cols, lst_soc_spec)
    ]
    return any(exceeded)

In [133]:
# Bundle the settings consumed by check_file / concat_run.
param = dict(
    STAGE=STAGE,
    LAYER=LAYER,
    CODE_NAME=CODE_NAME,
    TRAINING_DATA=TRAINING_DATA,
    TESTING_DATA=TESTING_DATA,
)

# Fail early when an expected folder or file is missing.
check_file(param)

df_train = pd.read_csv(f'./{TRAINING_DATA}')
df_test = pd.read_csv(f'./{TESTING_DATA}')
df_old_run, df_new_run = concat_run(param)

# Add a single-string key built from the identifying columns to each runtable.
for _df_run in (df_old_run, df_new_run):
    _df_run.loc[:, "COMB"] = _df_run[fix_cols].apply(_concat, axis=1)

for _label, _frame in (
    ("Training Data", df_train),
    ("Testing Data", df_test),
    ("Old Runtable", df_old_run),
    ("New Runtable", df_new_run),
):
    logger.info(f'{_label}: {_frame.shape}')

cal_run_diff(df_old_run, df_new_run, soc_part_cols, lst_soc_ratio)

2024-07-22 00:51:55 [D][636599369.py][_check_dir:  10] Check Folder OK
2024-07-22 00:51:55 [D][636599369.py][_check_runtable:  19] Check Runtable OK
2024-07-22 00:51:55 [D][636599369.py][_check_file:  26] Check File OK
2024-07-22 00:51:55 [I][1200119199.py][_get_file_list:  13] Found 2 csv in Test_Stage/result
2024-07-22 00:51:55 [I][1200119199.py][_get_file_list:  13] Found 3 csv in Test_Stage_Test_Cover/result
2024-07-22 00:51:55 [I][1200119199.py][_concat_csv:   6] Total 1000 runtable in Test_Stage/result
2024-07-22 00:51:55 [I][1200119199.py][_concat_csv:   6] Total 1500 runtable in Test_Stage_Test_Cover/result
2024-07-22 00:51:55 [I][947676554.py][  <module>:  16] Training Data: (959, 7)
2024-07-22 00:51:55 [I][947676554.py][  <module>:  17] Testing Data: (2000, 7)
2024-07-22 00:51:55 [I][947676554.py][  <module>:  18] Old Runtable: (1000, 7)
2024-07-22 00:51:55 [I][947676554.py][  <module>:  19] New Runtable: (1500, 7)
2024-07-22 00:51:55 [D][1515922192.py][cal_run_diff:  10] val

In [134]:
# Preprocess Training/Testing Data
df_test[_TIME] = pd.to_datetime(df_test[_TIME])
df_test = df_test.sort_values(by=_TIME, ascending=True)
cond_before_date = df_test[_TIME] < pd.to_datetime(DATE)

# Split the time-sorted testing data at the DATE cut-off.
df_test_before = df_test[cond_before_date]
df_test_after = df_test[~cond_before_date]

# Keep the first row of every key combination.
df_train_comb = df_train.drop_duplicates(subset=fix_cols, keep="first")
df_test_before_comb = df_test_before.drop_duplicates(subset=fix_cols, keep="first")
# Explicit copy: a COMB column is added to this frame below, and writing to a
# frame derived from a filtered slice triggers pandas' SettingWithCopyWarning.
df_test_after_comb = df_test_after.drop_duplicates(subset=fix_cols, keep="first").copy()

logger.info(f'df_train: {df_train.shape}, with {df_train_comb.shape[0]} runtable')
logger.info(f'df_test_before: {df_test_before.shape}, with {df_test_before_comb.shape[0]} runtable')
logger.info(f'df_test_after: {df_test_after.shape}, with {df_test_after_comb.shape[0]} runtable')

# Runtable in Training Data and Testing Data (before the cut-off)
_known_runs = set(df_train_comb[fix_cols].apply(_concat, axis=1))
_known_runs.update(df_test_before_comb[fix_cols].apply(_concat, axis=1))
lst_known_run = list(_known_runs)

# FirstLot Runtable
# NOTE(review): this keeps the "after" combinations that ARE already in the
# known set; confirm that this (rather than the complement) is the intended
# definition of a first lot.
df_test_after_comb["COMB"] = df_test_after_comb[fix_cols].apply(_concat, axis=1)
cond_firstlot = df_test_after_comb["COMB"].isin(lst_known_run)
df_firstlot = df_test_after_comb[cond_firstlot]
logger.info(f'df_firstlot: {df_firstlot.shape}')

# Compute Result
df_old_result = pd.merge(df_firstlot, df_old_run, how="inner", on=fix_cols, suffixes=["_Rn", "_ML"])
df_new_result = pd.merge(df_firstlot, df_new_run, how="inner", on=fix_cols, suffixes=["_Rn", "_ML"])

# Evaluate row by row. A list is used instead of DataFrame.apply(axis=1)
# because apply hands back a DataFrame for an empty input, which cannot be
# assigned to a single column.
df_old_result["OCAP"] = [validation_ocap(_row, lst_soc_spec) for _, _row in df_old_result.iterrows()]
df_new_result["OCAP"] = [validation_ocap(_row, lst_soc_spec) for _, _row in df_new_result.iterrows()]

# A row counts as a success when it did not exceed any spec.
cond_old_success = ~df_old_result["OCAP"].astype(bool)
cond_new_success = ~df_new_result["OCAP"].astype(bool)

# Rates in percent; NaN instead of ZeroDivisionError when a denominator is 0.
_n_firstlot = df_firstlot.shape[0]
_n_old, _n_new = df_old_result.shape[0], df_new_result.shape[0]
old_cr = 100 * _n_old / _n_firstlot if _n_firstlot else float("nan")
new_cr = 100 * _n_new / _n_firstlot if _n_firstlot else float("nan")
old_sr = 100 * df_old_result[cond_old_success].shape[0] / _n_old if _n_old else float("nan")
new_sr = 100 * df_new_result[cond_new_success].shape[0] / _n_new if _n_new else float("nan")

2024-07-22 00:51:55 [I][946419373.py][  <module>:  13] df_train: (959, 7), with 901 runtable
2024-07-22 00:51:55 [I][946419373.py][  <module>:  14] df_test_before: (959, 7), with 897 runtable
2024-07-22 00:51:55 [I][946419373.py][  <module>:  15] df_test_after: (1041, 7), with 983 runtable
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_after_comb["COMB"] = df_test_after_comb[fix_cols].apply(_concat, axis=1)
2024-07-22 00:51:55 [I][946419373.py][  <module>:  26] df_firstlot: (216, 8)


In [135]:
# Row counts shared by the log lines and the summary table.
_total_firstlot = df_firstlot.shape[0]
_total_old = df_old_result.shape[0]
_total_new = df_new_result.shape[0]
_success_old = df_old_result[cond_old_success].shape[0]
_success_new = df_new_result[cond_new_success].shape[0]

logger.info("Coverage Rate")
logger.info(f'Old Coverage Rate: {old_cr:.2f}% [{_total_old}/{_total_firstlot}]')
logger.info(f'New Coverage Rate: {new_cr:.2f}% [{_total_new}/{_total_firstlot}]')

logger.info("Success Rate")
logger.info(f'Old Success Rate: {old_sr:.2f}% [{_success_old}/{_total_old}]')
logger.info(f'New Success Rate: {new_sr:.2f}% [{_success_new}/{_total_new}]')

# One row per runtable variant: the baseline ("old") and the CODE_NAME variant.
dic_result = {
    "Success": [_success_old, _success_new],
    "Total": [_total_old, _total_new],
    "Success Rate": [old_sr, new_sr],
    "Coverage Rate": [old_cr, new_cr],
}
_df_summary = pd.DataFrame(dic_result, index=["old", param["CODE_NAME"]])
_df_summary.to_csv("./result.csv")

2024-07-22 00:51:55 [I][1222453530.py][  <module>:   1] Coverage Rate
2024-07-22 00:51:55 [I][1222453530.py][  <module>:   2] Old Coverage Rate: 15.74% [34/216]
2024-07-22 00:51:55 [I][1222453530.py][  <module>:   3] New Coverage Rate: 15.28% [33/216]
2024-07-22 00:51:55 [I][1222453530.py][  <module>:   5] Success Rate
2024-07-22 00:51:55 [I][1222453530.py][  <module>:   6] Old Success Rate: 26.47% [9/34]
2024-07-22 00:51:55 [I][1222453530.py][  <module>:   7] New Success Rate: 12.12% [4/33]
