In [None]:
import os

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tqdm.auto import tqdm

In [None]:
# Read the reference parquet dataset (all columns).
P_FG_ch1 = pd.read_parquet("./drystrip_dataset/P_FG_60S_1_ref")

# Columns used throughout the notebook: four metadata columns followed by the
# sensor channels of interest.
meta_colums = ['time', 'lotid', 'wafer_number', 'Recipe_Step_Num']
interested_sensors = [
    'APC_Position', 'APC_Pressure', 'Gas1_Monitor', 'Gas6_Monitor',
    'Mat_Irms', 'Mat_Phase', 'Mat_Vrms',
    'Mat_VC1_Position', 'Mat_VC2_Position',
    'SourcePwr_Read', 'Temp', 'Wall_Temp_Monitor',
]

# Metadata first, sensors after: later cells slice the sensor values by position (iloc[:, 4:]).
filterd_df = P_FG_ch1[meta_colums + interested_sensors]

In [None]:
# Read only the metadata and sensor columns of the inference dataset.
inference_columns = meta_colums + interested_sensors
P_FG_ch1_inf = pd.read_parquet(
    "./drystrip_dataset/P_FG_60S_1_inf",
    columns=inference_columns,
)

In [None]:
# Load the per-sensor score table and keep only the sensors analysed in this notebook.
# NOTE(review): this path starts with "../" while the sensor datasets are read from
# "./drystrip_dataset" — confirm both locations are intended.
score_df = pd.read_parquet("../drystrip_dataset/score_results.parquet")
score_df = score_df[score_df['sensor'].isin(interested_sensors)]

# Keep only recipe 'P_FG_60S', stage '1'. The explicit .copy() makes this an independent
# frame, so the column assignments below do not raise SettingWithCopyWarning.
is_target = (score_df['recipe'] == 'P_FG_60S') & (score_df['stage'] == '1')
target_score_df = score_df[is_target].copy()

# Recover the wafer keys from the file name stored in `fn`:
#   lotid        = underscore-separated tokens 4 and 5, re-joined with "_"
#   wafer_number = last token without its final 5 characters, as an integer
#                  (presumably a file extension is stripped — TODO confirm)
fn_tokens = target_score_df['fn'].str.split('_')
target_score_df['lotid'] = fn_tokens.str[4:6].str.join('_')
target_score_df['wafer_number'] = fn_tokens.str[-1].str[:-5].astype('int64')
target_score_df.head()

In [None]:
# Wafers present in both the inference data and the score table.
# Only the distinct (lotid, wafer_number) pairs are joined: merging the full frames would pair
# every sensor row of a wafer with every score row of that wafer just to deduplicate the keys.
wafer_keys = ['lotid', 'wafer_number']
inf_wafers = P_FG_ch1_inf[wafer_keys].drop_duplicates()
scored_wafers = target_score_df[wafer_keys].drop_duplicates()
unique_lot_wafer = inf_wafers.merge(scored_wafers, on=wafer_keys, how='inner')
print('매칭된 총 웨이퍼 수',unique_lot_wafer.shape[0])

In [None]:
# Keep only the inference rows whose (lotid, wafer_number) pair was matched to a score entry.
wafer_keys = ['lotid', 'wafer_number']
inf_pair_index = P_FG_ch1_inf.set_index(wafer_keys).index
matched_pair_index = unique_lot_wafer.set_index(wafer_keys).index
test_df = P_FG_ch1_inf[inf_pair_index.isin(matched_pair_index)]

# reference 데이터 기간 늘리는 기준 선정

In [None]:
import pandas as pd

# Minimum score every sensor of a wafer must reach for the wafer to qualify.
SCORE_THRESHOLD = 95

# One group per wafer, identified by (lotid, wafer_number).
grouped = target_score_df.groupby(['lotid', 'wafer_number'])

# Row mask that is True for every row of a wafer whose sensor scores are all >= the threshold.
mask = grouped['sensor_score'].transform(lambda scores: (scores >= SCORE_THRESHOLD).all())
qualified_df = target_score_df[mask]

# Report the number of distinct score files (`fn`) belonging to qualifying wafers.
print(f"모든 센서가 {SCORE_THRESHOLD}점 이상인 웨이퍼 수: {qualified_df['fn'].nunique()}")

In [None]:
import matplotlib.pyplot as plt
# One row per qualifying wafer, tagged with the calendar date of its `time` value.
date_df = qualified_df.drop_duplicates(subset=['lotid', 'wafer_number']).copy()
date_df['date'] = date_df['time'].dt.date

# Bar chart: number of qualifying wafers per day, in chronological order.
wafers_per_day = date_df['date'].value_counts().sort_index()
ax = wafers_per_day.plot(kind='bar', figsize=(12,4))
ax.set_title("Wafer Count by Date : All sensor score >= 95")
ax.set_xlabel("Date")
ax.set_ylabel("Count")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.dates as mdates

# Daily wafer counts for (1) wafers whose sensor scores are all >= 95 and (2) all scored
# wafers, drawn as two lines on a shared, gap-free daily axis.


def _daily_wafer_counts(score_frame):
    """Return the number of distinct wafers per calendar day, indexed by a DatetimeIndex.

    A wafer is a unique (lotid, wafer_number) pair; its day is taken from the `time`
    value of the wafer's first row in `score_frame`.
    """
    first_rows = score_frame.drop_duplicates(subset=['lotid', 'wafer_number'])
    counts = first_rows['time'].dt.date.value_counts().sort_index()
    counts.index = pd.to_datetime(counts.index)
    return counts


daily_counts1 = _daily_wafer_counts(qualified_df)     # qualifying wafers (red line)
daily_counts2 = _daily_wafer_counts(target_score_df)  # all scored wafers (blue line)

# Continuous daily range spanning every scored wafer.
full_date_range = pd.date_range(start=daily_counts2.index.min(),
                                end=daily_counts2.index.max())

# Fill days without wafers with 0. Both series and the target range use a DatetimeIndex, so the
# alignment never depends on matching datetime.date objects against Timestamps.
daily_counts1_full = daily_counts1.reindex(full_date_range, fill_value=0)
daily_counts2_full = daily_counts2.reindex(full_date_range, fill_value=0)

# ----------- Plot -----------
fig, ax = plt.subplots(figsize=(12, 4))
line_style = dict(marker='o', markersize=2, linewidth=1, alpha=0.6)

# Red line: wafers whose sensor scores are all >= 95.
ax.plot(daily_counts1_full.index, daily_counts1_full.values,
        color='red', label='All sensor scores >=95', **line_style)

# Blue line: all scored wafers.
ax.plot(daily_counts2_full.index, daily_counts2_full.values,
        color='blue', label='Total wafer', **line_style)

# Weekly major ticks, labelled as YYYY-MM-DD.
ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

plt.title("Wafer Count by Date")
plt.xlabel("Date")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Distinct (lotid, wafer_number) pairs of the qualifying wafers.
wafer_keys = ['lotid', 'wafer_number']
qualified_pairs = qualified_df.loc[:, wafer_keys].drop_duplicates()

# Inner join: keep only the inference rows that belong to a qualifying wafer.
matched_df = pd.merge(P_FG_ch1_inf, qualified_pairs, how='inner', on=wafer_keys)

# Sanity check: number of distinct wafers that survived the join.
n_matched_wafers = len(matched_df[wafer_keys].drop_duplicates())
print(f"매칭된 wafer 수: {n_matched_wafers}")
matched_df.head()

- test_df의 웨이퍼 중 날짜가 2023-12-31 이전(~2023-12-31 23:59:59)이고 모든 센서 점수가 95점 이상인 웨이퍼만 train(reference)에 추가하여 사용
- test_df에서 2024-01-01 00:00:00 (UTC) 부터의 데이터를 test로 사용

In [None]:
# Qualifying wafers dated before this cutoff are added to the reference (train) set.
TRAIN_CUTOFF = pd.Timestamp("2024-01-01 00:00:00")

# Select the qualifying wafers dated before the cutoff. The selection gets its own name and
# `date_df` / `qualified_df` are left untouched, so re-running this cell or the plotting cells
# that read those frames always produces the same result.
wafer_dates = pd.to_datetime(date_df['date'])
train_qualified_df = date_df[wafer_dates < TRAIN_CUTOFF]

# Distinct (lotid, wafer_number) pairs of the selected wafers.
qualified_pairs = train_qualified_df[['lotid', 'wafer_number']].drop_duplicates()

# Keep only the inference rows that belong to one of those wafers.
matched_df = P_FG_ch1_inf.merge(qualified_pairs, on=['lotid', 'wafer_number'], how='inner')

# Sanity check: number of distinct wafers that survived the join.
print(f"매칭된 wafer 수: {matched_df[['lotid', 'wafer_number']].drop_duplicates().shape[0]}")

In [None]:
# Enlarged reference set: original reference rows followed by the matched inference rows.
# NOTE(review): nothing here removes wafers that might occur in both inputs — confirm they are disjoint.
bigger_train_df = pd.concat([filterd_df, matched_df], ignore_index=True)

# Wafer identifier "<lotid>_<wafer_number>".
# NOTE: this appends a string column after the sensor columns, so a positional slice of this
# frame such as iloc[:, 4:] will include it.
wafer_number_text = bigger_train_df['wafer_number'].astype(str)
bigger_train_df['Lotid_wafer'] = bigger_train_df['lotid'] + '_' + wafer_number_text

# Persist the distinct wafer identifiers and the enlarged reference frame.
ids = bigger_train_df['Lotid_wafer'].unique()
np.save('big_ref_lotid_wafer_ids.npy', ids)
bigger_train_df.to_parquet("bigger_train_df.parquet", index=False)

In [None]:
# Number of distinct (lotid, wafer_number) pairs in the enlarged reference set.
n_reference_wafers = len(bigger_train_df[['lotid', 'wafer_number']].drop_duplicates())
print('총 reference 웨이퍼 수:',n_reference_wafers)

In [None]:
# Work on a copy of test_df whose `time` column is parsed as datetimes.
test_df = test_df.assign(time=pd.to_datetime(test_df['time']))

# Keep the rows timestamped on or after 2024-01-01 00:00:00 UTC.
# NOTE(review): this is a per-row filter, so a wafer whose samples straddle the cutoff would be
# kept only in part — confirm whether wafer-level filtering is intended.
test_start = pd.Timestamp("2024-01-01 00:00:00", tz='UTC')
filtered_test_df = test_df.loc[test_df['time'] >= test_start]

n_test_wafers = len(filtered_test_df[['lotid', 'wafer_number']].drop_duplicates())
print('총 test 웨이퍼 수:',n_test_wafers)

## 웨이퍼 데이터 준비

In [None]:
def process_and_save_full_sequence(groups, output_folder, scaler, feature_cols=None):
    """
    Save each wafer as one full-length time series (VTT full-sequence input).

    Every group is sorted by `time`, its sensor columns are scaled with `scaler`, and the
    result is written to `<output_folder>/<lotid>_<wafer_number>.h5` with these datasets:
      - data:          scaled sensor values, shape (T, D)
      - labels:        dummy label [0]
      - lotids:        the wafer's lotid as a byte string
      - wafer_numbers: the wafer's number as a byte string
      - step_num:      `Recipe_Step_Num` per time step, shape (T,)

    Parameters
    ----------
    groups :
        Sized iterable of ((lotid, wafer_number), DataFrame) pairs, e.g. a DataFrame
        grouped by those two keys. Each frame needs `time` and `Recipe_Step_Num` columns.
    output_folder : str
        Destination directory; created if it does not exist.
    scaler :
        Fitted object exposing `transform`, applied to the sensor values.
    feature_cols : list of str, optional
        Names of the sensor columns to scale and store. When None (default), every
        column from the fifth one onward is used, as before.
    """
    os.makedirs(output_folder, exist_ok=True)
    for (lotid, wafer_number), sensor in tqdm(groups, total=len(groups), desc=f"Processing {output_folder}"):
        # Order the samples chronologically before extracting anything.
        sensor = sensor.sort_values('time').reset_index(drop=True)

        # Sensor matrix (T, D): selected by name when requested, else by position.
        if feature_cols is None:
            sensor_values = sensor.iloc[:, 4:].values
        else:
            sensor_values = sensor[list(feature_cols)].values
        scaled_values = scaler.transform(sensor_values)

        step_array = sensor['Recipe_Step_Num'].values  # (T,)

        # One HDF5 file per wafer.
        file_path = os.path.join(output_folder, f"{lotid}_{wafer_number}.h5")
        with h5py.File(file_path, 'w') as hf:
            hf.create_dataset('data', data=scaled_values, compression='gzip')               # (T, D)
            hf.create_dataset('labels', data=np.array([0]), compression='gzip')             # dummy label
            hf.create_dataset('lotids', data=np.array([lotid]).astype('S'), compression='gzip')
            hf.create_dataset('wafer_numbers', data=np.array([wafer_number]).astype('S'), compression='gzip')
            hf.create_dataset('step_num', data=step_array, compression='gzip')              # (T,)

In [None]:
# Select columns by name so the exported frames contain exactly the four metadata columns
# followed by the sensor columns. `bigger_train_df` also carries the string column
# 'Lotid_wafer'; a positional slice (iloc[:, 4:]) would include it, which cannot be converted
# to float and would give train and test a different number of features.
model_columns = meta_colums + interested_sensors
train_frame = bigger_train_df[model_columns]
test_frame = filtered_test_df[model_columns]

# Fit the min-max scaling on the reference (train) sensor values only.
scaler = MinMaxScaler()
scaler.fit(train_frame[interested_sensors].to_numpy(dtype=float))

# One group per wafer.
train_groups = train_frame.groupby(["lotid", "wafer_number"])
test_groups = test_frame.groupby(["lotid", "wafer_number"])

# Store every wafer's whole time series as a single sample.
process_and_save_full_sequence(train_groups, "./data/all_step/train", scaler)
process_and_save_full_sequence(test_groups, "./data/all_step/test", scaler)