In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
path = "data/kamp"
month = [9, 10]
day = range(1, 32)
address = [os.path.join(path,f"kemp-abh-sensor-2021.{m:02d}.{d:02d}.csv") for m in month for d in day]

In [3]:
dataframes = []
for file_path in address:
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        df["Date"] = file_path[-9:-4]
        del df["Index"]
        dataframes.append(df)

# 모든 DataFrame을 하나로 합치고 싶다면
if dataframes:
    combined_df = pd.concat(dataframes, ignore_index=True)
combined_df

Unnamed: 0,Lot,Time,pH,Temp,Current,Date
0,1,오후 4:29:15.0,10.37,42.87,7.34,09.06
1,1,오후 4:29:20.0,10.90,42.70,8.26,09.06
2,1,오후 4:29:25.0,9.61,42.37,8.46,09.06
3,1,오후 4:29:30.0,10.32,45.85,8.04,09.06
4,1,오후 4:29:35.0,10.36,44.18,7.23,09.06
...,...,...,...,...,...,...
50089,22,오후 6:39:05.9,9.79,43.52,8.55,10.27
50090,22,오후 6:39:10.9,9.53,44.44,7.36,10.27
50091,22,오후 6:39:15.9,9.86,46.54,8.15,10.27
50092,22,오후 6:39:20.9,10.52,47.88,7.97,10.27


In [4]:
specific_dates = ['09.06', '09.07', '09.08', '09.09', '09.10']
filtered_df = combined_df[combined_df['Date'].isin(specific_dates)]

In [7]:
import pandas as pd
import numpy as np

def create_timeseries_windows(df, 
                               feature_columns=['pH', 'Temp', 'Current'], 
                               group_columns=['Lot', 'Date'], 
                               sequence_length=69):
    """
    시계열 윈도우 생성 함수
    
    Parameters:
    - df: 원본 데이터프레임
    - feature_columns: 특성으로 사용할 컬럼들
    - group_columns: 그룹화할 컬럼들
    - sequence_length: 시퀀스 길이
    
    Returns:
    - timeseries_windows: 시계열 윈도우 데이터
    """
    
    # 그룹별로 데이터 분할
    grouped = df.groupby(group_columns)
    
    timeseries_windows = []
    
    for group_key, group_data in grouped:
        # 데이터 시간 순서로 정렬 (시간 인덱스가 있다고 가정)
        group_sorted = group_data.sort_values('Time')
        
        # 그룹의 길이가 sequence_length 이상인 경우에만 처리
        if len(group_sorted) >= sequence_length:
            # 첫 번째 sequence만 추출
            sequence = group_sorted[feature_columns].iloc[:sequence_length].values
            
            # 그룹 정보와 함께 저장
            window_info = {
                'Lot': group_key[0],
                'Date': group_key[1],
                'Sequence': sequence,
                'Sequence_Start_Index': 0,
                'Sequence_End_Index': sequence_length - 1
            }
            
            timeseries_windows.append(window_info)
    
    return timeseries_windows

# 사용 예시
timeseries_windows = create_timeseries_windows(filtered_df)

# 결과 확인
print("Total number of windows:", len(timeseries_windows))

# 첫 번째 윈도우 정보 출력
first_window = timeseries_windows[0]
print("\n첫 번째 윈도우 정보:")

print("Lot:", first_window['Lot'])
print("Date:", first_window['Date'])
print("Sequence Shape:", first_window['Sequence'].shape)
print("Start Index:", first_window['Sequence_Start_Index'])
print("End Index:", first_window['Sequence_End_Index'])

Total number of windows: 110

첫 번째 윈도우 정보:
Lot: 1
Date: 09.06
Sequence Shape: (69, 3)
Start Index: 0
End Index: 68


In [6]:
error_lot_df = pd.read_csv(path +"/Error Lot list.csv")
def add_error_lot_labels(original_df, error_lot_df):
    # 에러 Lot과 날짜 정보를 결합한 데이터셋 생성
    error_lot_keys = set(zip(error_lot_df['LoT'], error_lot_df['Date']))
    error_lot_keys.update(zip(error_lot_df['LoT2'], error_lot_df['Date']))
    
    # 라벨 생성 (에러 Lot과 날짜가 모두 일치하면 1, 아니면 0)
    def check_error_label(row):
        return 1 if (row['Lot'], row['Date']) in error_lot_keys else 0
    
    original_df['Error_Label'] = original_df.apply(check_error_label, axis=1)
    
    return original_df

# 원본 데이터에 라벨 추가
labeled_df = add_error_lot_labels(filtered_df, error_lot_df)

# 라벨 분포 확인
print("Error Label 분포:")
print(labeled_df['Error_Label'].value_counts(normalize=True))

# 에러 Lot과 날짜 목록 확인
print("\n에러 Lot과 날짜 목록:")
error_lot_details = error_lot_df[['Date', 'LoT', 'LoT2']].dropna()
print(error_lot_details)

Error Label 분포:
Error_Label
0    1.0
Name: proportion, dtype: float64

에러 Lot과 날짜 목록:
          Date  LoT  LoT2
2   2021-09-08  1.0  22.0
25  2021-10-18  5.0  22.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_df['Error_Label'] = original_df.apply(check_error_label, axis=1)


In [7]:
labeled_df

Unnamed: 0,Lot,Time,pH,Temp,Current,Date,Error_Label
0,1,오후 4:29:15.0,10.37,42.87,7.34,09.06,0
1,1,오후 4:29:20.0,10.90,42.70,8.26,09.06,0
2,1,오후 4:29:25.0,9.61,42.37,8.46,09.06,0
3,1,오후 4:29:30.0,10.32,45.85,8.04,09.06,0
4,1,오후 4:29:35.0,10.36,44.18,7.23,09.06,0
...,...,...,...,...,...,...,...
7585,22,오후 6:36:13.5,10.01,47.94,7.25,09.10,0
7586,22,오후 6:36:18.5,10.98,42.90,8.18,09.10,0
7587,22,오후 6:36:23.5,10.98,43.98,7.05,09.10,0
7588,22,오후 6:36:28.5,9.67,44.53,7.79,09.10,0


In [8]:
import pandas as pd
import numpy as np

def create_timeseries_windows(df, 
                               feature_columns=['pH', 'Temp', 'Current'], 
                               group_columns=['Lot', 'Date'], 
                               sequence_length=69):
    """
    시계열 윈도우 생성 함수
    
    Parameters:
    - df: 원본 데이터프레임
    - feature_columns: 특성으로 사용할 컬럼들
    - group_columns: 그룹화할 컬럼들
    - sequence_length: 시퀀스 길이
    
    Returns:
    - timeseries_windows: 시계열 윈도우 데이터
    """
    
    # 그룹별로 데이터 분할
    grouped = df.groupby(group_columns)
    
    timeseries_windows = []
    
    for group_key, group_data in grouped:
        # 데이터 시간 순서로 정렬 (시간 인덱스가 있다고 가정)
        group_sorted = group_data.sort_values('Time')
        
        # 그룹의 길이가 sequence_length 이상인 경우에만 처리
        if len(group_sorted) >= sequence_length:
            # 각 특성별 시퀀스 추출
            pH_window = group_sorted['pH'].iloc[:sequence_length].values
            Temp_window = group_sorted['Temp'].iloc[:sequence_length].values
            Current_window = group_sorted['Current'].iloc[:sequence_length].values
            
            # 에러 레이블 (첫 번째 레이블 사용)
            error_label = group_sorted['Error_Label'].iloc[0]
            
            # 원하는 데이터 구조로 변환
            window_info = [
                pH_window,
                Temp_window,
                Current_window,
                error_label
            ]
            
            timeseries_windows.append(window_info)
    
    return timeseries_windows

# 사용 예시
timeseries_windows = create_timeseries_windows(labeled_df)

# 결과 확인
print("Total number of windows:", len(timeseries_windows))

# 첫 번째 윈도우 정보 출력
first_window = timeseries_windows[0]
print("\n첫 번째 윈도우 정보:")
print("pH window shape:", first_window[0].shape)
print("Temp window shape:", first_window[1].shape)
print("Current window shape:", first_window[2].shape)
print("Error Label:", first_window[3])

# 추가 분석을 위한 예시
def analyze_windows(timeseries_windows):
    # 에러 레이블별 윈도우 개수
    error_counts = {0: 0, 1: 0}
    for window in timeseries_windows:
        error_counts[window[3]] += 1
    
    print("\n에러 레이블 분포:")
    for label, count in error_counts.items():
        print(f"Label {label}: {count} ({count/len(timeseries_windows)*100:.2f}%)")
    
    return error_counts

# 윈도우 분석
error_distribution = analyze_windows(timeseries_windows)

Total number of windows: 110

첫 번째 윈도우 정보:
pH window shape: (69,)
Temp window shape: (69,)
Current window shape: (69,)
Error Label: 0

에러 레이블 분포:
Label 0: 110 (100.00%)
Label 1: 0 (0.00%)


In [9]:
def prepare_ml_dataset(timeseries_windows):
    """
    시계열 윈도우 데이터를 머신러닝/딥러닝 모델 학습용 X, y로 변환
    
    Parameters:
    - timeseries_windows: create_timeseries_windows 함수로 생성된 윈도우 데이터
    
    Returns:
    - X: 특성 데이터 (shape: [n_samples, 3, sequence_length])
    - y: 레이블 데이터 (shape: [n_samples])
    """
    # 데이터 초기화
    X = []
    y = []
    
    # 각 윈도우 순회
    for window in timeseries_windows:
        # X에 pH, Temp, Current 추가 
        # 차원: [3, sequence_length]
        sample_x = [
            window[0],  # pH window
            window[1],  # Temp window
            window[2]   # Current window
        ]
        
        # y에 error label 추가
        sample_y = window[3]
        
        X.append(sample_x)
        y.append(sample_y)
    
    # numpy 배열로 변환
    X = np.array(X)
    y = np.array(y)
    
    return X, y

# 데이터 준비
X, y = prepare_ml_dataset(timeseries_windows)

# 데이터 shape 확인
print("X shape:", X.shape)
print("y shape:", y.shape)

# 클래스 분포 확인
unique, counts = np.unique(y, return_counts=True)
print("\n클래스 분포:")
for label, count in zip(unique, counts):
    print(f"Label {label}: {count} ({count/len(y)*100:.2f}%)")

X shape: (110, 3, 69)
y shape: (110,)

클래스 분포:
Label 0: 110 (100.00%)


In [15]:
# 데이터 평탄화
from sklearn.model_selection import train_test_split
X_train,y_train,X_test, y_test = train_test_split(X,y,random_state=42)
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Assuming X and y are already defined

# First, check the shape of your original data
print("Original X shape:", X.shape)

# Split the data BEFORE reshaping to ensure consistency
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print("X_train shape before flattening:", X_train.shape)
print("X_test shape before flattening:", X_test.shape)

# Flatten the data if it's 3D or higher
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)
print("X_train_flat shape:", X_train_flat.shape)
print("X_test_flat shape:", X_test_flat.shape)

# Create a NEW scaler instance and fit it on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_flat)
X_test_scaled = scaler.transform(X_test_flat)

# Train the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42,bootstrap = True)
rf_model.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred = rf_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

Original X shape: (110, 3, 69)
X_train shape before flattening: (82, 3, 69)
X_test shape before flattening: (28, 3, 69)
X_train_flat shape: (82, 207)
X_test_flat shape: (28, 207)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28

    accuracy                           1.00        28
   macro avg       1.00      1.00      1.00        28
weighted avg       1.00      1.00      1.00        28



In [7]:
numeric_col = ["Lot","pH","Temp","Current"]

In [8]:
error_drop = error.dropna(thresh=3, axis=0)
lot_process_lists = error_drop['LoT'].unique()
d_process_lists = error_drop['Date'].unique()
print("Unique LoT List : ", lot_process_lists)
print("Unique Date List : ", d_process_lists)

Unique LoT List :  [13.  1.  5.  9.  8. 17. 19. 14. 15.]
Unique Date List :  ['2021-09-07' '2021-09-08' '2021-09-14' '2021-09-15' '2021-09-23'
 '2021-09-29' '2021-10-01' '2021-10-05' '2021-10-06' '2021-10-08'
 '2021-10-18' '2021-10-25' '2021-10-26']
