In [None]:
import random
import os
import pandas as pd
import cv2
import numpy as np

In [None]:
def create_random_combinations(df, num_combinations, ratio=0.4):
    """Draw distinct random pairs of ids from ``df['id']``.

    A pair is treated as unordered: once ``(a, b)`` has been drawn,
    neither ``(a, b)`` nor ``(b, a)`` can be drawn again.

    Args:
        df: DataFrame with an ``'id'`` column.
        num_combinations: Number of distinct pairs to produce.
        ratio: Value attached to every pair as the third tuple element
            (defaults to 0.4).

    Returns:
        A list of ``(id1, id2, ratio)`` tuples in the order they were drawn.

    Raises:
        ValueError: If ``num_combinations`` exceeds the number of distinct
            unordered pairs that the unique ids can form. Without this check
            the sampling loop below could never terminate.
    """
    # Drop repeated ids (keeping first-seen order) so that a pair can never
    # consist of the same id twice.
    ids = list(dict.fromkeys(df['id'].tolist()))

    max_pairs = len(ids) * (len(ids) - 1) // 2
    if num_combinations > max_pairs:
        raise ValueError(
            f"요청한 조합 수({num_combinations})가 가능한 최대 조합 수({max_pairs})보다 많습니다."
        )

    seen_pairs = set()
    combinations = []

    while len(combinations) < num_combinations:
        id1, id2 = random.sample(ids, 2)

        # An order-insensitive key makes (id1, id2) and (id2, id1) the same pair.
        pair_key = frozenset((id1, id2))
        if pair_key in seen_pairs:
            continue

        seen_pairs.add(pair_key)
        combinations.append((id1, id2, ratio))

    return combinations

In [None]:
def blend_images(image_path1, image_path2, ratio):
    """Load two images and return their weighted blend.

    Args:
        image_path1: Path of the first image; weighted by ``ratio``.
        image_path2: Path of the second image; weighted by ``1 - ratio``.
        ratio: Blend weight applied to the first image.

    Returns:
        The result of ``cv2.addWeighted`` on the two loaded images.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for either path.
        ValueError: If the two loaded images do not have the same shape.
    """
    first, second = (cv2.imread(path) for path in (image_path1, image_path2))

    # cv2.imread signals an unreadable file by returning None, not by raising.
    if any(img is None for img in (first, second)):
        raise FileNotFoundError(f"이미지 파일을 찾을 수 없습니다: {image_path1} 또는 {image_path2}")

    if first.shape != second.shape:
        raise ValueError("이미지 크기가 다릅니다. 동일한 크기의 이미지를 사용해주세요.")

    return cv2.addWeighted(first, ratio, second, 1 - ratio, 0)

def cut_and_paste_images(image_path1, image_path2, ratio):
    """Paste a random rectangle of the first image onto a copy of the second.

    The rectangle is ``int(height * ratio)`` by ``int(width * ratio)`` pixels,
    its top-left corner is chosen at random, and it is pasted at the same
    coordinates it was cut from.

    Args:
        image_path1: Path of the image the rectangle is cut from.
        image_path2: Path of the image that receives the rectangle.
        ratio: Fraction of each dimension covered by the rectangle.

    Returns:
        A copy of the second image with the rectangle from the first pasted in.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for either path.
        ValueError: If the two loaded images do not have the same shape.
    """
    source_img = cv2.imread(image_path1)
    target_img = cv2.imread(image_path2)

    if any(img is None for img in (source_img, target_img)):
        raise FileNotFoundError(f"이미지 파일을 찾을 수 없습니다: {image_path1} 또는 {image_path2}")

    if source_img.shape != target_img.shape:
        raise ValueError("이미지 크기가 다릅니다. 동일한 크기의 이미지를 사용해주세요.")

    n_rows, n_cols, _channels = source_img.shape
    patch_rows, patch_cols = int(n_rows * ratio), int(n_cols * ratio)

    # Random top-left corner of the patch; the row offset is drawn before the column offset.
    top = random.randint(0, n_rows - patch_rows)
    left = random.randint(0, n_cols - patch_cols)

    row_span = slice(top, top + patch_rows)
    col_span = slice(left, left + patch_cols)

    # Work on a copy so the loaded second image is left unmodified.
    pasted = target_img.copy()
    pasted[row_span, col_span] = source_img[row_span, col_span]

    return pasted


In [None]:
def adjust_features(df, id1, id2, ratio, new_id):
    """Build a new row as the weighted mix of the rows for ``id1`` and ``id2``.

    Every column of the first matching row for ``id1`` is scaled by ``ratio``
    and added to the first matching row for ``id2`` scaled by ``1 - ratio``.
    The ``'id'`` entry of the result is then replaced by ``new_id``.

    Args:
        df: DataFrame with an ``'id'`` column.
        id1: Id of the row weighted by ``ratio``.
        id2: Id of the row weighted by ``1 - ratio``.
        ratio: Mixing weight of the first row.
        new_id: Id written into the resulting row.

    Returns:
        A Series holding the mixed row.

    Raises:
        ValueError: If ``id1`` or ``id2`` matches no row in ``df``.
    """
    first_match = df.loc[df['id'] == id1]
    second_match = df.loc[df['id'] == id2]

    if first_match.empty or second_match.empty:
        raise ValueError("ID가 데이터프레임에 존재하지 않습니다.")

    weighted_first = first_match.iloc[0] * ratio
    weighted_second = second_match.iloc[0] * (1 - ratio)

    mixed_row = weighted_first + weighted_second
    # The 'id' column was mixed along with the rest; overwrite it with the new id.
    mixed_row['id'] = new_id

    return mixed_row

def generate_combined_dataset(df, image_combinations, image_dir, start_id=1):
    """Blend image pairs, save the results, and build the matching feature rows.

    For every ``(id1, id2, ratio)`` entry, the images ``{id1}.jpeg`` and
    ``{id2}.jpeg`` in ``image_dir`` are blended, the blend is written to
    ``image_dir`` as ``{new_id}.jpeg``, and a feature row mixed with the same
    ratio is collected. A combination that raises ``FileNotFoundError`` or
    ``ValueError``, or whose image cannot be written, is reported with
    ``print`` and skipped; its id is reused for the next combination.

    Args:
        df: DataFrame with an ``'id'`` column holding the source rows.
        image_combinations: Sized iterable of ``(id1, id2, ratio)`` tuples.
        image_dir: Directory the source images are read from and the blended
            images are written to.
        start_id: First candidate id for generated rows (defaults to 1).

    Returns:
        A DataFrame with one row per successfully generated combination. Its
        ``'id'`` column is cast to the dtype of ``df['id']``.
    """
    existing_ids = set(df['id'].tolist())
    combined_rows = []
    new_id = start_id
    total = len(image_combinations)

    for idx, (id1, id2, ratio) in enumerate(image_combinations, start=1):
        # Never hand out an id that already belongs to a source row: the
        # blended image would overwrite that row's image file in image_dir
        # and the id would appear twice once the frames are concatenated.
        while new_id in existing_ids:
            new_id += 1

        try:
            image_path1 = os.path.join(image_dir, f'{id1}.jpeg')
            image_path2 = os.path.join(image_dir, f'{id2}.jpeg')

            # Run both steps that can raise before anything is written to disk.
            combined_image = blend_images(image_path1, image_path2, ratio)
            adjusted_row = adjust_features(df, id1, id2, ratio, new_id)

            combined_image_path = os.path.join(image_dir, f'{new_id}.jpeg')
            # cv2.imwrite reports failure through its return value.
            if not cv2.imwrite(combined_image_path, combined_image):
                print(f'이미지 저장에 실패했습니다: {combined_image_path}')
                continue

            combined_rows.append(adjusted_row)

            print(f'{idx}/{total} 번째 조합 생성 완료: {new_id}')
            new_id += 1
        except (FileNotFoundError, ValueError) as e:
            print(e)

    combined_df = pd.DataFrame(combined_rows)

    # The mixed rows carry 'id' as a float; restore the source dtype so the
    # ids are not written out as e.g. "12.0". Skipped when no row was produced.
    if 'id' in combined_df.columns:
        combined_df['id'] = combined_df['id'].astype(df['id'].dtype)

    return combined_df


In [None]:
# Seed the stdlib RNG so that the sampled id pairs, and therefore the
# generated images and rows, are the same on every Restart & Run All.
SEED = 42
random.seed(SEED)

train_csv_path = '/mnt/ljh/planttraits2024/train.csv'
train = pd.read_csv(train_csv_path)

# Trim outliers on each target column. The filter is applied column by
# column, so each column's quantiles are computed on the rows that survived
# the previous columns. The bounds are asymmetric: the 0.005 quantile below
# and the 0.985 quantile above.
TARGET_COLUMNS = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']
for column in TARGET_COLUMNS:
    lower_quantile = train[column].quantile(0.005)
    upper_quantile = train[column].quantile(0.985)
    train = train[(train[column] >= lower_quantile) & (train[column] <= upper_quantile)]


# Number of blended samples to add to the trimmed training set.
num_combinations = 25000
random_combinations = create_random_combinations(train, num_combinations)

# Blended images are written into the same directory as the source images.
image_dir = '/mnt/ljh/planttraits2024/train_images/'
combined_df = generate_combined_dataset(train, random_combinations, image_dir)

# Append the generated rows to the trimmed training rows and save the result.
new_train_df = pd.concat([train, combined_df], ignore_index=True)

new_train_csv_path = '/mnt/ljh/planttraits2024/updated_train.csv'
new_train_df.to_csv(new_train_csv_path, index=False)