In [2]:
import pandas as pd
import os
from google.colab import drive
import pandas as pd
import numpy as np
import io
import base64
from PIL import Image
import os
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import cv2
# Mount Google Drive
drive.mount('/content/drive')
MY_DRIVE_PATH = "/content/drive/MyDrive/MLProject"
DATA_FOLDER = os.path.join(MY_DRIVE_PATH, 'Data google sheet')
REAL_CSV_FILE = os.path.join(DATA_FOLDER, 'Real_Fruits_Data.csv')
AI_CSV_FILE = os.path.join(DATA_FOLDER, 'AI_Fruits_Data.csv')
MERGED_CSV_FILE = os.path.join(DATA_FOLDER, 'Merged_Fruits_Data.csv')
PROCESSED_CSV_FILE = os.path.join(DATA_FOLDER, 'Processed_Fruits_Data.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# merge the generated and real dataset with shuffle
def merge_and_shuffle():
    df_real = pd.read_csv(REAL_CSV_FILE, sep=';')
    df_ai = pd.read_csv(AI_CSV_FILE, sep=';')
    # Concatenate
    df_merged = pd.concat([df_real, df_ai], ignore_index=True)

    # Shuffle the dataset
    df_shuffled = df_merged.sample(frac=1, random_state=42).reset_index(drop=True)

    print(f"Merged {len(df_real)} Real and {len(df_ai)} AI samples.")
    print(f"Total samples: {len(df_shuffled)}")

    # Save
    df_shuffled.to_csv(MERGED_CSV_FILE, index=False, sep=';', encoding='utf-8-sig')
    print(f"Saved merged dataset to: {MERGED_CSV_FILE}")
    return df_shuffled

# Execute
df = merge_and_shuffle()

Merged 586 Real and 2596 AI samples.
Total samples: 3182
Saved merged dataset to: /content/drive/MyDrive/MLProject/Data google sheet/Merged_Fruits_Data.csv


In [4]:
def convert_to_float(val):
    #Helper to convert '150,50' string to 150.50 float.
    if isinstance(val, str):
        val = val.replace(',', '.')
        return float(val)
    return float(val)

In [5]:
def min_max_scale(series):

    #(X - min) / (max - min)

    min_val = series.min()
    max_val = series.max()
    # Avoid division by zero if all values are the same
    if max_val - min_val == 0:
        return series - min_val

    return (series - min_val) / (max_val - min_val)

In [6]:
!pip install pillow_heif


Collecting pillow_heif
  Downloading pillow_heif-1.1.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (9.6 kB)
Downloading pillow_heif-1.1.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pillow_heif
Successfully installed pillow_heif-1.1.1


In [7]:
import pillow_heif
pillow_heif.register_heif_opener()

In [8]:
def extract_raw_hsv_histogram(image_path):
    try:
        if pd.isna(image_path) or not isinstance(image_path, str):
            return np.zeros(128)
        img = None
        lower_path = image_path.lower()

        # Handle HEIC
        if "heic" in lower_path or "heif" in lower_path:
            try:
                heif_file = pillow_heif.open_heif(image_path, convert_hdr_to_8bit=False, bgr_mode=True)
                img = np.asarray(heif_file)
            except:
                return np.zeros(128)
        # Handle JPG/PNG
        elif os.path.exists(image_path):
            img = cv2.imread(image_path)

        if img is None:
            return np.zeros(128)

        # Convert to HSV
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

        # Calculate Histogram (10 bins Hue, 8 bins Saturation)
        # Range: Hue [0, 180], Saturation [0, 256]
        hist = cv2.calcHist([hsv], [0, 1], None, [10, 8], [0, 180, 0, 256])

        # Flatten
        return hist.flatten()

    except Exception:
        return np.zeros(128)

In [9]:
#TEXT EMBEDDING EXTRACTION ---
def extract_text_embeddings(text_list):
    #Uses 'all-MiniLM-L6-v2' to convert text to 384-dim vectors.
    print("Loading SentenceTransformer model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')

    print(f"Encoding {len(text_list)} sentences...")
    embeddings = model.encode(text_list, show_progress_bar=True)
    return embeddings

In [10]:
def generate_raw_processed_dataset(load_path, save_path):
    print("STARTING PROCESSING (NON-NORMALIZED / EMBEDDINGS)")
    df = pd.read_csv(load_path, sep=';')

    # RAW IMAGE FEATURES
    print("\nExtracting Raw Image Features (HSV 10x8)...")
    tqdm.pandas()
    img_features_list = df['Image_path'].progress_apply(extract_raw_hsv_histogram).tolist()
    img_cols = [f'img_{i}' for i in range(80)]
    df_img = pd.DataFrame(img_features_list, columns=img_cols)

    # TEXT EMBEDDINGS (all-MiniLM-L6-v2)
    print("\nExtracting Text Embeddings...")
    embeddings = extract_text_embeddings(df['Text'].tolist())
    # Create columns text_0 ... text_383
    txt_cols = [f'text_{i}' for i in range(384)]
    df_txt = pd.DataFrame(embeddings, columns=txt_cols)

    # NUMERICAL & CATEGORICAL (RAW)
    print("\n Formatting Attributes (Raw)...")
    # Convert string numbers to float, but DO NOT MinMax Scale
    df['Weight'] = df['Weight'].apply(convert_to_float)
    df['Price'] = df['Price'].apply(convert_to_float)

    # Select columns to keep raw
    raw_cols = df[['Fruit', 'Label', 'Weight', 'Price', 'Season', 'Origin', 'Color', 'Text', 'Image_path']]

    # COMBINE ALL
    print("\n4. Combining Features...")
    df_final = pd.concat([raw_cols, df_img, df_txt], axis=1)

    print(f"   -> Final Shape: {df_final.shape}")
    print(f"   -> Image Features: 128 (Raw)")
    print(f"   -> Text Features: 384 (Embeddings)")
    print(f"   -> Attributes: Raw Values")

    df_final.to_csv(save_path, index=False, sep=';', encoding='utf-8-sig')
    print(f"✅ Saved to: {save_path}")



In [11]:
generate_raw_processed_dataset(MERGED_CSV_FILE, PROCESSED_CSV_FILE)

STARTING PROCESSING (NON-NORMALIZED / EMBEDDINGS)

Extracting Raw Image Features (HSV 10x8)...


100%|██████████| 3182/3182 [28:48<00:00,  1.84it/s]



Extracting Text Embeddings...
Loading SentenceTransformer model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding 3182 sentences...


Batches:   0%|          | 0/100 [00:00<?, ?it/s]


 Formatting Attributes (Raw)...

4. Combining Features...
   -> Final Shape: (3182, 473)
   -> Image Features: 128 (Raw)
   -> Text Features: 384 (Embeddings)
   -> Attributes: Raw Values
✅ Saved to: /content/drive/MyDrive/MLProject/Data google sheet/Processed_Fruits_Data.csv
