In [None]:
# Install pillow-heif; it is imported further down as `pillow_heif` so that
# HEIC images can be handled.
!pip install pillow-heif

# Mount Google Drive at /content/drive (requires the google.colab runtime).
# All project paths used later in the notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Collecting pillow-heif
  Downloading pillow_heif-1.1.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (9.6 kB)
Downloading pillow_heif-1.1.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pillow-heif
Successfully installed pillow-heif-1.1.1
Mounted at /content/drive


In [None]:
import os
import pandas as pd
from PIL import Image
import base64
import io
import random
import pillow_heif
import re
import statistics
import numpy as np
import cv2

In [None]:
# Register pillow-heif's opener (going by its name, this hooks HEIF/HEIC
# support into Pillow — NOTE(review): confirm against the pillow-heif docs).
pillow_heif.register_heif_opener()
# Paths for the Drive folders. Everything is derived from MY_DRIVE_PATH, so
# this is the only value to change when the project folder moves.
MY_DRIVE_PATH = "/content/drive/MyDrive/MLProject"
# Root of the image tree; the generators look for '<fruit folder>/Real' and
# '<fruit folder>/AI Generated' underneath it.
IMAGES_ROOT = os.path.join(MY_DRIVE_PATH, 'Images')
# Folder that receives the generated CSV files.
DATA_FOLDER = os.path.join(MY_DRIVE_PATH, 'Data google sheet')
REAL_CSV_FILE = os.path.join(DATA_FOLDER, 'Real_Fruits_Data.csv')
AI_CSV_FILE = os.path.join(DATA_FOLDER, 'AI_Fruits_Data.csv')

In [None]:
# Sentence templates used to create the free-text feature; one template is
# picked at random for every image row.
# Each template is filled with str.format using the keyword arguments
# season, origin, color_lower, price and weight (a `color` argument is also
# passed by the generators, but no template currently references it).
GENERIC_TEMPLATES = [
    "Fresh {season} harvest sourced directly from {origin}.",
    "This {color_lower} item is a great deal at {price} TL.",
    "Weighing {weight}g, this premium produce is perfect for the {season}.",
    "Carefully hand-picked in {origin}. Enjoy the natural {color_lower} color.",
    "A {color_lower} selection grown in {origin}, ideal for {season} consumption.",
    "Get this {season} favorite for only {price} TL — straight from {origin}.",
    "Enjoy this {weight}g fresh product, delicious in any {season} meal.",
    "Sourced directly from {origin}, this {color_lower} produce costs just {price} TL.",
    "High quality {season} crop from {origin} weighing {weight}g.",
    "Best market price: {price} TL for this {color_lower} origin-certified item."
]


In [None]:
# Per-fruit configuration consumed by the dataset generators.
# Keys of every entry:
#   'folder_name'    - sub-folder of IMAGES_ROOT that holds this fruit's images.
#   'regex_prefix'   - filename prefix; the digits that follow it (up to the next
#                      '_') are taken as the image id of a real photo.
#   'origins'        - candidate values for the Origin attribute (picked at random).
#   'seasons'        - candidate values for the Season attribute (picked at random).
#   'price_range'    - (low, high) bounds for the uniformly sampled price; the text
#                      templates print it followed by "TL".
#   'default_color'  - Color used when nothing more specific applies.
#   'special_colors' - optional; maps a color name to the image ids that have it.
#   'weights'        - measured weight per real image id (id stored as a string);
#                      presumably grams, since the templates print "<weight>g".
#                      The mean/stdev of these values also drive the weights that
#                      are sampled for AI-generated images.
FRUIT_CONFIGS = {
    'tomato': {
        'folder_name': 'Tomato',
        'regex_prefix': 'domates',
        'origins': ['Antalya', 'Bursa', 'Manisa', 'İzmir','Mersin'],
        'seasons': ['Summer','Spring'],
        'price_range': (35.0, 80.0),
        'default_color': 'Red',
        'weights': {
            '1': 145, '2': 152, '3': 160, '4': 138, '5': 170, '6': 155,
            '7': 149, '8': 157, '9': 165, '10': 142, '11': 168, '12': 151,'13':
            150, '14': 163, '15': 155, '16': 140, '17': 155, '18': 162
        }
    },
    'tangerine': {
        'folder_name': 'Tangerine',
        'regex_prefix': 'mandalina',
        'origins': ['İzmir', 'Adana', 'Hatay', 'Mersin', 'Antalya', 'Rize'],
        'seasons': ['Winter'],
        'price_range': (30.0, 50.0),
        'default_color': 'Orange',
        'special_colors': {'Green': ['4', '5', '7', '8', '19', '21']},
        'weights': {
            '1': 61, '2': 67, '3': 79, '4': 135, '5': 76, '6': 129,
            '7': 126, '8': 108, '9': 99, '10': 84, '11': 113, '12': 58,
            '13': 70, '14': 63, '15': 55, '16': 80, '17': 105, '18': 76,
            '19': 83, '20': 84, '21': 87
        }
    },
    'banana': {
        'folder_name': 'Banana',
        'regex_prefix': 'muz',
        'origins': ['Antalya', 'Mersin', 'Anamur','Adana','Hatay','Mugla'],
        'seasons': ['Winter', 'Spring'],
        'price_range': (120.0, 200.0),
        'default_color': 'Yellow',
        'weights': {
            '1': 174, '2': 148, '3': 170, '4': 173, '5': 176, '6': 158,
            '7': 152, '8': 157, '9': 176, '10': 151, '11': 148
        }
    },
    'apple': {
        'folder_name': 'Apple',
        'regex_prefix': 'elma',
        'origins': ['Amasya', 'Isparta', 'Karaman', 'Niğde'],
        'seasons': ['Autumn', 'Winter'],
        'price_range': (40.0, 90.0),
        'default_color': 'Red',
        'special_colors': {
            'Yellow': ['1', '3', '6'],
            'Green' : ['2', '7']},
        'weights': { '1': 169, '2': 199, '3': 169, '4': 162, '5': 194, '6':148 , '7':165, '8':209 , '9':164  }
    },
    'orange': {
        'folder_name': 'Orange',
        'regex_prefix': 'portakal',
        'origins': ['Antalya', 'Mersin', 'Adana', 'Dortyol'],
        'seasons': ['Winter'],
        'price_range': (25.0, 60.0),
        'default_color': 'Orange',
        'weights': { '1': 213, '2': 192, '3': 216, '4': 216, '5': 241 ,'6':125 ,'7': 209, '8':182, '9':243 }
    }
}

In [None]:
def get_stats_from_real_weights(weights_dict):
    """Summarise a fruit's measured weights as ``(mean, standard deviation)``.

    Args:
        weights_dict: mapping of image id -> measured weight (may be empty
            or None).

    Returns:
        A 2-tuple:
        * ``(150.0, 20.0)`` when no weights are available,
        * ``(the single value, 5.0)`` when exactly one weight is available,
        * ``(statistics.mean, statistics.stdev)`` of the values otherwise.
    """
    samples = list(weights_dict.values()) if weights_dict else []

    # No measurements at all: fall back to fixed defaults.
    if not samples:
        return 150.0, 20.0

    # A sample standard deviation needs at least two points.
    if len(samples) == 1:
        return samples[0], 5.0

    return statistics.mean(samples), statistics.stdev(samples)

In [None]:
# This function scans the real-image folders and builds the feature set (one row per image) for them.
def generate_real_dataset(seed=None):
    """Build one attribute row for every real fruit image found on disk.

    For each fruit in FRUIT_CONFIGS the '<IMAGES_ROOT>/<folder_name>/Real'
    folder is scanned for image files. The image id is parsed from the
    filename (digits following the fruit's 'regex_prefix', up to the next
    '_') and used to look up the measured weight and any special color.
    Season, origin, price and the text template are drawn at random.

    Args:
        seed: optional seed. When given, a private ``random.Random(seed)`` is
            used so repeated runs produce identical rows; when None (the
            default) the module-level ``random`` state is used.

    Returns:
        list[dict]: one dict per image with the keys 'Label' (always 'Real'),
        'Fruit', 'Weight', 'Price', 'Season', 'Origin', 'Color', 'Text' and
        'Image_path'. 'Weight' and 'Price' are strings with two decimals and
        a comma as the decimal separator. Fruits whose folder is missing are
        skipped with a printed notice.
    """
    print("\nGENERATING REAL DATASET")
    rng = random if seed is None else random.Random(seed)
    image_exts = ('.png', '.jpg', '.jpeg', '.heic')
    real_data = []

    for fruit_key, config in FRUIT_CONFIGS.items():
        fruit_folder = os.path.join(IMAGES_ROOT, config['folder_name'], 'Real')
        if not os.path.isdir(fruit_folder):
            print(f"  Skipping {fruit_key}: 'Real' folder not found at {fruit_folder}")
            continue

        # Match on the real file extension (a substring test would also accept
        # names such as 'x.jpg.bak'). Sorting makes the row order, and hence the
        # random draws per file, independent of os.listdir's arbitrary order.
        image_files = sorted(
            f for f in os.listdir(fruit_folder)
            if f.lower().endswith(image_exts)
        )
        print(f"  Processing {fruit_key}: {len(image_files)} real images found.")

        # Per-fruit invariants, computed once instead of once per file.
        fallback_mean, _ = get_stats_from_real_weights(config['weights'])
        id_pattern = re.compile(rf"{config['regex_prefix']}(\d+)_")
        special_colors = config.get('special_colors', {})

        for filename in image_files:
            match = id_pattern.search(filename.lower())
            img_id = match.group(1) if match else None

            # Exact measured weight when the id is known; otherwise the mean of
            # this fruit's measured weights.
            weight_float = float(config['weights'].get(img_id, fallback_mean))

            # Attributes (uniform random)
            season = rng.choice(config['seasons'])
            origin = rng.choice(config['origins'])
            price_float = rng.uniform(config['price_range'][0], config['price_range'][1])

            # Color: a special color if the id is listed for one, else the default.
            color = config['default_color']
            if img_id is not None:
                for special_col, ids in special_colors.items():
                    if img_id in ids:
                        color = special_col

            # Formatting: two decimals, comma as decimal separator.
            weight_str = "{:.2f}".format(weight_float).replace('.', ',')
            price_str = "{:.2f}".format(price_float).replace('.', ',')

            # Text
            template = rng.choice(GENERIC_TEMPLATES)
            text = template.format(
                season=season, origin=origin, color_lower=color.lower(),
                price=price_str, weight=weight_str, color=color
            )

            real_data.append({
                'Label': 'Real', 'Fruit': fruit_key,
                'Weight': weight_str, 'Price': price_str, 'Season': season,
                'Origin': origin, 'Color': color, 'Text': text,
                'Image_path': os.path.join(fruit_folder, filename)
            })

    return real_data

In [None]:
# Warn early when the image root is not reachable; the notebook is not stopped.
if not os.path.exists(IMAGES_ROOT):
    print(
        f"ERROR: Could not find path: {IMAGES_ROOT}",
        "Please check the 'MY_DRIVE_PATH' variable.",
        sep="\n",
    )

In [None]:
# Make sure the output folder exists before any CSV is written.
os.makedirs(DATA_FOLDER, exist_ok=True)

# Build the rows for the real images and persist them as a ';'-separated CSV
# (utf-8-sig encoding), or report that nothing was found.
real_rows = generate_real_dataset()
if not real_rows:
    print("No Real data found.")
else:
    df_real = pd.DataFrame(real_rows)
    df_real.to_csv(REAL_CSV_FILE, sep=';', index=False, encoding='utf-8-sig')
    print(f"Saved Real Data to: {REAL_CSV_FILE}")


GENERATING REAL DATASET
  Processing tomato: 102 real images found.
  Processing tangerine: 126 real images found.
  Processing banana: 124 real images found.
  Processing apple: 123 real images found.
  Processing orange: 111 real images found.
Saved Real Data to: /content/drive/MyDrive/MLProject/Data google sheet/Real_Fruits_Data.csv


In [None]:
# This function scans the AI-generated image folders and builds the feature set (one row per image) for them.
def generate_ai_dataset(seed=None):
    """Build one attribute row for every AI-generated fruit image found on disk.

    For each fruit in FRUIT_CONFIGS the '<IMAGES_ROOT>/<folder_name>/AI Generated'
    folder is scanned for image files. Because AI images have no measured
    weight, a weight is sampled from a normal distribution whose mean and
    standard deviation come from the fruit's real measured weights, with a
    lower bound of 20.0. Season, origin, price and the text template are
    drawn at random.

    Args:
        seed: optional seed. When given, a private ``random.Random(seed)`` is
            used so repeated runs produce identical rows; when None (the
            default) the module-level ``random`` state is used.

    Returns:
        list[dict]: one dict per image with the keys 'Label' (always 'AI'),
        'Fruit', 'Weight', 'Price', 'Season', 'Origin', 'Color', 'Text' and
        'Image_path'. 'Weight' and 'Price' are strings with two decimals and
        a comma as the decimal separator. Fruits whose folder is missing are
        skipped with a printed notice.
    """
    print("\nGENERATING AI DATASET")
    rng = random if seed is None else random.Random(seed)
    image_exts = ('.png', '.jpg', '.jpeg', '.heic')
    ai_data = []

    for fruit_key, config in FRUIT_CONFIGS.items():
        fruit_folder = os.path.join(IMAGES_ROOT, config['folder_name'], 'AI Generated')
        if not os.path.isdir(fruit_folder):
            print(f"  Skipping {fruit_key}: 'AI Generated' folder not found at {fruit_folder}")
            continue

        # Sorting makes the row order, and hence the random draws per file,
        # independent of os.listdir's arbitrary order.
        image_files = sorted(
            f for f in os.listdir(fruit_folder)
            if f.lower().endswith(image_exts)
        )
        print(f"  Processing {fruit_key}: {len(image_files)} AI images found.")

        # Distribution parameters derived from the real measured weights.
        ai_mean, ai_std = get_stats_from_real_weights(config['weights'])

        for filename in image_files:
            # AI weight: normal draw, floored at 20.0 so that a tail sample
            # cannot produce a tiny or negative weight.
            weight_float = max(rng.gauss(ai_mean, ai_std), 20.0)

            # Attributes (uniform random)
            season = rng.choice(config['seasons'])
            origin = rng.choice(config['origins'])
            price_float = rng.uniform(config['price_range'][0], config['price_range'][1])

            # Color: a filename with more than one '_' is taken to carry a color
            # as its third '_'-separated token (assumed naming scheme — TODO
            # confirm). The token is capitalised ('green' -> 'Green') so it uses
            # the same spelling as the default colors and the real dataset;
            # upper-casing it would create a second category for the same color.
            color = config['default_color']
            if filename.count("_") > 1:
                parsed_color = filename.split("_")[2].split(".")[0].capitalize()
                if parsed_color:  # keep the default when the token is empty
                    color = parsed_color

            # Formatting: two decimals, comma as decimal separator.
            weight_str = "{:.2f}".format(weight_float).replace('.', ',')
            price_str = "{:.2f}".format(price_float).replace('.', ',')

            # Text
            template = rng.choice(GENERIC_TEMPLATES)
            text = template.format(
                season=season, origin=origin, color_lower=color.lower(),
                price=price_str, weight=weight_str, color=color
            )

            ai_data.append({
                'Label': 'AI', 'Fruit': fruit_key,
                'Weight': weight_str, 'Price': price_str, 'Season': season,
                'Origin': origin, 'Color': color, 'Text': text,
                'Image_path': os.path.join(fruit_folder, filename)
            })

    return ai_data



In [None]:
# Build the rows for the AI-generated images and persist them as a
# ';'-separated CSV (utf-8-sig encoding), or report that nothing was found.
ai_rows = generate_ai_dataset()
if not ai_rows:
    print("No AI data found.")
else:
    df_ai = pd.DataFrame(ai_rows)
    df_ai.to_csv(AI_CSV_FILE, sep=';', index=False, encoding='utf-8-sig')
    print(f"Saved AI Data to: {AI_CSV_FILE}")


GENERATING AI DATASET
  Processing tomato: 526 AI images found.
  Processing tangerine: 542 AI images found.
  Processing banana: 532 AI images found.
  Processing apple: 499 AI images found.
  Processing orange: 497 AI images found.
Saved AI Data to: /content/drive/MyDrive/MLProject/Data google sheet/AI_Fruits_Data.csv
