In [10]:
import os
import cv2
import numpy as np
import pandas as pd
from PIL import Image
from rembg import remove
from skimage.feature import graycomatrix, graycoprops
from sklearn.preprocessing import normalize

# ---------------------- STEP 1: Video Frame Extraction ----------------------
# Samples frames from the video, crops a 720x720 window out of each sampled
# frame and removes the background with rembg. The whole step is skipped when
# the cleaned output folder already exists.

# Define the video path
video_path = r"C:\Users\Muralish\Desktop\DSGP\GemAppraisal-DSGP\notebook\Notebook-Norman\Test_Video\S19065.mp4"

# Define frames extraction rate
frames_per_second = 5

# Extract video name (without extension)
video_name = os.path.splitext(os.path.basename(video_path))[0]

# Define output directories
video_folder = os.path.dirname(video_path)
frames_folder = os.path.join(video_folder, video_name)
cleaned_video_folder = os.path.join(video_folder, "cleaned", video_name)

# Skip processing if the cleaned folder already exists
if not os.path.exists(cleaned_video_folder):
    os.makedirs(frames_folder, exist_ok=True)

    # Open the video file for frame extraction
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        # Fail before the cleaned folder is created; otherwise an empty
        # cleaned folder would make every later run skip extraction.
        cap.release()
        raise IOError(f"Unable to open video: {video_path}")

    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        # Keep every N-th frame. Clamp to 1 so a video slower than the
        # requested rate does not produce a zero interval (modulo by zero).
        frame_interval = max(1, int(fps / frames_per_second)) if fps > 0 else 1

        frame_count = 0
        success, frame = cap.read()
        while success:
            if frame_count % frame_interval == 0:
                frame_file = os.path.join(frames_folder, f"frame_{frame_count}.jpg")
                cv2.imwrite(frame_file, frame)
                print(f"Extracted frame: {frame_file}")

            success, frame = cap.read()
            frame_count += 1
    finally:
        # Release the capture handle even if reading or writing fails.
        cap.release()

    # Process each extracted frame
    os.makedirs(cleaned_video_folder, exist_ok=True)
    for frame_file in os.listdir(frames_folder):
        if not frame_file.endswith(".jpg"):
            continue

        frame_path = os.path.join(frames_folder, frame_file)

        # The context manager closes the underlying file handle per frame.
        with Image.open(frame_path) as img:
            # Vertically centred 720 px band; horizontally the window is
            # fixed to x in [0, 720).
            # NOTE(review): presumably the source frames are 720 px wide - confirm.
            left = 0
            top = max(0, (img.height - 720) / 2)
            right = 720
            bottom = top + 720

            cropped_img = img.crop((left, top, right, bottom))

            # Background removal (rembg).
            final_image = remove(cropped_img)

        # Swap only the extension, not every ".jpg" occurrence in the name.
        cleaned_name = os.path.splitext(frame_file)[0] + "_cleaned.png"
        cleaned_frame_path = os.path.join(cleaned_video_folder, cleaned_name)
        final_image.save(cleaned_frame_path)
        print(f"Processed frame saved: {cleaned_frame_path}")

# ---------------------- STEP 2: Feature Extraction ----------------------

# Color Feature Extraction
def extract_color_features(image_path):
    """Build per-channel colour statistics for one image.

    The image is read with OpenCV, passed through rembg's ``remove`` and
    converted from RGBA to RGB. The alpha channel is discarded by that
    conversion and no mask is applied, so the statistics are taken over
    every pixel of the image.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A dict with the file name under ``"Image"``, the mean of each RGB
        channel, and a 256-bin L1-normalised histogram per channel; ``None``
        if the image cannot be read.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        print(f"Error loading image: {image_path}")
        return None

    # rembg is fed a PIL image, so convert from OpenCV's BGR layout first.
    cutout = np.array(remove(Image.fromarray(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))))
    rgb = cv2.cvtColor(cutout, cv2.COLOR_RGBA2RGB)

    channels = (("Red", "R"), ("Green", "G"), ("Blue", "B"))

    record = {"Image": os.path.basename(image_path)}

    # Channel means come first so the column order stays
    # Image, Avg *, then the R/G/B histogram bins.
    for idx, (colour, _) in enumerate(channels):
        record[f"Avg {colour}"] = np.mean(rgb[:, :, idx])

    for idx, (_, tag) in enumerate(channels):
        counts = cv2.calcHist([rgb], [idx], None, [256], [0, 256])
        # calcHist returns a (256, 1) column; normalise it to sum to 1.
        shares = normalize(counts, axis=0, norm='l1').flatten()
        for bin_no, share in enumerate(shares):
            record[f'{tag} Hist Bin {bin_no}'] = share

    return record

# Cut Feature Extraction
def extract_geometric_features(image_path):
    """Measure shape descriptors of the largest outline found in an image.

    Edges are detected with Canny on a Gaussian-blurred grayscale copy, and
    the external contour with the largest area is measured. Edge sharpness
    and symmetry are computed on the whole grayscale image, not on the
    contour.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A dict with ``Image``, ``Aspect_Ratio``, ``Perimeter``, ``Area``,
        ``Circularity``, ``Convexity``, ``Edge_Sharpness`` and ``Symmetry``;
        ``None`` if the image cannot be read or no contour is found.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        print(f"Warning: Unable to read image {image_path}. Skipping.")
        return None

    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    edge_map = cv2.Canny(cv2.GaussianBlur(gray, (5, 5), 0), threshold1=50, threshold2=150)
    outlines, _ = cv2.findContours(edge_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not outlines:
        return None

    largest = max(outlines, key=cv2.contourArea)

    # Bounding-box proportions.
    _, _, box_w, box_h = cv2.boundingRect(largest)
    aspect_ratio = 0
    if box_h != 0:
        aspect_ratio = float(box_w) / box_h

    # 4*pi*A / P^2 is 1 for a perfect circle.
    outline_length = cv2.arcLength(largest, True)
    outline_area = cv2.contourArea(largest)
    circularity = 0
    if outline_length != 0:
        circularity = (4 * np.pi * outline_area) / (outline_length ** 2)

    # Ratio of the contour area to the area of its convex hull.
    hull_area = cv2.contourArea(cv2.convexHull(largest))
    convexity = 0
    if hull_area != 0:
        convexity = outline_area / hull_area

    def _mirror_score(flip_code):
        # 1 minus the mean absolute difference (scaled to 0..1) between the
        # grayscale image and its mirrored copy.
        mirrored = cv2.flip(gray, flip_code)
        return 1 - (np.mean(cv2.absdiff(gray, mirrored)) / 255)

    # flip code 1 mirrors left-right, flip code 0 mirrors top-bottom.
    symmetry = (_mirror_score(1) + _mirror_score(0)) / 2

    return {
        'Image': os.path.basename(image_path),
        'Aspect_Ratio': aspect_ratio,
        'Perimeter': outline_length,
        'Area': outline_area,
        'Circularity': circularity,
        'Convexity': convexity,
        'Edge_Sharpness': cv2.Laplacian(gray, cv2.CV_64F).var(),
        'Symmetry': symmetry,
    }

# Clarity Feature Extraction
def extract_clarity_features(image_path):
    """Compute texture, edge and colour-spread statistics for one image.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A dict with four GLCM properties (``Contrast``, ``Homogeneity``,
        ``Energy``, ``Correlation``), ``Edge_Density``,
        ``Intensity_Variance``, ``Hue_Std``, ``Saturation_Std`` and the file
        name under ``Image``; ``None`` if the image cannot be read.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        return None

    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

    # Single-offset co-occurrence matrix: distance 1, angle 0.
    glcm = graycomatrix(gray, distances=[1], angles=[0], levels=256, symmetric=True, normed=True)
    texture = {
        label: graycoprops(glcm, label.lower())[0, 0]
        for label in ('Contrast', 'Homogeneity', 'Energy', 'Correlation')
    }

    edge_map = cv2.Canny(gray, threshold1=50, threshold2=150)
    rows, cols = gray.shape
    hsv = cv2.cvtColor(bgr, cv2.COLOR_BGR2HSV)

    return {
        **texture,
        # Sum of the raw edge-map values per pixel.
        # NOTE(review): not a 0..1 fraction if edge pixels are stored as 255 - confirm.
        'Edge_Density': np.sum(edge_map) / (rows * cols),
        'Intensity_Variance': np.var(gray),
        'Hue_Std': np.std(hsv[:, :, 0]),
        'Saturation_Std': np.std(hsv[:, :, 1]),
        'Image': os.path.basename(image_path),
    }

# Process Images and Save Features
def process_images(input_folder, output_folder):
    """Extract colour, cut and clarity features for every image in a folder.

    Each ``.png``/``.jpg``/``.jpeg`` file in ``input_folder`` becomes one row
    of ``combined_features.csv`` inside ``output_folder``. Files are visited
    in sorted name order so the CSV is reproducible between runs
    (``os.listdir`` makes no ordering promise). A file for which no extractor
    returns anything is skipped rather than written as an empty row.

    Args:
        input_folder: Folder containing the images to analyse.
        output_folder: Folder that receives ``combined_features.csv``; it is
            created if missing.
    """
    extractors = (
        extract_color_features,
        extract_geometric_features,
        extract_clarity_features,
    )
    extracted_features = []  # One feature dictionary per image

    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(input_folder, filename)

        # Merge in extractor order; an extractor that returns None
        # contributes nothing, leaving its columns empty for that row.
        combined_features = {}
        for extractor in extractors:
            combined_features.update(extractor(image_path) or {})

        if not combined_features:
            print(f"No features extracted, skipping: {image_path}")
            continue

        extracted_features.append(combined_features)

    # Convert to DataFrame
    df = pd.DataFrame(extracted_features)

    # Ensure output directory exists
    os.makedirs(output_folder, exist_ok=True)

    # Save to CSV
    output_file = os.path.join(output_folder, 'combined_features.csv')
    df.to_csv(output_file, index=False)
    print(f"Features saved to {output_file}")

# Extract features from the cleaned frames; the CSV is written to a
# "feature_extraction" sub-folder of the cleaned frames folder.
process_images(
    input_folder=cleaned_video_folder,
    output_folder=os.path.join(cleaned_video_folder, "feature_extraction"),
)

import os
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# ---------------------- STEP 1: Apply PCA Transformation ----------------------
# Standardises the extracted features, projects them onto 30 principal
# components and writes the result to pca_results.csv.
# NOTE(review): the scaler and PCA are fitted on this video's frames only. If
# the downstream model was trained on components from a different fit, these
# columns may not be comparable - confirm against the training pipeline.

n_components = 30
pca_columns = [str(i) for i in range(n_components)]

# Load the extracted feature dataset
feature_file = os.path.join(cleaned_video_folder, "feature_extraction", "combined_features.csv")
data = pd.read_csv(feature_file)

# Step 1: Remove rows with NaN values (if any). The index is reset so the
# surviving rows line up positionally with the PCA output built below.
data_clean = data.dropna().reset_index(drop=True)

# Step 2: Drop the 'Image' column if it exists (since it's non-numeric)
if 'Image' in data_clean.columns:
    features = data_clean.drop(columns=['Image'])
else:
    features = data_clean.copy()

# PCA cannot return more components than min(n_samples, n_features); fail
# with an actionable message instead of an opaque error from the fit.
n_samples, n_features = features.shape
if min(n_samples, n_features) < n_components:
    raise ValueError(
        f"PCA needs at least {n_components} complete rows and {n_components} feature columns, "
        f"but {feature_file} provides {n_samples} rows and {n_features} columns."
    )

# Step 3: Normalize the features (Standardization)
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Step 4: Apply PCA to retain 30 components
pca = PCA(n_components=n_components)
features_pca = pca.fit_transform(features_scaled)

# Convert PCA results to a DataFrame with column names (0 to 29)
pca_df = pd.DataFrame(features_pca, columns=pca_columns)

# Save PCA results
pca_output_path = os.path.join(cleaned_video_folder, "feature_extraction", "pca_results.csv")
pca_df.to_csv(pca_output_path, index=False)
print(f"PCA results saved at {pca_output_path}")

# ---------------------- STEP 2: Reformat Dataset and Place PCA Columns at CO to DR ----------------------
# Builds a frame with at least 92 leading columns followed by the PCA columns.

# Start from the rows PCA was fitted on, so each PCA row is concatenated with
# the feature row it was computed from.
data = data_clean.drop(columns=pca_columns, errors='ignore')

# Remove all columns before CO (keeping placeholders for structure)
columns_to_keep = list(data.columns)
if "CO" in columns_to_keep:
    co_index = columns_to_keep.index("CO")
    data = data.iloc[:, co_index:].copy()  # Keep everything from CO onwards
else:
    # No CO column: keep no feature columns, but keep one row per PCA row.
    data = pd.DataFrame(index=pca_df.index)

# Remove all columns after DR
if "DR" in data.columns:
    dr_index = data.columns.get_loc("DR") + 1
    data = data.iloc[:, :dr_index].copy()

# Ensure there are at least 92 columns before appending PCA data
for position in range(len(data.columns), 92):
    data[f'Placeholder_{position}'] = None  # Empty placeholder column

# Append PCA columns after the leading columns (92nd column onward)
data_final = pd.concat([data, pca_df], axis=1)

# Save the final structured dataset
pca_updated_path = os.path.join(cleaned_video_folder, "feature_extraction", "pca_updated.csv")
data_final.to_csv(pca_updated_path, index=False)
print(f"PCA results moved to columns CO to DR, relabeled to 0-29, and saved at {pca_updated_path}")


import os
import pandas as pd
import joblib

# ---------------------- STEP 3: Load and Predict with Trained Model ----------------------
# Runs the saved model on the last 30 columns of pca_updated.csv and writes
# the predictions, labelled with the training set's target column names.

# Define file paths
pca_updated_path = os.path.join(cleaned_video_folder, "feature_extraction", "pca_updated.csv")
model_file = r'C:\Users\Muralish\Desktop\Sapphires_Cleaned\combine\rf_model.joblib'
training_data_csv = r'C:\Users\Muralish\Desktop\Sapphires_Cleaned\combine\updated_encoded.csv'
output_csv = os.path.join(cleaned_video_folder, "feature_extraction", "predicted_targets.csv")

# Step 1: Load the trained model
# NOTE: joblib.load unpickles arbitrary objects; only load model files from a trusted source.
model = joblib.load(model_file)

# Step 2: Load the PCA-updated dataset for prediction
data = pd.read_csv(pca_updated_path)

# Step 3: Load the training dataset to extract correct target column names
training_data = pd.read_csv(training_data_csv)

# Target column names: positions 1 through 91 of the training CSV (91 columns)
target_columns = training_data.columns[1:92].tolist()

# Step 4: Select only the 30 PCA features used for training
expected_feature_count = 30
data = data.iloc[:, -expected_feature_count:]  # Keep only the last 30 columns

# Step 5: Make predictions using the trained model
predictions = model.predict(data)
if predictions.ndim == 1:
    # A single-output model returns a 1-D array; promote it to one column so
    # the column-count logic below does not fail on shape[1].
    predictions = predictions.reshape(-1, 1)

# Step 6: Convert predictions to DataFrame with correct target column names
if predictions.shape[1] == len(target_columns):
    predicted_df = pd.DataFrame(predictions, columns=target_columns)
else:
    print("Warning: Number of predicted columns does not match expected column names. Adjusting accordingly.")
    # Keep only as many columns as both the predictions and the names provide.
    shared_count = min(predictions.shape[1], len(target_columns))
    predicted_df = pd.DataFrame(predictions[:, :shared_count], columns=target_columns[:shared_count])

# Step 7: Save predictions to a new CSV file
predicted_df.to_csv(output_csv, index=False)

print(f"Predicted target labels saved to {output_csv}")


import pandas as pd

# ---------------------- STEP 4: Extract Final Top Targets ----------------------
# Collapses the per-frame predictions into one row: for each category, the
# attribute whose column holds the most 1s across all frames.

# Define file paths
predicted_csv = os.path.join(cleaned_video_folder, "feature_extraction", "predicted_targets.csv")
final_output_csv = os.path.join(cleaned_video_folder, "feature_extraction", "final_top_targets.csv")

# Step 1: Load the predicted target dataset
data = pd.read_csv(predicted_csv)

# Step 2: Define categories and their respective prefixes
categories = {
    "Color": "Color_",
    "Shape": "Shape_",
    "Cut": "Cut_",
    "Clarity": "Clarity_",
    "Color Intensity": "Color Intensity_"
}

# Step 3: Find the most frequent `1` column in each category
selected_values = {}
for category, prefix in categories.items():
    # Default for every category, so the output always has one column per
    # category even when no prediction column carries its prefix.
    selected_values[category] = "Unknown"

    category_columns = [col for col in data.columns if col.startswith(prefix)]
    if not category_columns:
        continue

    ones_count = data[category_columns].eq(1).sum()  # Count occurrences of `1` per column
    if ones_count.max() > 0:  # At least one frame voted for some attribute
        top_column = ones_count.idxmax()  # Column with the most `1`s
        # Strip only the leading prefix; str.replace would also remove any
        # later occurrence of the prefix text inside the attribute name.
        selected_values[category] = top_column[len(prefix):]

# Step 4: Create a single-row DataFrame with the selected attribute values
final_data = pd.DataFrame([selected_values])

# Step 5: Save the result to a new CSV file
final_data.to_csv(final_output_csv, index=False)

print(f"Final top target attributes saved in {final_output_csv}")



Extracted frame: C:\Users\Muralish\Desktop\DSGP\GemAppraisal-DSGP\notebook\Notebook-Norman\Test_Video\S19065\frame_0.jpg
Extracted frame: C:\Users\Muralish\Desktop\DSGP\GemAppraisal-DSGP\notebook\Notebook-Norman\Test_Video\S19065\frame_4.jpg
Extracted frame: C:\Users\Muralish\Desktop\DSGP\GemAppraisal-DSGP\notebook\Notebook-Norman\Test_Video\S19065\frame_8.jpg
Extracted frame: C:\Users\Muralish\Desktop\DSGP\GemAppraisal-DSGP\notebook\Notebook-Norman\Test_Video\S19065\frame_12.jpg
Extracted frame: C:\Users\Muralish\Desktop\DSGP\GemAppraisal-DSGP\notebook\Notebook-Norman\Test_Video\S19065\frame_16.jpg
Extracted frame: C:\Users\Muralish\Desktop\DSGP\GemAppraisal-DSGP\notebook\Notebook-Norman\Test_Video\S19065\frame_20.jpg
Extracted frame: C:\Users\Muralish\Desktop\DSGP\GemAppraisal-DSGP\notebook\Notebook-Norman\Test_Video\S19065\frame_24.jpg
Extracted frame: C:\Users\Muralish\Desktop\DSGP\GemAppraisal-DSGP\notebook\Notebook-Norman\Test_Video\S19065\frame_28.jpg
Extracted frame: C:\Users\M