In [None]:
# Mount Google Drive at /content/drive so the cells below can read the
# dataset files and write the extracted features under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load split data: locations of the train / validation / test parquet files
train_path = "/content/drive/MyDrive/GitHub_Repos/CS610-Product-Image-Text-Consistency-Detection-System-for-E-commerce/amazon_meta_data/split_data/train_data.parquet"
val_path = "/content/drive/MyDrive/GitHub_Repos/CS610-Product-Image-Text-Consistency-Detection-System-for-E-commerce/amazon_meta_data/split_data/val_data.parquet"
test_path = "/content/drive/MyDrive/GitHub_Repos/CS610-Product-Image-Text-Consistency-Detection-System-for-E-commerce/amazon_meta_data/split_data/test_data.parquet"

# Read the three splits, in train -> val -> test order, into dataframes
train_df, val_df, test_df = (
    pd.read_parquet(split_path)
    for split_path in (train_path, val_path, test_path)
)

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from skimage.feature import hog
from skimage import io, color, transform
from sklearn.decomposition import PCA
import cv2
from tqdm import tqdm
import joblib
import os
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing

def extract_image_features(image_path):
    """Build one flat descriptor (HOG + per-channel color histogram) for an image.

    Parameters:
    image_path: Path of the image file, read with cv2.imread.

    Returns:
    1-D numpy array holding the HOG vector followed by three 32-bin
    histograms (R, G, B order), or None when the file cannot be read or
    any processing step raises.
    """
    try:
        bgr = cv2.imread(image_path)
        # A None result means the file could not be read; skip this image
        if bgr is None:
            return None

        # OpenCV loads images as BGR; reorder to RGB so histogram channels are R, G, B
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

        # Shape descriptor: HOG computed on the grayscale image
        gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
        shape_descriptor = hog(
            gray,
            orientations=9,
            pixels_per_cell=(8, 8),
            cells_per_block=(2, 2),
            visualize=False,
        )

        # Color descriptor: a 32-bin histogram over [0, 256) for each channel
        channel_hists = [
            cv2.calcHist([rgb], [channel], None, [32], [0, 256]).flatten()
            for channel in range(3)
        ]
        color_descriptor = np.array(
            [bin_count for hist in channel_hists for bin_count in hist]
        )

        # Final descriptor = shape part followed by color part
        return np.concatenate([shape_descriptor, color_descriptor])

    except Exception as e:
        # Best effort: report the failing image and let the caller drop it
        print(f"Error processing image {image_path}: {e}")
        return None


def preprocess_for_tree_ensemble(train_df, val_df, test_df,
                                 text_column='text_for_tfidf',
                                 image_column='processed_image_path',
                                 output_dir='tree_ensemble_features',
                                 n_jobs=-1,
                                 label_column='is_match'):
    """
    Extract and prepare features for decision tree ensemble model.

    Text is vectorized with TF-IDF fitted on the training split only. Images
    are described with extract_image_features and reduced with a PCA fitted
    on the training split only. Both parts are concatenated column-wise
    (text first, image second). Rows whose image yields no features are
    dropped from their split. The fitted vectorizer and PCA, the feature and
    label arrays, and the surviving rows of each split are written to
    output_dir.

    Rows of every output keep the order of the input dataframe, so repeated
    runs produce identically ordered files no matter which worker process
    finishes first.

    Parameters:
    train_df, val_df, test_df: Split datasets
    text_column: Name of the column containing preprocessed text
    image_column: Name of the column containing image paths
    output_dir: Directory to save extracted features
    n_jobs: Number of worker processes for image extraction
            (-1 uses multiprocessing.cpu_count())
    label_column: Name of the column containing the target label

    Returns:
    dict: Dictionary containing all features and vectorizers, or None when no
          training image could be processed. The validation/test entries are
          None when that split has no usable image.
    """
    if n_jobs == -1:
        n_jobs = multiprocessing.cpu_count()

    os.makedirs(output_dir, exist_ok=True)

    print("Preparing features for decision tree ensemble model...")

    # ==== 1. Text Feature Extraction ====
    print("\n1. Text Feature Extraction - TF-IDF")

    tfidf_vectorizer = TfidfVectorizer(
        max_features=5000,  # Limit the number of features to avoid dimensional explosion
        min_df=5,           # Appear in at least 5 documents
        max_df=0.85,        # Appear in no more than 85% of documents
        ngram_range=(1, 2)  # Use 1-gram and 2-gram
    )

    # Fit on the training split only; validation/test are transformed with that vocabulary
    print("  Fitting TF-IDF vectorizer...")
    X_text_train = tfidf_vectorizer.fit_transform(train_df[text_column])
    X_text_val = tfidf_vectorizer.transform(val_df[text_column])
    X_text_test = tfidf_vectorizer.transform(test_df[text_column])

    print(f"  Text feature shape: {X_text_train.shape[1]} dimensions")

    joblib.dump(tfidf_vectorizer, os.path.join(output_dir, 'tfidf_vectorizer.pkl'))

    # ==== 2. Image Feature Extraction ====
    print("\n2. Image Feature Extraction - HOG and Color Histogram")

    def process_dataset_images(df):
        """Return (feature matrix, kept row positions) for df, both in row order."""
        image_paths = df[image_column].tolist()

        # Slot i receives the descriptor of row i. Futures complete in arbitrary
        # order, so filling by position keeps the output order deterministic.
        results = [None] * len(image_paths)

        with ProcessPoolExecutor(max_workers=n_jobs) as executor:
            futures = {executor.submit(extract_image_features, path): i
                       for i, path in enumerate(image_paths)}

            with tqdm(total=len(futures), desc="Extracting Image Features") as pbar:
                for future in as_completed(futures):
                    results[futures[future]] = future.result()
                    pbar.update(1)

        # Rows whose extraction returned None are dropped
        valid_indices = [i for i, feat in enumerate(results) if feat is not None]
        if valid_indices:
            return np.vstack([results[i] for i in valid_indices]), valid_indices
        return np.array([]), []

    print("  Extracting training set image features...")
    X_img_train, train_valid_indices = process_dataset_images(train_df)

    # Without any training image neither the PCA nor a model can be fitted
    if len(X_img_train) == 0:
        print("Error: Unable to extract valid image features from the training set")
        return None

    # PCA cannot have more components than samples or original dimensions
    print("  Applying PCA on image features...")
    pca = PCA(n_components=min(300, X_img_train.shape[1], X_img_train.shape[0]),
              random_state=42)
    X_img_train_pca = pca.fit_transform(X_img_train)
    joblib.dump(pca, os.path.join(output_dir, 'image_pca.pkl'))

    # One entry per split:
    # (name, dataframe, text matrix, reduced image matrix or None, kept row positions)
    extracted = [('train', train_df, X_text_train, X_img_train_pca, train_valid_indices)]
    for name, label, df, X_text in (('val', 'validation', val_df, X_text_val),
                                    ('test', 'test', test_df, X_text_test)):
        print(f"  Extracting {label} set image features...")
        X_img, valid_indices = process_dataset_images(df)
        X_img_pca = pca.transform(X_img) if len(X_img) > 0 else None
        extracted.append((name, df, X_text, X_img_pca, valid_indices))

    # ==== 3. Feature Fusion ====
    print("\n3. Fusion of Text and Image Features")

    # name -> (combined features, labels, surviving rows); all None for an unusable split
    fused = {}
    for name, df, X_text, X_img_pca, valid_indices in extracted:
        if X_img_pca is None:
            fused[name] = (None, None, None)
            continue
        df_valid = df.iloc[valid_indices].reset_index(drop=True)
        # Simple concatenation; toarray() turns the sparse TF-IDF rows into a dense block
        X_combined = np.hstack([X_text[valid_indices].toarray(), X_img_pca])
        fused[name] = (X_combined, df_valid[label_column].values, df_valid)

    X_train_combined, y_train, train_df_valid = fused['train']
    X_val_combined, y_val, val_df_valid = fused['val']
    X_test_combined, y_test, test_df_valid = fused['test']

    print(f"  Final feature dimensions: {X_train_combined.shape[1]}")

    # Save features and labels (splits without usable images are skipped)
    print("\nSaving processed features...")
    for name, (X_combined, y, _) in fused.items():
        if X_combined is not None:
            np.save(os.path.join(output_dir, f'X_{name}.npy'), X_combined)
            np.save(os.path.join(output_dir, f'y_{name}.npy'), y)

    # Save the rows that survived image extraction, aligned with the arrays above
    for name, (_, _, df_valid) in fused.items():
        if df_valid is not None:
            df_valid.to_csv(os.path.join(output_dir, f'{name}_df_valid.csv'), index=False)

    print("\nDecision tree ensemble model feature preparation completed!")

    return {
        'X_train': X_train_combined,
        'y_train': y_train,
        'X_val': X_val_combined,
        'y_val': y_val,
        'X_test': X_test_combined,
        'y_test': y_test,
        'train_df': train_df_valid,
        'val_df': val_df_valid,
        'test_df': test_df_valid,
        'tfidf_vectorizer': tfidf_vectorizer,
        'pca': pca
    }


# Run the full extraction pipeline on the three splits and persist the
# resulting artifacts to the project folder on Drive.
tree_ensemble_features = preprocess_for_tree_ensemble(
    train_df,
    val_df,
    test_df,
    output_dir='/content/drive/MyDrive/GitHub_Repos/CS610-Product-Image-Text-Consistency-Detection-System-for-E-commerce/amazon_meta_data/tree_ensemble_features',
)

In [None]:
import numpy as np
import pandas as pd
import joblib
import os

def load_tree_ensemble_features(output_dir):
    """
    Load previously saved feature files from the specified directory and reconstruct the tree_ensemble_features dictionary.

    The returned dictionary always contains the same eleven keys
    ('tfidf_vectorizer', 'pca', 'X_train', 'y_train', 'X_val', 'y_val',
    'X_test', 'y_test', 'train_df', 'val_df', 'test_df'). An artifact whose
    file is missing is reported on stdout and left as None, so callers can
    test every entry with a plain `is None` check.

    Parameters:
    output_dir: Directory where the feature files were previously saved.

    Returns:
    dict: A dictionary containing all the features and vectorizers.
    """
    print(f"Loading feature files from {output_dir}...")

    # Pre-populate every key so the schema does not depend on which files exist
    tree_ensemble_features = dict.fromkeys((
        'tfidf_vectorizer', 'pca',
        'X_train', 'y_train',
        'X_val', 'y_val',
        'X_test', 'y_test',
        'train_df', 'val_df', 'test_df',
    ))

    # 1-2. Load the fitted TF-IDF vectorizer and image PCA model.
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code; only point output_dir at a directory you trust.
    for key, filename, loading_name, missing_name in (
        ('tfidf_vectorizer', 'tfidf_vectorizer.pkl', 'TF-IDF vectorizer', 'TF-IDF vectorizer'),
        ('pca', 'image_pca.pkl', 'image PCA model', 'PCA model'),
    ):
        model_path = os.path.join(output_dir, filename)
        if os.path.exists(model_path):
            print(f"Loading {loading_name}...")
            tree_ensemble_features[key] = joblib.load(model_path)
        else:
            print(f"Warning: {missing_name} file not found at {model_path}")

    # 3-5. Load features and labels; a split is loaded only when both files exist
    for split, title in (('train', 'Training'), ('val', 'Validation'), ('test', 'Test')):
        X_path = os.path.join(output_dir, f'X_{split}.npy')
        y_path = os.path.join(output_dir, f'y_{split}.npy')

        if os.path.exists(X_path) and os.path.exists(y_path):
            print(f"Loading {title.lower()} features and labels...")
            tree_ensemble_features[f'X_{split}'] = np.load(X_path)
            tree_ensemble_features[f'y_{split}'] = np.load(y_path)
            print(f"  {title} feature shape: {tree_ensemble_features[f'X_{split}'].shape}")
        elif split == 'train':
            # Missing training data is reported as a warning, not a note
            print("Warning: Training features or labels file not found")
        else:
            print(f"Note: {title} features or labels file not found, setting as None")

    # 6. Load the dataframes holding the rows that match the feature arrays
    for split, title in (('train', 'Training'), ('val', 'Validation'), ('test', 'Test')):
        df_path = os.path.join(output_dir, f'{split}_df_valid.csv')

        if os.path.exists(df_path):
            print(f"Loading {title.lower()} dataframe...")
            tree_ensemble_features[f'{split}_df'] = pd.read_csv(df_path)
        elif split == 'train':
            print(f"Warning: Training dataframe file not found at {df_path}")
        else:
            print(f"Note: {title} dataframe file not found, setting as None")

    print("\nLoading complete!")
    return tree_ensemble_features

# Rebuild the feature dictionary from the artifacts saved on Drive, so the
# extraction cell above does not have to be re-run.
tree_ensemble_data = load_tree_ensemble_features(
    output_dir='/content/drive/MyDrive/GitHub_Repos/CS610-Product-Image-Text-Consistency-Detection-System-for-E-commerce/amazon_meta_data/tree_ensemble_features',
)

Loading feature files from /content/drive/MyDrive/amazon_meta_data/tree_ensemble_features...
Loading TF-IDF vectorizer...
Loading image PCA model...
Loading training features and labels...
  Training feature shape: (35923, 5300)
Loading validation features and labels...
  Validation feature shape: (5132, 5300)
Loading test features and labels...
  Test feature shape: (10265, 5300)
Loading training dataframe...
Loading validation dataframe...
Loading test dataframe...

Loading complete!
