In [1]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
    DRIVE_MOUNTED = True
    print("‚úÖ Google Drive mounted!")
except:
    DRIVE_MOUNTED = False
    print("‚ÑπÔ∏è Google Drive not available")

Mounted at /content/drive
‚úÖ Google Drive mounted!


In [2]:
! pip install openai-clip

In [None]:
! pip install pytesseract

In [1]:
# ====================================================================
# GREENTEXT CLIP PREPROCESSING PIPELINE - FIXED VERSION
# Purpose: Preprocess scraped greentext data for CLIP model training
# ====================================================================

import pandas as pd
import numpy as np
import torch
import clip
from PIL import Image, ImageEnhance, ImageFilter
import cv2
import pytesseract
import re
import json
import os
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle
import warnings
warnings.filterwarnings('ignore')

# Install required packages
import subprocess
import sys

def install_packages():
    packages = [
        'opencv-python', 'pytesseract', 'transformers',
        'torch', 'torchvision', 'ftfy', 'regex', 'tqdm',
        'scikit-learn', 'matplotlib', 'seaborn'
    ]
    for package in packages:
        try:
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', package, '-q'])
        except:
            print(f"‚ö†Ô∏è Could not install {package}")

    # Install CLIP
    try:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'git+https://github.com/openai/CLIP.git', '-q'])
        print("‚úÖ CLIP installed successfully!")
    except:
        print("‚ùå Failed to install CLIP")

print("üì¶ Installing required packages...")
install_packages()

class GreentextCLIPPreprocessor:
    """Complete preprocessing pipeline for greentext images and CLIP training"""

    def __init__(self, dataset_path: str, images_path: str):
        """
        Initialize the preprocessor

        Args:
            dataset_path: Path to the CSV file with scraped data
            images_path: Path to the directory containing downloaded images
        """
        self.dataset_path = Path(dataset_path)
        self.images_path = Path(images_path)
        self.processed_path = self.dataset_path.parent / "processed"
        self.processed_path.mkdir(exist_ok=True)

        # Load dataset
        self.df = pd.read_csv(dataset_path)
        print(f"üìä Loaded dataset with {len(self.df)} records")

        # Initialize CLIP model for preprocessing
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"üñ•Ô∏è Using device: {self.device}")

        try:
            self.clip_model, self.clip_preprocess = clip.load("ViT-B/32", device=self.device)
            print("‚úÖ CLIP model loaded successfully")
        except Exception as e:
            print(f"‚ùå Failed to load CLIP: {e}")
            self.clip_model = None

    def analyze_dataset_quality(self):
        """Analyze the quality and characteristics of the scraped dataset"""
        print("\nüìä DATASET QUALITY ANALYSIS")
        print("=" * 50)

        # Basic statistics
        successful_downloads = self.df[self.df['download_success'] == True]
        print(f"‚úÖ Successfully downloaded images: {len(successful_downloads)}")
        print(f"‚ùå Failed downloads: {len(self.df) - len(successful_downloads)}")

        if len(successful_downloads) == 0:
            print("‚ö†Ô∏è No successful downloads found!")
            return None

        # Score distribution
        print(f"\nüìà Score Statistics:")
        print(f"   Average score: {successful_downloads['score'].mean():.1f}")
        print(f"   Median score: {successful_downloads['score'].median():.1f}")
        print(f"   Score range: {successful_downloads['score'].min()} - {successful_downloads['score'].max()}")

        # Image dimensions analysis
        if 'dimensions' in successful_downloads.columns:
            dims = successful_downloads['dimensions'].apply(eval)  # Convert string to tuple
            widths = dims.apply(lambda x: x[0])
            heights = dims.apply(lambda x: x[1])

            print(f"\nüñºÔ∏è Image Dimensions:")
            print(f"   Average width: {widths.mean():.0f}px")
            print(f"   Average height: {heights.mean():.0f}px")
            print(f"   Width range: {widths.min()} - {widths.max()}")
            print(f"   Height range: {heights.min()} - {heights.max()}")

        # Title analysis
        title_lengths = successful_downloads['title'].str.len()
        print(f"\nüìù Title Analysis:")
        print(f"   Average title length: {title_lengths.mean():.1f} characters")
        print(f"   Title length range: {title_lengths.min()} - {title_lengths.max()}")

        return successful_downloads

    def extract_text_from_image(self, image_path: str) -> Dict[str, str]:
        """
        Extract text from greentext image using OCR

        Args:
            image_path: Path to the image file

        Returns:
            Dictionary containing extracted text and metadata
        """
        try:
            # Load image
            image = cv2.imread(str(image_path))
            if image is None:
                return {"text": "", "confidence": 0, "error": "Could not load image"}

            # Preprocess image for better OCR
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

            # Enhance contrast
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
            enhanced = clahe.apply(gray)

            # Apply gaussian blur to reduce noise
            blurred = cv2.GaussianBlur(enhanced, (1, 1), 0)

            # Threshold to get binary image
            _, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

            # OCR configuration for greentext
            custom_config = r'--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.,!?;:()[]{}"\'-/\n >'

            # Extract text
            text = pytesseract.image_to_string(thresh, config=custom_config)

            # Get confidence scores
            try:
                data = pytesseract.image_to_data(thresh, output_type=pytesseract.Output.DICT)
                confidences = [int(conf) for conf in data['conf'] if int(conf) > 0]
                avg_confidence = np.mean(confidences) if confidences else 0
            except:
                avg_confidence = 0

            # Clean extracted text
            cleaned_text = self.clean_extracted_text(text)

            return {
                "text": cleaned_text,
                "raw_text": text,
                "confidence": avg_confidence,
                "text_length": len(cleaned_text),
                "error": None
            }

        except Exception as e:
            return {"text": "", "confidence": 0, "error": str(e)}

    def clean_extracted_text(self, text: str) -> str:
        """Clean and normalize extracted text from greentext images"""
        if not text:
            return ""

        # Remove extra whitespace and newlines
        text = re.sub(r'\s+', ' ', text)

        # Remove common OCR artifacts
        text = re.sub(r'[^\w\s.,!?;:()\[\]{}"\'>/\-]', '', text)

        # Fix common greentext patterns
        text = re.sub(r'>\s*([a-zA-Z])', r'> \1', text)  # Fix greentext arrows
        text = re.sub(r'be\s+me', 'be me', text, flags=re.IGNORECASE)  # Fix common phrase

        # Remove lines that are too short (likely OCR errors)
        lines = text.split('\n')
        meaningful_lines = [line.strip() for line in lines if len(line.strip()) > 3]

        return '\n'.join(meaningful_lines).strip()

    def create_text_embeddings(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """Create CLIP text embeddings for the extracted text"""
        if not self.clip_model:
            print("‚ùå CLIP model not available")
            return np.array([])

        all_embeddings = []

        print("üîÑ Creating text embeddings...")
        for i in tqdm(range(0, len(texts), batch_size)):
            batch_texts = texts[i:i+batch_size]

            # Tokenize and encode
            try:
                text_tokens = clip.tokenize(batch_texts, truncate=True).to(self.device)

                with torch.no_grad():
                    text_embeddings = self.clip_model.encode_text(text_tokens)
                    text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)

                all_embeddings.append(text_embeddings.cpu().numpy())

            except Exception as e:
                print(f"‚ùå Error processing batch {i//batch_size}: {e}")
                # Add zero embeddings for failed batch
                zero_embeddings = np.zeros((len(batch_texts), 512))
                all_embeddings.append(zero_embeddings)

        return np.vstack(all_embeddings)

    def create_image_embeddings(self, image_paths: List[str], batch_size: int = 16) -> np.ndarray:
        """Create CLIP image embeddings for the greentext images"""
        if not self.clip_model:
            print("‚ùå CLIP model not available")
            return np.array([])

        all_embeddings = []

        print("üîÑ Creating image embeddings...")
        for i in tqdm(range(0, len(image_paths), batch_size)):
            batch_paths = image_paths[i:i+batch_size]
            batch_images = []

            # Load and preprocess images
            for path in batch_paths:
                try:
                    image = Image.open(path).convert('RGB')
                    processed_image = self.clip_preprocess(image)
                    batch_images.append(processed_image)
                except Exception as e:
                    print(f"‚ùå Error loading image {path}: {e}")
                    # Add a blank image
                    blank_image = Image.new('RGB', (224, 224), color='white')
                    processed_image = self.clip_preprocess(blank_image)
                    batch_images.append(processed_image)

            # Create embeddings
            try:
                image_batch = torch.stack(batch_images).to(self.device)

                with torch.no_grad():
                    image_embeddings = self.clip_model.encode_image(image_batch)
                    image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)

                all_embeddings.append(image_embeddings.cpu().numpy())

            except Exception as e:
                print(f"‚ùå Error processing image batch {i//batch_size}: {e}")
                # Add zero embeddings for failed batch
                zero_embeddings = np.zeros((len(batch_paths), 512))
                all_embeddings.append(zero_embeddings)

        return np.vstack(all_embeddings)

    def enhance_titles_with_context(self, df: pd.DataFrame) -> pd.DataFrame:
        """Enhance titles with additional context for better search"""
        df = df.copy()

        # Create enhanced descriptions
        enhanced_descriptions = []

        for _, row in df.iterrows():
            description_parts = []

            # Add title
            if pd.notna(row['title']):
                description_parts.append(row['title'])

            # Add score context
            if row['score'] > 1000:
                description_parts.append("popular greentext")
            elif row['score'] > 500:
                description_parts.append("well-liked greentext")

            # Add engagement context
            if row['engagement_ratio'] > 0.1:
                description_parts.append("highly discussed")

            # Add extracted text if available
            if 'extracted_text' in row and pd.notna(row['extracted_text']) and row['extracted_text']:
                # Add first few words of extracted text
                text_preview = row['extracted_text'][:200] + "..." if len(row['extracted_text']) > 200 else row['extracted_text']
                description_parts.append(text_preview)

            enhanced_descriptions.append(" | ".join(description_parts))

        df['enhanced_description'] = enhanced_descriptions
        return df

    def create_search_index(self, df: pd.DataFrame, text_embeddings: np.ndarray,
                          image_embeddings: np.ndarray) -> Dict:
        """Create a search index for fast similarity search"""

        # Combine text and image embeddings (weighted average)
        text_weight = 0.3
        image_weight = 0.7

        combined_embeddings = (text_weight * text_embeddings +
                             image_weight * image_embeddings)

        # Normalize combined embeddings
        combined_embeddings = combined_embeddings / np.linalg.norm(combined_embeddings, axis=1, keepdims=True)

        search_index = {
            'embeddings': combined_embeddings,
            'text_embeddings': text_embeddings,
            'image_embeddings': image_embeddings,
            'metadata': df[['post_id', 'title', 'score', 'filename', 'enhanced_description']].to_dict('records'),
            'text_weight': text_weight,
            'image_weight': image_weight
        }

        return search_index

    def run_complete_preprocessing(self):
        """Run the complete preprocessing pipeline"""
        print("üöÄ STARTING GREENTEXT CLIP PREPROCESSING PIPELINE")
        print("=" * 60)

        # Step 1: Analyze dataset quality
        valid_df = self.analyze_dataset_quality()
        if valid_df is None or len(valid_df) == 0:
            print("‚ùå No valid data to process")
            return None

        # Make a copy to avoid modifying original
        valid_df = valid_df.copy()

        # Step 2: Extract text from images
        print("\nüîç EXTRACTING TEXT FROM IMAGES")
        print("-" * 40)

        text_data = []
        image_paths = []

        for _, row in tqdm(valid_df.iterrows(), total=len(valid_df), desc="Processing images"):
            image_path = self.images_path / row['filename']

            if image_path.exists():
                # Extract text
                text_result = self.extract_text_from_image(image_path)
                text_data.append(text_result)
                image_paths.append(str(image_path))
            else:
                print(f"‚ö†Ô∏è Image not found: {image_path}")
                text_data.append({"text": "", "confidence": 0, "error": "File not found"})
                image_paths.append("")

        # FIXED: Add extracted text columns properly
        text_df = pd.DataFrame(text_data)
        for col in text_df.columns:
            valid_df[f'extracted_{col}'] = text_df[col].values

        # Step 3: Enhance descriptions
        print("\nüìù ENHANCING DESCRIPTIONS")
        print("-" * 40)
        valid_df = self.enhance_titles_with_context(valid_df)

        # Step 4: Create embeddings
        print("\nüß† CREATING EMBEDDINGS")
        print("-" * 40)

        # Prepare texts for embedding
        texts_for_embedding = valid_df['enhanced_description'].fillna('').tolist()
        valid_image_paths = [path for path in image_paths if path and os.path.exists(path)]

        # Create text embeddings
        text_embeddings = self.create_text_embeddings(texts_for_embedding)

        # Create image embeddings
        image_embeddings = self.create_image_embeddings(valid_image_paths)

        # Step 5: Create search index
        print("\nüîç CREATING SEARCH INDEX")
        print("-" * 40)

        search_index = self.create_search_index(valid_df, text_embeddings, image_embeddings)

        # Step 6: Save processed data
        print("\nüíæ SAVING PROCESSED DATA")
        print("-" * 40)

        # Save processed dataframe
        processed_csv_path = self.processed_path / "processed_greentext_data.csv"
        valid_df.to_csv(processed_csv_path, index=False)
        print(f"‚úÖ Saved processed data: {processed_csv_path}")

        # Save embeddings
        embeddings_path = self.processed_path / "embeddings.npz"
        np.savez_compressed(embeddings_path,
                          text_embeddings=text_embeddings,
                          image_embeddings=image_embeddings,
                          combined_embeddings=search_index['embeddings'])
        print(f"‚úÖ Saved embeddings: {embeddings_path}")

        # Save search index
        index_path = self.processed_path / "search_index.pkl"
        with open(index_path, 'wb') as f:
            pickle.dump(search_index, f)
        print(f"‚úÖ Saved search index: {index_path}")

        # Create train/test split for ML training
        if len(valid_df) >= 10:
            train_df, test_df = train_test_split(valid_df, test_size=0.2, random_state=42)

            train_path = self.processed_path / "train_data.csv"
            test_path = self.processed_path / "test_data.csv"

            train_df.to_csv(train_path, index=False)
            test_df.to_csv(test_path, index=False)

            print(f"‚úÖ Saved train set: {train_path} ({len(train_df)} samples)")
            print(f"‚úÖ Saved test set: {test_path} ({len(test_df)} samples)")

        # Step 7: Generate summary
        print("\nüìä PREPROCESSING SUMMARY")
        print("=" * 50)
        print(f"‚úÖ Total images processed: {len(valid_df)}")
        print(f"üî§ Text extracted from: {len([d for d in text_data if d['text']])}")
        print(f"üß† Text embeddings created: {text_embeddings.shape}")
        print(f"üñºÔ∏è Image embeddings created: {image_embeddings.shape}")
        print(f"üîç Search index ready with {len(search_index['metadata'])} items")
        print(f"üíæ All data saved to: {self.processed_path}")

        return {
            'processed_df': valid_df,
            'text_embeddings': text_embeddings,
            'image_embeddings': image_embeddings,
            'search_index': search_index,
            'processed_path': self.processed_path
        }

# ============== USAGE EXAMPLE ==============
def run_preprocessing_pipeline(dataset_csv_path: str, images_directory: str):
    """
    Run the complete preprocessing pipeline

    Args:
        dataset_csv_path: Path to your scraped dataset CSV
        images_directory: Path to directory containing downloaded images
    """

    print("üî• GREENTEXT CLIP PREPROCESSING")
    print("=" * 50)

    try:
        # Initialize preprocessor
        preprocessor = GreentextCLIPPreprocessor(dataset_csv_path, images_directory)

        # Run complete pipeline
        results = preprocessor.run_complete_preprocessing()

        if results:
            print("\nüéâ PREPROCESSING COMPLETED SUCCESSFULLY!")
            print("üöÄ Your data is now ready for CLIP-based search!")
            print("\nüìã Next steps:")
            print("1. Use the search_index.pkl for building your search engine")
            print("2. Use processed_greentext_data.csv for additional ML tasks")
            print("3. Use train_data.csv and test_data.csv for model training")

            return results
        else:
            print("‚ùå Preprocessing failed")
            return None

    except Exception as e:
        print(f"‚ùå Error during preprocessing: {e}")
        import traceback
        traceback.print_exc()
        return None

# ============== CONFIGURATION ==============
# Update these paths to match your scraped data
DATASET_CSV_PATH = "/content/drive/MyDrive/greentext_ml_dataset/metadata/greentext_complete.csv"  # Path to your scraped CSV
IMAGES_DIRECTORY = "/content/drive/MyDrive/greentext_ml_dataset/images"  # Path to your images folder

# ============== RUN PIPELINE ==============
if __name__ == "__main__":
    print("üöÄ Starting CLIP preprocessing pipeline...")
    results = run_preprocessing_pipeline(DATASET_CSV_PATH, IMAGES_DIRECTORY)

    if results:
        print(f"\n‚ú® Preprocessing completed! Check {results['processed_path']} for output files.")
    else:
        print("\n‚ùå Preprocessing failed. Please check the error messages above.")

üì¶ Installing required packages...
‚úÖ CLIP installed successfully!
üöÄ Starting CLIP preprocessing pipeline...
üî• GREENTEXT CLIP PREPROCESSING
üìä Loaded dataset with 898 records
üñ•Ô∏è Using device: cuda
‚úÖ CLIP model loaded successfully
üöÄ STARTING GREENTEXT CLIP PREPROCESSING PIPELINE

üìä DATASET QUALITY ANALYSIS
‚úÖ Successfully downloaded images: 898
‚ùå Failed downloads: 0

üìà Score Statistics:
   Average score: 7089.0
   Median score: 7447.0
   Score range: 52 - 49570

üñºÔ∏è Image Dimensions:
   Average width: 861px
   Average height: 935px
   Width range: 161 - 4640
   Height range: 81 - 4586

üìù Title Analysis:
   Average title length: 22.9 characters
   Title length range: 2 - 84

üîç EXTRACTING TEXT FROM IMAGES
----------------------------------------


Processing images:   0%|          | 0/898 [00:00<?, ?it/s]


üìù ENHANCING DESCRIPTIONS
----------------------------------------

üß† CREATING EMBEDDINGS
----------------------------------------
üîÑ Creating text embeddings...


  0%|          | 0/29 [00:00<?, ?it/s]

üîÑ Creating image embeddings...


  0%|          | 0/57 [00:00<?, ?it/s]


üîç CREATING SEARCH INDEX
----------------------------------------

üíæ SAVING PROCESSED DATA
----------------------------------------
‚úÖ Saved processed data: /content/drive/MyDrive/greentext_ml_dataset/metadata/processed/processed_greentext_data.csv
‚úÖ Saved embeddings: /content/drive/MyDrive/greentext_ml_dataset/metadata/processed/embeddings.npz
‚úÖ Saved search index: /content/drive/MyDrive/greentext_ml_dataset/metadata/processed/search_index.pkl
‚úÖ Saved train set: /content/drive/MyDrive/greentext_ml_dataset/metadata/processed/train_data.csv (718 samples)
‚úÖ Saved test set: /content/drive/MyDrive/greentext_ml_dataset/metadata/processed/test_data.csv (180 samples)

üìä PREPROCESSING SUMMARY
‚úÖ Total images processed: 898
üî§ Text extracted from: 0
üß† Text embeddings created: (898, 512)
üñºÔ∏è Image embeddings created: (898, 512)
üîç Search index ready with 898 items
üíæ All data saved to: /content/drive/MyDrive/greentext_ml_dataset/metadata/processed

üéâ PREPROCESS