In [5]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, applications
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Pin the TensorFlow and NumPy global RNG seeds to the same fixed value so
# repeated runs draw the same random numbers.
tf.random.set_seed(42)
np.random.seed(42)

def analyze_competition_structure():
    """Print a summary of the competition input files and list the test images.

    Inspects ``sample_submission.csv``, ``test_ids.csv`` and the ``test``
    folder under the hard-coded Kaggle input directory, printing row counts,
    column names and a short preview for whatever is present.

    Returns:
        The file names found directly inside the test directory, or an empty
        list when that directory does not exist.
    """
    print("ANALYZING COMPETITION STRUCTURE")
    print("=" * 60)

    root = Path("/kaggle/input/soil-classification/soil_competition-2025")
    submission_csv = root / "sample_submission.csv"
    ids_csv = root / "test_ids.csv"
    images_dir = root / "test"

    # Sample submission: shows the expected output columns
    print("Sample submission analysis:")
    if submission_csv.exists():
        sample = pd.read_csv(submission_csv)
        print(f"  Sample submission entries: {len(sample)}")
        print(f"  Columns: {list(sample.columns)}")
        print("  First few entries:")
        print(sample.head())

    # Optional list of test ids
    print("\nTest IDs file analysis:")
    if not ids_csv.exists():
        print("  No test_ids.csv file found")
    else:
        ids = pd.read_csv(ids_csv)
        print(f"  Test IDs file entries: {len(ids)}")
        print(f"  Columns: {list(ids.columns)}")
        print("  First few entries:")
        print(ids.head())

    # Files physically present in the test folder
    print("\nTest directory analysis:")
    if not images_dir.exists():
        print("  Test directory not found")
        return []

    names = [entry.name for entry in images_dir.glob("*") if entry.is_file()]
    print(f"  Actual test files found: {len(names)}")
    print("  First 10 test files:")
    for position, name in enumerate(names[:10], start=1):
        print(f"    {position}. {name}")

    return names

# Inspect the competition inputs up front, before the pipeline class is defined,
# and keep the discovered test file names (an empty list when the test
# directory does not exist).
test_files = analyze_competition_structure()

class KaggleSoilClassificationPipeline:
    """Train an EfficientNetB0 soil / non-soil classifier and write a submission.

    The pipeline reads ``train_labels.csv`` plus the ``train`` and ``test``
    image folders under ``base_dir``, trains a binary classifier on top of a
    frozen ImageNet backbone, picks the decision threshold that maximises F1
    on a validation split and writes the test predictions to
    ``/kaggle/working``.
    """

    # Column layout of the submission file.
    _SUBMISSION_COLUMNS = ['image_id', 'label']

    def __init__(self, base_dir="/kaggle/input/soil-classification/soil_competition-2025", 
                 img_size=(384, 384), batch_size=32):
        """
        Initialize the soil classification pipeline for Kaggle environment

        Args:
            base_dir: Directory holding ``train``, ``test`` and the CSV files.
            img_size: ``(height, width)`` every image is resized to.
            batch_size: Batch size used by all data generators.
        """
        self.base_dir = Path(base_dir)
        self.img_size = img_size
        self.batch_size = batch_size

        # Kaggle paths
        self.train_dir = self.base_dir / "train"
        self.test_dir = self.base_dir / "test"
        self.train_labels_path = self.base_dir / "train_labels.csv"
        self.sample_submission_path = self.base_dir / "sample_submission.csv"
        self.test_ids_path = self.base_dir / "test_ids.csv"

        # Output directory for Kaggle
        self.output_dir = Path("/kaggle/working")

        # Model storage
        self.models = {}
        self.class_weights = None

        # Setup GPU for Kaggle
        self.setup_kaggle_gpu()

    @staticmethod
    def _print_class_distribution(labels, noun, indent):
        """Print one line per class with its count and share of ``labels``."""
        total = len(labels)
        for label, count in labels.value_counts().sort_index().items():
            percentage = (count / total) * 100
            class_name = "Non-Soil" if label == 0 else "Soil"
            print(f"{indent}Class {label} ({class_name}): {count} {noun} ({percentage:.1f}%)")

    def setup_kaggle_gpu(self):
        """Enable GPU memory growth and mixed precision when a GPU is visible.

        Without a GPU only a notice is printed. A ``RuntimeError`` raised by
        TensorFlow during configuration is reported and swallowed so the
        pipeline can still run with default settings.
        """
        print("\nSetting up Kaggle GPU configuration...")

        gpus = tf.config.list_physical_devices('GPU')
        if not gpus:
            print("No GPU found, using CPU (will be much slower)")
            return

        try:
            # Enable memory growth
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)

            # Enable mixed precision for faster training on Kaggle
            policy = tf.keras.mixed_precision.Policy('mixed_float16')
            tf.keras.mixed_precision.set_global_policy(policy)

            print(f"Kaggle GPU setup complete. Found {len(gpus)} GPU(s)")
            print("Mixed precision enabled for faster training")
        except RuntimeError as e:
            print(f"GPU setup error: {e}")

    def load_and_analyze_data(self):
        """Load the training labels, keep rows whose image exists, and report stats.

        Also sets ``self.class_weights`` to balanced per-class weights when at
        least two classes are present (``None`` otherwise).

        Returns:
            DataFrame with the original columns plus ``label_str`` (label as
            string, needed by ``flow_from_dataframe``) and ``full_path``.

        Raises:
            ValueError: If none of the listed training images exists on disk.
        """
        print("\nLoading and analyzing dataset...")

        # Load labels
        train_df = pd.read_csv(self.train_labels_path)
        print(f"Loaded {len(train_df)} training labels")

        # Convert labels to strings for ImageDataGenerator
        train_df['label_str'] = train_df['label'].astype(str)

        # Create full paths and keep only the files that actually exist
        train_df['full_path'] = train_df['image_id'].apply(
            lambda x: str(self.train_dir / x)
        )
        existing_mask = train_df['full_path'].apply(os.path.exists)
        train_df = train_df[existing_mask].reset_index(drop=True)

        if train_df.empty:
            raise ValueError(f"No training images found under {self.train_dir}")

        print(f"Dataset Statistics:")
        print(f"  Total training images found: {len(train_df)}")
        self._print_class_distribution(train_df['label'], "images", "    ")

        # Check if we have both classes
        if train_df['label'].nunique() == 1:
            print("WARNING: Only one class found in training data!")
            self.handle_single_class_dataset(train_df)

        # Re-read the labels here: the single-class handler above may have
        # relabelled rows, so a set computed earlier would be stale.
        present_classes = np.unique(train_df['label'])
        if len(present_classes) > 1:
            weights = compute_class_weight(
                'balanced',
                classes=present_classes,
                y=train_df['label']
            )
            # Plain Python keys/values instead of NumPy scalars
            self.class_weights = dict(zip(present_classes.tolist(), weights.tolist()))
        else:
            self.class_weights = None

        return train_df

    def handle_single_class_dataset(self, train_df):
        """Relabel a random quarter (at most 200) of the rows as class 0, in place.

        NOTE(review): the relabelled rows are real images of the single
        available class, so the "negative" examples are label noise; this only
        lets training proceed and cannot teach a meaningful boundary.

        Args:
            train_df: Training frame with ``label`` and ``label_str`` columns;
                modified in place.
        """
        print("\nHandling single-class dataset...")

        n_samples = len(train_df)
        n_negative = min(200, n_samples // 4)  # 25% as negative examples

        # Randomly select row positions to be negative
        negative_positions = np.random.choice(n_samples, n_negative, replace=False)

        # Assign by position so the result does not depend on the frame's index
        label_col = train_df.columns.get_loc('label')
        label_str_col = train_df.columns.get_loc('label_str')
        train_df.iloc[negative_positions, label_col] = 0
        train_df.iloc[negative_positions, label_str_col] = '0'

        print(f"  Created {n_negative} artificial negative examples")

        # Updated class distribution
        self._print_class_distribution(train_df['label'], "images", "    ")

    def create_data_generators(self, train_df, validation_split=0.2):
        """Split the data (stratified) and build training / validation generators.

        Args:
            train_df: Frame returned by ``load_and_analyze_data``.
            validation_split: Fraction of rows held out for validation.

        Returns:
            ``(train_generator, val_generator)``; only the training generator
            applies augmentation and shuffling.
        """
        print(f"\nCreating data generators...")

        # Split data stratified by class
        train_split, val_split = train_test_split(
            train_df, 
            test_size=validation_split, 
            stratify=train_df['label'],
            random_state=42
        )

        print(f"  Training samples: {len(train_split)}")
        print(f"  Validation samples: {len(val_split)}")

        # No `rescale` here: Keras EfficientNet models include their own input
        # rescaling and expect raw pixel values in the [0, 255] range.
        train_datagen = ImageDataGenerator(
            rotation_range=30,
            width_shift_range=0.2,
            height_shift_range=0.2,
            shear_range=0.2,
            zoom_range=0.2,
            horizontal_flip=True,
            vertical_flip=True,
            brightness_range=[0.8, 1.2],
            fill_mode='nearest'
        )

        # Validation data generator (no augmentation, no rescaling)
        val_datagen = ImageDataGenerator()

        # Create generators using string labels
        train_generator = train_datagen.flow_from_dataframe(
            train_split,
            x_col='full_path',
            y_col='label_str',
            target_size=self.img_size,
            batch_size=self.batch_size,
            class_mode='binary',
            shuffle=True,
            seed=42
        )

        val_generator = val_datagen.flow_from_dataframe(
            val_split,
            x_col='full_path',
            y_col='label_str',
            target_size=self.img_size,
            batch_size=self.batch_size,
            class_mode='binary',
            shuffle=False,
            seed=42
        )

        return train_generator, val_generator

    def create_efficientnet_model(self, version='B0'):
        """Create EfficientNet model with custom head

        Args:
            version: EfficientNet variant; only ``'B0'`` is supported.

        Returns:
            An uncompiled ``Sequential`` model: frozen ImageNet backbone,
            global average pooling and a dense head ending in one sigmoid unit.

        Raises:
            ValueError: For any ``version`` other than ``'B0'``.
        """
        print(f"\nBuilding EfficientNet{version} model...")

        if version != 'B0':
            raise ValueError(f"Unsupported EfficientNet version: {version}")

        base_model = applications.EfficientNetB0(
            weights='imagenet',
            include_top=False,
            input_shape=(*self.img_size, 3)
        )

        # Freeze base model initially
        base_model.trainable = False

        model = models.Sequential([
            base_model,
            layers.GlobalAveragePooling2D(),
            layers.BatchNormalization(),
            layers.Dropout(0.4),
            layers.Dense(256, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(0.3),
            layers.Dense(128, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(0.2),
            # Output layer is pinned to float32 regardless of the global dtype policy
            layers.Dense(1, activation='sigmoid', dtype='float32')
        ])

        print(f"EfficientNet{version} model created successfully")
        print(f"  Total parameters: {model.count_params():,}")

        return model

    def compile_model(self, model, learning_rate=0.001):
        """Compile ``model`` with AdamW, binary cross-entropy, accuracy and AUC.

        Args:
            model: Keras model to compile.
            learning_rate: Initial learning rate for the optimizer.

        Returns:
            The same model instance, compiled.
        """
        optimizer = tf.keras.optimizers.AdamW(
            learning_rate=learning_rate,
            weight_decay=0.0001
        )

        model.compile(
            optimizer=optimizer,
            loss='binary_crossentropy',
            metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
        )

        return model

    def train_model(self, model, train_generator, val_generator, epochs=12):
        """Train model with early stopping

        Uses ``self.class_weights`` as the per-class loss weights, stops early
        on stalled ``val_loss`` (restoring the best weights) and halves the
        learning rate on plateaus.

        Returns:
            The Keras ``History`` object from ``model.fit``.
        """
        print(f"\nTraining model...")

        callbacks = [
            tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=4,
                restore_best_weights=True,
                verbose=1
            ),
            tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=2,
                min_lr=1e-7,
                verbose=1
            )
        ]

        history = model.fit(
            train_generator,
            epochs=epochs,
            validation_data=val_generator,
            callbacks=callbacks,
            class_weight=self.class_weights,
            verbose=1
        )

        return history

    def evaluate_model(self, model, val_generator):
        """Find the probability threshold that maximises F1 on the validation set.

        Args:
            model: Trained Keras model.
            val_generator: Unshuffled validation generator exposing ``classes``.

        Returns:
            ``(best_threshold, best_f1)``.
        """
        print("\nEvaluating model...")

        # Get predictions as a flat vector of probabilities
        val_generator.reset()
        probabilities = np.asarray(model.predict(val_generator, verbose=1)).ravel()
        true_labels = np.asarray(val_generator.classes)[:len(probabilities)]

        # Test different thresholds for best F1 score
        thresholds = np.arange(0.1, 0.9, 0.05)
        f1_scores = [
            f1_score(true_labels, (probabilities > threshold).astype(int), zero_division=1)
            for threshold in thresholds
        ]

        # Find best threshold (first one on ties)
        best_idx = int(np.argmax(f1_scores))
        best_threshold = thresholds[best_idx]
        best_f1 = f1_scores[best_idx]

        print(f"  Best threshold: {best_threshold:.3f}")
        print(f"  Best F1 score: {best_f1:.4f}")

        # Final predictions with best threshold
        predicted_classes = (probabilities > best_threshold).astype(int)
        accuracy = np.mean(predicted_classes == true_labels)

        print(f"  Accuracy: {accuracy:.4f}")

        return best_threshold, best_f1

    def create_complete_test_predictions(self, model, threshold=0.5):
        """Predict a label for every usable image in the test directory.

        Args:
            model: Trained Keras model.
            threshold: Probabilities strictly above this value become label 1.

        Returns:
            DataFrame with ``image_id`` and ``label`` columns; it has no rows
            when the test directory contains no usable images.
        """
        print(f"\nGenerating predictions for ALL test images...")

        # Get ALL test files from the test directory, sorted for consistent ordering
        test_files = sorted(f.name for f in self.test_dir.glob("*") if f.is_file())

        print(f"Found {len(test_files)} test images")

        if not test_files:
            print("No test files found!")
            return pd.DataFrame(columns=self._SUBMISSION_COLUMNS)

        # Create dataframe for ALL test files
        test_df = pd.DataFrame({'image_id': test_files})
        test_df['full_path'] = test_df['image_id'].apply(lambda x: str(self.test_dir / x))

        print(f"Processing {len(test_df)} test images...")

        # Same preprocessing as validation: raw [0, 255] pixels, because the
        # EfficientNet backbone rescales its inputs internally.
        test_datagen = ImageDataGenerator()
        test_generator = test_datagen.flow_from_dataframe(
            test_df,
            x_col='full_path',
            y_col=None,
            target_size=self.img_size,
            batch_size=self.batch_size,
            class_mode=None,
            shuffle=False  # Important: keep order for correct mapping
        )

        # flow_from_dataframe drops rows it does not accept as image files, so
        # take the ids from what the generator kept; this guarantees that ids
        # and predictions have the same length and order.
        kept_paths = list(test_generator.filenames)
        skipped = len(test_df) - len(kept_paths)
        if skipped:
            print(f"WARNING: {skipped} file(s) were skipped as invalid images")
        if not kept_paths:
            print("No test files found!")
            return pd.DataFrame(columns=self._SUBMISSION_COLUMNS)

        # Generate predictions
        print("Generating predictions...")
        predictions = model.predict(test_generator, verbose=1)
        predicted_labels = (np.asarray(predictions).ravel() > threshold).astype(int)

        # Create submission dataframe
        submission_df = pd.DataFrame({
            'image_id': [Path(p).name for p in kept_paths],
            'label': predicted_labels
        })

        print(f"Generated predictions for {len(submission_df)} images")

        # Display statistics
        self._print_class_distribution(submission_df['label'], "predictions", "  ")

        return submission_df

    def save_submission(self, submission_df, filename):
        """Write ``submission_df`` as CSV (no index) to ``output_dir / filename``.

        The output directory is created when it does not exist yet.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)
        submission_path = self.output_dir / filename
        submission_df.to_csv(submission_path, index=False)
        print(f"Submission saved to: {submission_path}")
        print(f"Total entries: {len(submission_df)}")

    def run_pipeline(self):
        """Run the complete training pipeline

        Loads the data, trains and evaluates the model, predicts the test set
        and saves the submission twice (timestamped name and ``submission.csv``).

        Returns:
            ``(model, submission)``: the trained model and the submission frame.
        """
        import datetime

        print("KAGGLE SOIL CLASSIFICATION PIPELINE")
        print("=" * 60)

        # 1. Load and analyze data
        train_df = self.load_and_analyze_data()

        # 2. Create data generators
        train_gen, val_gen = self.create_data_generators(train_df)

        # 3. Create and train model
        model = self.create_efficientnet_model('B0')
        model = self.compile_model(model)
        self.train_model(model, train_gen, val_gen, epochs=12)

        # 4. Evaluate model
        best_threshold, best_f1 = self.evaluate_model(model, val_gen)

        # 5. Generate predictions for ALL test images
        submission = self.create_complete_test_predictions(model, threshold=best_threshold)

        # 6. Save submission with unique filename
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"soil_classification_submission_{timestamp}.csv"

        self.save_submission(submission, filename)

        # Also save as simple submission.csv for convenience
        self.save_submission(submission, "submission.csv")

        print(f"\nPipeline complete!")
        print(f"F1 Score: {best_f1:.4f}")
        print(f"Optimal threshold: {best_threshold:.3f}")
        print(f"Submission ready with {len(submission)} entries")
        print(f"Files saved: {filename} and submission.csv")

        return model, submission

def main():
    """Run the full pipeline and print a head/tail preview of the submission."""
    _model, submission = KaggleSoilClassificationPipeline().run_pipeline()

    print("\nSuccess! Check your submission files.")

    # Preview: first ten rows, an ellipsis separator, then the last five rows
    total = len(submission)
    print(f"\nSubmission preview (showing first 10 and last 5 of {total} total):")
    for part in (submission.head(10), "...", submission.tail(5)):
        print(part)


if __name__ == "__main__":
    main()

ANALYZING COMPETITION STRUCTURE
Sample submission analysis:
  Sample submission entries: 4
  Columns: ['image_id', 'label']
  First few entries:
                               image_id  label
0  6595f1266325552489c7d1635fafb88f.jpg      0
1  4b614841803d5448b59e2c6ca74ea664.jpg      1
2  ca30e008692a50638b43d944f46245c8.jpg      0
3  e432d7988d125c8497d41b7ff223b187.jpg      1

Test IDs file analysis:
  Test IDs file entries: 967
  Columns: ['image_id']
  First few entries:
                               image_id
0  6595f1266325552489c7d1635fafb88f.jpg
1  4b614841803d5448b59e2c6ca74ea664.jpg
2  ca30e008692a50638b43d944f46245c8.jpg
3  6a9046a219425f7599729be627df1c1a.jpg
4  97c1e0276d2d5c2f88dddbc87357611e.jpg

Test directory analysis:
  Actual test files found: 967
  First 10 test files:
    1. 465084323936570da664f0ca8dc90326.jpg
    2. 1aa0b12029d35e778dba5bff1255c638.jpg
    3. 6df2c3dcd4fb59298c7a73467ea72eeb.jpg
    4. 107f25ebd87f581ea57c630a2dcdf50c.jpg
    5. dc35d58782615e4f95