In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os
import cv2
from PIL import Image
import warnings
import time
from typing import Dict, List, Tuple, Optional, Union
import glob
import json
import logging
from datetime import datetime
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from io import BytesIO, StringIO

# ML imports
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, 
                             ExtraTreesClassifier, AdaBoostClassifier, VotingClassifier)
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (accuracy_score, classification_report, roc_auc_score, 
                           roc_curve, auc, confusion_matrix, precision_recall_curve,
                           f1_score, precision_score, recall_score)
from sklearn.feature_selection import SelectKBest, f_classif, RFE, VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy.spatial.distance import cdist

# Optional imports
try:
    import umap
    UMAP_AVAILABLE = True
    print("✅ UMAP available")
except ImportError:
    UMAP_AVAILABLE = False
    print("⚠️ UMAP not available")

try:
    import tensorflow as tf
    from tensorflow.keras import layers, models, callbacks
    from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
    from tensorflow.keras.applications import EfficientNetB0
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
    TF_AVAILABLE = True
    print("✅ TensorFlow available")
except ImportError:
    TF_AVAILABLE = False
    print("⚠️ TensorFlow not available")

# Silence every Python warning for the rest of the session. This is a blanket
# filter: deprecation and numerical warnings from any library are hidden too.
warnings.filterwarnings('ignore')
# Root logger at INFO with "timestamp - level - message" formatting.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Global plotting defaults: matplotlib style sheet plus the seaborn "husl" palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Printed unconditionally, i.e. also when the optional UMAP / TensorFlow
# imports above fell back to their *_AVAILABLE = False branch.
print("📦 All libraries imported successfully!")

  from .autonotebook import tqdm as notebook_tqdm
2025-08-21 22:34:02.465305: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


✅ UMAP available
✅ TensorFlow available
📦 All libraries imported successfully!


In [5]:
class Config:
    """Single place for every tunable setting used by the notebook.

    Attributes are plain class-level constants; the class is never
    instantiated, callers read them as ``Config.NAME``.
    """

    # ------------------------------------------------------------------
    # Locations
    # ------------------------------------------------------------------
    # Root folder that is searched for input data (relative to the
    # working directory).
    BASE_PATH = Path('./pipeline')
    # Root folder for everything the notebook writes.
    OUTPUT_BASE = Path('./output')
    OUTPUT_DIR = OUTPUT_BASE
    MODEL_DIR = OUTPUT_DIR / 'models'
    PLOTS_DIR = OUTPUT_DIR / 'plots'
    DATA_DIR = OUTPUT_DIR / 'processed_data'

    # ------------------------------------------------------------------
    # Dataset layout
    # ------------------------------------------------------------------
    # Hole-size labels; each one is also a sub-directory name of BASE_PATH.
    HOLE_SIZES = ['20mm', '25mm', '30mm', '40mm']
    # Names given to the first four columns of a parsed data file.
    EXPECTED_COLUMNS = [
        'nodenumber',
        'x-coordinate',
        'y-coordinate',
        'temperature',
    ]

    # ------------------------------------------------------------------
    # Images
    # ------------------------------------------------------------------
    IMG_HEIGHT = 224
    IMG_WIDTH = 224
    IMG_CHANNELS = 3
    SUPPORTED_IMG_FORMATS = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']

    # ------------------------------------------------------------------
    # Data splitting
    # ------------------------------------------------------------------
    TEST_SIZE = 0.2
    VAL_SIZE = 0.2
    RANDOM_STATE = 42

    # ------------------------------------------------------------------
    # Training loop
    # ------------------------------------------------------------------
    BATCH_SIZE = 32
    EPOCHS = 50  # kept low for faster training
    LEARNING_RATE = 0.001
    PATIENCE = 10

    # ------------------------------------------------------------------
    # Feature engineering
    # ------------------------------------------------------------------
    TEMPERATURE_THRESHOLD = 310  # Kelvin
    SPATIAL_ZONES = 20
    ROLLING_WINDOW = 5

# Create output directories
def setup_directories():
    """Make sure every output folder named in ``Config`` exists.

    Missing parent folders are created as well and folders that already
    exist are left alone, so the call can be repeated safely.

    Returns:
        ``True`` once all folders have been processed.
    """
    required = (
        Config.OUTPUT_DIR,
        Config.MODEL_DIR,
        Config.PLOTS_DIR,
        Config.DATA_DIR,
    )
    for target in required:
        target.mkdir(parents=True, exist_ok=True)
    return True

# Create the output folders at cell-execution time, then echo the configured
# output and input locations so the reader can see which paths are in use.
setup_directories()
print(f"📁 Output directories created in: {Config.OUTPUT_DIR}")
print(f"📊 Looking for data in: {Config.BASE_PATH}")

# %% [markdown]
# ## 📊 Enhanced Data Loader

# %%
class EnhancedDataLoader:
    """Discover, parse and label the per-hole-size data files.

    Every method is static; settings are read from the module-level
    ``Config`` class (``BASE_PATH``, ``HOLE_SIZES``, ``EXPECTED_COLUMNS``,
    ``SUPPORTED_IMG_FORMATS``, ``TEMPERATURE_THRESHOLD``).
    """

    # Rows whose temperature falls outside this open interval are discarded
    # by ``parse_data_file``. Presumably Kelvin, matching the unit noted on
    # Config.TEMPERATURE_THRESHOLD -- confirm against the data source.
    _VALID_TEMPERATURE_RANGE = (200, 1000)

    @staticmethod
    def discover_data_files():
        """Scan ``Config.BASE_PATH/<hole_size>`` for data and image files.

        A direct child file counts as a data file when it is not hidden and
        its lower-cased name contains the hole-size number, the word
        ``test`` or any digit. Images are looked up in the ``contours``
        sub-folder by extension, case-insensitively.

        Returns:
            Dict keyed by hole size. Each value is a dict with
            ``data_files`` and ``image_files`` (sorted lists of ``Path``)
            plus ``file_count`` and ``image_count``. Hole sizes without a
            directory keep empty lists and zero counts.
        """
        print("🔍 Discovering data files...")
        discovered_files = {}
        image_suffixes = {ext.lower() for ext in Config.SUPPORTED_IMG_FORMATS}

        for hole_size in Config.HOLE_SIZES:
            hole_path = Config.BASE_PATH / hole_size
            entry = {
                'data_files': [],
                'image_files': [],
                'file_count': 0,
                'image_count': 0
            }
            discovered_files[hole_size] = entry

            # is_dir() rather than exists(): a stray *file* with the
            # hole-size name would otherwise make iterdir() raise.
            if not hole_path.is_dir():
                continue

            print(f"  📂 Found directory: {hole_path}")
            size_token = hole_size.replace('mm', '')
            data_files = []

            # Sorted so the discovery order (and everything derived from it)
            # is reproducible; iterdir() order is filesystem dependent.
            for file in sorted(hole_path.iterdir()):
                if not file.is_file():
                    continue
                file_name = file.name.lower()

                # Skip hidden files
                if file_name.startswith('.'):
                    continue

                # Heuristic: does the name look like a data file?
                if (size_token in file_name or
                        'test' in file_name or
                        any(char.isdigit() for char in file_name)):
                    data_files.append(file)

            entry['data_files'] = data_files
            entry['file_count'] = len(data_files)
            print(f"    📄 Found {len(data_files)} data files")

            contours_path = hole_path / 'contours'
            if contours_path.is_dir():
                # One pass with a case-insensitive suffix test. Globbing
                # both '*.ext' and '*.EXT' lists every image twice on
                # case-insensitive filesystems.
                image_files = sorted(
                    candidate for candidate in contours_path.iterdir()
                    if candidate.is_file() and candidate.suffix.lower() in image_suffixes
                )
                entry['image_files'] = image_files
                entry['image_count'] = len(image_files)
                print(f"    🖼️ Found {len(image_files)} image files")

        total_files = sum(data['file_count'] for data in discovered_files.values())
        total_images = sum(data['image_count'] for data in discovered_files.values())
        print(f"\n✅ Discovery complete: {total_files} data files, {total_images} images")

        return discovered_files

    @staticmethod
    def parse_data_file(file_path: Path) -> pd.DataFrame:
        """Read one delimited text file into a cleaned DataFrame.

        Delimiters are tried in order (comma, whitespace, tab, sniffed) and
        the first read that yields at least ``len(Config.EXPECTED_COLUMNS)``
        columns and one row wins. The file is read without a header, so a
        header line becomes a data row and is removed later when its
        coordinate/temperature cells fail numeric conversion.

        Args:
            file_path: File to read.

        Returns:
            DataFrame whose leading columns are named after
            ``Config.EXPECTED_COLUMNS``; further columns are named
            ``feature_<position>``. Rows with missing or non-numeric
            coordinates/temperature, or with a temperature outside
            ``_VALID_TEMPERATURE_RANGE``, are dropped. An empty DataFrame
            is returned when the file cannot be parsed.
        """
        min_columns = len(Config.EXPECTED_COLUMNS)
        read_attempts = [
            {'sep': ','},
            {'sep': r'\s+'},
            {'sep': '\t'},
            {'sep': None, 'engine': 'python'},
        ]

        try:
            df = None
            for read_kwargs in read_attempts:
                try:
                    candidate = pd.read_csv(file_path, header=None, **read_kwargs)
                except Exception:
                    # This delimiter does not fit the file; try the next one.
                    continue
                # Only accept a read that produced enough columns; a
                # too-narrow result must not leak out as the final frame.
                if len(candidate.columns) >= min_columns and len(candidate) > 0:
                    df = candidate
                    break

            if df is None:
                return pd.DataFrame()

            # Name all columns in one step: expected names first, then
            # positional names for anything beyond them.
            extra_names = [f'feature_{i}' for i in range(min_columns, len(df.columns))]
            df.columns = list(Config.EXPECTED_COLUMNS) + extra_names

            # Data cleaning (copy so the assignments below never touch a view)
            df = df.dropna().copy()

            # Ensure numeric columns; unparsable cells become NaN and are dropped
            numeric_columns = ['x-coordinate', 'y-coordinate', 'temperature']
            for col in numeric_columns:
                if col in df.columns:
                    df[col] = pd.to_numeric(df[col], errors='coerce')

            df = df.dropna()

            # Validate temperature values
            if 'temperature' in df.columns:
                low, high = EnhancedDataLoader._VALID_TEMPERATURE_RANGE
                df = df[(df['temperature'] > low) & (df['temperature'] < high)]

            return df

        except Exception as e:
            print(f"⚠️ Could not parse {file_path.name}: {e}")
            return pd.DataFrame()

    @staticmethod
    def advanced_leak_classification(df: pd.DataFrame, hole_size: str) -> Dict:
        """Label a whole file as leak / no leak by majority vote of three rules.

        The rules compare the mean, maximum and standard deviation of the
        ``temperature`` column with thresholds scaled by a per-hole-size
        factor (1.0 for unknown sizes). A leak is reported when more than
        half of the rules fire.

        Args:
            df: Parsed data with a ``temperature`` column.
            hole_size: Hole-size label such as ``'20mm'``.

        Returns:
            Dict with ``leak_status`` (0 or 1), ``confidence`` (fraction of
            rules that fired), ``criteria_met`` (human-readable reasons) and
            ``criteria_count``. All values are built-in Python types. For an
            empty frame or a missing ``temperature`` column every field is
            zero/empty.
        """
        if df.empty or 'temperature' not in df.columns:
            return {
                'leak_status': 0,
                'confidence': 0.0,
                'criteria_met': [],
                'criteria_count': 0
            }

        temperature = df['temperature']
        temp_mean = temperature.mean()
        temp_max = temperature.max()
        # NaN for a single-row frame; the comparison below is then False.
        temp_std = temperature.std()

        # Hole size factors
        hole_factors = {'20mm': 1.0, '25mm': 1.2, '30mm': 1.4, '40mm': 1.6}
        factor = hole_factors.get(hole_size, 1.0)

        criteria = {}
        criteria_met = []

        # Criterion 1: Mean temperature
        temp_threshold = Config.TEMPERATURE_THRESHOLD * factor
        criteria['mean_temp'] = bool(temp_mean > temp_threshold)
        if criteria['mean_temp']:
            criteria_met.append(f"Mean temp ({temp_mean:.1f}K) > {temp_threshold:.1f}K")

        # Criterion 2: Maximum temperature
        max_threshold = (Config.TEMPERATURE_THRESHOLD + 30) * factor
        criteria['max_temp'] = bool(temp_max > max_threshold)
        if criteria['max_temp']:
            criteria_met.append(f"Max temp ({temp_max:.1f}K) > {max_threshold:.1f}K")

        # Criterion 3: Temperature variability
        std_threshold = 15 * factor
        criteria['temp_variance'] = bool(temp_std > std_threshold)
        if criteria['temp_variance']:
            criteria_met.append(f"Temp std ({temp_std:.1f}) > {std_threshold:.1f}")

        # Decision logic: strict majority. The criteria hold built-in bools,
        # so the count and ratio are plain int/float (JSON-serialisable).
        criteria_count = sum(criteria.values())
        total_criteria = len(criteria)

        is_leak = criteria_count >= (total_criteria // 2 + 1)
        confidence = criteria_count / total_criteria

        return {
            'leak_status': int(is_leak),
            'confidence': confidence,
            'criteria_met': criteria_met,
            'criteria_count': criteria_count
        }

📁 Output directories created in: output
📊 Looking for data in: pipeline


In [3]:
# NOTE: this cell needs `Config` and `EnhancedDataLoader` from the cell above;
# run the notebook top to bottom (Restart & Run All) before executing it.
data_loader = EnhancedDataLoader()

# Discover data files
discovered_files = data_loader.discover_data_files()

# Load all data
print("\n📊 Loading and processing data...")
all_dataframes = []
# Defined up front so `combined_df` exists on every path, including the
# "no files found" branch; it stays empty unless data is loaded below.
combined_df = pd.DataFrame()
# Number of files that were handed to the parser (successful or not).
file_count = 0

total_files = sum(data['file_count'] for data in discovered_files.values())

if total_files == 0:
    # Report the directory that was actually searched instead of a fixed path.
    print(f"❌ No data files found! Please ensure data is in '{Config.BASE_PATH}/' directory")
    print("Expected structure:")
    for hole_size in Config.HOLE_SIZES:
        print(f"{Config.BASE_PATH}/{hole_size}/")
else:
    for hole_size, file_data in discovered_files.items():
        if file_data['file_count'] == 0:
            continue

        print(f"  Processing {hole_size} files...")
        for file_path in file_data['data_files']:
            df = data_loader.parse_data_file(file_path)
            if not df.empty:
                # Classify leaks: one label per file, repeated on each of its rows.
                leak_info = data_loader.advanced_leak_classification(df, hole_size)
                # assign() builds a new frame, so the filtered frame returned
                # by the parser is never modified in place.
                df = df.assign(
                    leak_status=leak_info['leak_status'],
                    leak_confidence=leak_info['confidence'],
                    hole_size=hole_size,
                    file_name=file_path.name,
                )
                all_dataframes.append(df)

                print(f"    ✅ {file_path.name}: {len(df)} samples, Leak: {'Yes' if leak_info['leak_status'] else 'No'}")

            file_count += 1

    if all_dataframes:
        combined_df = pd.concat(all_dataframes, ignore_index=True)
        print(f"\n🎉 Successfully loaded {len(combined_df):,} total data points from {len(all_dataframes)} files")

        # Display summary statistics
        print("\n📈 Dataset Summary:")
        print(f"  Total samples: {len(combined_df):,}")
        print(f"  Features: {len(combined_df.columns)}")
        print(f"  Leak detections: {combined_df['leak_status'].sum():,}")
        print(f"  No leak: {(combined_df['leak_status'] == 0).sum():,}")
        print(f"  Leak rate: {combined_df['leak_status'].mean():.1%}")

        # Show data info
        print("\n📋 Data Info:")
        combined_df.info()

        # Show first few rows
        print("\n👀 First 5 rows:")
        display(combined_df.head())

    else:
        print("❌ No data could be loaded successfully!")


🔍 Discovering data files...


NameError: name 'Config' is not defined