# Hybrid Autocorrect Model Training

This notebook implements a comprehensive hybrid approach for training autocorrect models:

1. **Rule-based corrections** using fuzzy matching
2. **ML models** (sklearn) for pattern-based correction
3. **Transformer models** for contextual correction
4. **Synthetic dataset generation** for training data
5. **Model evaluation and comparison**
6. **Correction mapping storage** for real-time inference

## Data Sources:
- `vehicle_master_cleaned.csv` (correct reference data)
- Synthetic incorrect→correct pairs
- Existing user input corrections (`user_inputs_year_validated.csv`)

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import pickle
import json
import random
import string
import warnings
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Fuzzy matching and string similarity
from fuzzywuzzy import fuzz, process
from jellyfish import levenshtein_distance, jaro_winkler_similarity

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

# Deep Learning (optional - if transformers available)
try:
    from transformers import AutoTokenizer, AutoModel, pipeline
    import torch
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    print("Transformers not available. Will use sklearn models only.")

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides sklearn/pandas deprecation and convergence
# warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
# (both the stdlib generator and NumPy's global generator are seeded)
random.seed(42)
np.random.seed(42)

print("✅ Libraries imported successfully")

In [None]:
# Directories: input CSVs live under ../data, model artefacts next to the notebook
MODEL_PATH = Path('.')
DATA_PATH = Path('../data')

# Reference catalogue first, then the raw user submissions
vehicle_master = pd.read_csv(DATA_PATH.joinpath('vehicle_master_cleaned.csv'))
user_inputs = pd.read_csv(DATA_PATH.joinpath('user_inputs_year_validated.csv'))

# Row counts of both tables
print(f"📊 Vehicle Master Data: {vehicle_master.shape[0]} records")
print(f"📊 User Input Data: {user_inputs.shape[0]} records")

# Column names of both tables
print("\n🔍 Vehicle Master Columns:", list(vehicle_master.columns))
print("🔍 User Input Columns:", list(user_inputs.columns))

# Last expression of the cell: preview of the catalogue
vehicle_master.head(5)

## 1. Synthetic Dataset Generation

Generate incorrect→correct pairs for training ML models

In [None]:
class SyntheticDataGenerator:
    """Generate synthetic (misspelled -> correct) training pairs from the vehicle catalogue.

    The catalogue must expose ``brand`` and ``model`` columns. Missing values
    in either column are ignored. All randomness comes from the module-level
    ``random`` generator, so seeding it makes the output reproducible.
    """

    # Character pairs that are commonly confused when text is read by OCR.
    _OCR_SUBSTITUTIONS = {
        'o': '0', '0': 'o', 'i': '1', '1': 'i', 'l': '1',
        's': '5', '5': 's', 'b': '6', '6': 'b', 'g': '9',
        'O': '0', 'I': '1', 'S': '5', 'B': '8'
    }

    # Neighbouring keys used to simulate a slipped finger.
    _KEYBOARD_NEIGHBOURS = {
        'q': ['w', 'a'], 'w': ['q', 'e', 's'], 'e': ['w', 'r', 'd'],
        'r': ['e', 't', 'f'], 't': ['r', 'y', 'g'], 'y': ['t', 'u', 'h'],
        'u': ['y', 'i', 'j'], 'i': ['u', 'o', 'k'], 'o': ['i', 'p', 'l'],
        'p': ['o', 'l'], 'a': ['q', 's', 'z'], 's': ['a', 'd', 'w', 'x'],
        'd': ['s', 'f', 'e', 'c'], 'f': ['d', 'g', 'r', 'v'],
        'g': ['f', 'h', 't', 'b'], 'h': ['g', 'j', 'y', 'n'],
        'j': ['h', 'k', 'u', 'm'], 'k': ['j', 'l', 'i'],
        'l': ['k', 'o', 'p'], 'z': ['a', 'x'], 'x': ['z', 'c', 's'],
        'c': ['x', 'v', 'd'], 'v': ['c', 'b', 'f'], 'b': ['v', 'n', 'g'],
        'n': ['b', 'm', 'h'], 'm': ['n', 'j']
    }

    # Several typo helpers can leave the text untouched (e.g. the chosen
    # character has no OCR look-alike); retry this many times before giving up.
    _MAX_ATTEMPTS_PER_TYPO = 10

    _COLUMNS = ['input_text', 'correct_text', 'field_type', 'correction_type']

    def __init__(self, vehicle_master_df):
        self.vehicle_master = vehicle_master_df
        # NaN entries cannot be misspelled, so they are dropped up front.
        self.brands = vehicle_master_df['brand'].dropna().unique().tolist()
        self.models = vehicle_master_df['model'].dropna().unique().tolist()
        # Brand/model combinations that actually occur in the catalogue.
        pairs = vehicle_master_df[['brand', 'model']].dropna().drop_duplicates()
        self.brand_model_pairs = list(pairs.itertuples(index=False, name=None))

    @staticmethod
    def _is_missing(value):
        """Return True for None / NaN / pandas NA scalars."""
        if value is None:
            return True
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # Non-scalar input (e.g. a list) is not a missing value.
            return False

    def introduce_typos(self, text, num_typos=1):
        """Return a lower-cased copy of *text* with ``num_typos`` typos applied.

        Missing values and texts shorter than two characters are returned
        unchanged (and not lower-cased). Each typo is retried a bounded number
        of times so that it really changes the text.
        """
        if self._is_missing(text):
            return text

        raw = str(text)
        if len(raw) < 2:
            return text

        corrupted = raw.lower()
        for _ in range(num_typos):
            corrupted = self._apply_one_typo(corrupted)
        return corrupted

    def _apply_one_typo(self, text):
        """Apply one randomly chosen typo, retrying while the result is unchanged."""
        typo_methods = (
            self._character_substitution,
            self._character_deletion,
            self._character_insertion,
            self._character_transposition,
            self._ocr_errors,
            self._keyboard_errors,
        )
        for _ in range(self._MAX_ATTEMPTS_PER_TYPO):
            candidate = random.choice(typo_methods)(text)
            if candidate != text:
                return candidate
        return text

    def _character_substitution(self, text):
        """Replace one character with a random lowercase ASCII letter."""
        if len(text) < 1:
            return text
        pos = random.randint(0, len(text) - 1)
        new_char = random.choice(string.ascii_lowercase)
        return text[:pos] + new_char + text[pos+1:]

    def _character_deletion(self, text):
        """Drop one character; texts shorter than two characters are kept."""
        if len(text) < 2:
            return text
        pos = random.randint(0, len(text) - 1)
        return text[:pos] + text[pos+1:]

    def _character_insertion(self, text):
        """Insert a random lowercase ASCII letter at a random position."""
        pos = random.randint(0, len(text))
        new_char = random.choice(string.ascii_lowercase)
        return text[:pos] + new_char + text[pos:]

    def _character_transposition(self, text):
        """Swap two adjacent characters."""
        if len(text) < 2:
            return text
        pos = random.randint(0, len(text) - 2)
        chars = list(text)
        chars[pos], chars[pos + 1] = chars[pos + 1], chars[pos]
        return ''.join(chars)

    def _ocr_errors(self, text):
        """Swap one character for its OCR look-alike, if it has one."""
        if not text:
            return text

        pos = random.randint(0, len(text) - 1)
        replacement = self._OCR_SUBSTITUTIONS.get(text[pos])
        if replacement is None:
            return text
        return text[:pos] + replacement + text[pos+1:]

    def _keyboard_errors(self, text):
        """Swap one character for a neighbouring key, if it has neighbours."""
        if not text:
            return text

        pos = random.randint(0, len(text) - 1)
        neighbours = self._KEYBOARD_NEIGHBOURS.get(text[pos].lower())
        if neighbours is None:
            return text
        return text[:pos] + random.choice(neighbours) + text[pos+1:]

    def _synthesize(self, pool, count, field_type, max_typos):
        """Build *count* records by corrupting random entries of *pool*."""
        if not pool:
            # Nothing to sample from (e.g. an empty catalogue column).
            return []

        records = []
        for _ in range(count):
            correct = random.choice(pool)
            records.append({
                'input_text': self.introduce_typos(correct, random.randint(1, max_typos)),
                'correct_text': correct,
                'field_type': field_type,
                'correction_type': 'synthetic'
            })
        return records

    def generate_synthetic_dataset(self, num_samples=5000):
        """Generate synthetic incorrect→correct pairs.

        The samples are split evenly between brand, model and combined
        ("brand model") examples; any remainder goes to the combined group so
        that ``num_samples`` rows are produced. Combined examples are drawn
        from brand/model pairs that exist in the catalogue rather than from
        arbitrary brand x model combinations.

        Returns:
            DataFrame with columns ``input_text``, ``correct_text``,
            ``field_type`` and ``correction_type``.
        """
        per_group, remainder = divmod(max(num_samples, 0), 3)
        combined_pool = [f"{brand} {model}" for brand, model in self.brand_model_pairs]

        synthetic_data = []
        synthetic_data += self._synthesize(self.brands, per_group, 'brand', 2)
        synthetic_data += self._synthesize(self.models, per_group, 'model', 2)
        synthetic_data += self._synthesize(combined_pool, per_group + remainder, 'combined', 3)

        # Explicit columns keep the frame usable even when no rows were produced.
        return pd.DataFrame(synthetic_data, columns=self._COLUMNS)

# Build the synthetic training corpus from the reference catalogue
generator = SyntheticDataGenerator(vehicle_master)
synthetic_df = generator.generate_synthetic_dataset(num_samples=6000)

# Size of the corpus and how it is split across field types
print(f"📊 Generated {synthetic_df.shape[0]} synthetic training examples")
print("📊 Field type distribution:")
print(synthetic_df['field_type'].value_counts())

# Preview (last expression of the cell)
print("\n🔍 Sample synthetic corrections:")
synthetic_df.head(n=10)

## 2. Rule-Based Fuzzy Matching System

Implement the baseline rule-based correction system

In [None]:
class RuleBasedCorrector:
    """Baseline corrector that fuzzy-matches input against the vehicle catalogue.

    The catalogue must expose ``brand`` and ``model`` columns. Missing values
    are excluded from the candidate lists because they cannot be matched.
    """

    def __init__(self, vehicle_master_df):
        self.vehicle_master = vehicle_master_df
        self.brands = vehicle_master_df['brand'].dropna().unique().tolist()
        self.models = vehicle_master_df['model'].dropna().unique().tolist()
        # "brand model" strings for every combination present in the catalogue;
        # these are the candidates for 'combined' inputs.
        pairs = vehicle_master_df[['brand', 'model']].dropna().drop_duplicates()
        self.brand_model_names = [
            f"{brand} {model}" for brand, model in pairs.itertuples(index=False, name=None)
        ]

    def fuzzy_correct(self, input_text, candidates, threshold=0.6):
        """Find best fuzzy match from candidates.

        Args:
            input_text: Text to correct. None, NaN and empty strings yield no match.
            candidates: Strings to match against.
            threshold: Minimum similarity in [0, 1] for a match to be accepted.

        Returns:
            ``(match, confidence)`` with confidence in [0, 1], or
            ``(None, 0.0)`` when nothing reaches the threshold.
        """
        if not candidates or input_text is None:
            return None, 0.0
        if not isinstance(input_text, str):
            # NaN (a float) is how pandas represents a missing cell.
            if input_text != input_text:
                return None, 0.0
            input_text = str(input_text)
        if not input_text:
            return None, 0.0

        # Single best candidate according to fuzzywuzzy's default scorer;
        # the score is on a 0-100 scale.
        best_match = process.extractOne(input_text, candidates)

        if best_match and best_match[1] >= threshold * 100:
            return best_match[0], best_match[1] / 100.0

        return None, 0.0

    def correct_brand(self, input_brand, threshold=0.6):
        """Fuzzy-match *input_brand* against the known brands."""
        return self.fuzzy_correct(input_brand, self.brands, threshold)

    def correct_model(self, input_model, threshold=0.6):
        """Fuzzy-match *input_model* against the known models."""
        return self.fuzzy_correct(input_model, self.models, threshold)

    def evaluate_on_synthetic(self, synthetic_df, threshold=0.6):
        """Evaluate rule-based approach on synthetic data.

        ``accuracy`` is the share of correct answers among the rows for which a
        prediction was made; ``coverage`` is the share of rows that received a
        prediction. Both are 0 when there is nothing to evaluate.

        Rows whose ``field_type`` is neither 'brand' nor 'model' are matched
        against the catalogue's "brand model" strings, since their
        ``correct_text`` has that shape.
        """
        candidates_by_field = {'brand': self.brands, 'model': self.models}

        correct_predictions = 0
        total_predictions = 0
        results = []

        for _, row in synthetic_df.iterrows():
            input_text = row['input_text']
            correct_text = row['correct_text']
            field_type = row['field_type']

            candidates = candidates_by_field.get(field_type, self.brand_model_names)
            prediction, confidence = self.fuzzy_correct(input_text, candidates, threshold)

            is_correct = prediction == correct_text if prediction else False

            results.append({
                'input': input_text,
                'correct': correct_text,
                'predicted': prediction,
                'confidence': confidence,
                'is_correct': is_correct,
                'field_type': field_type
            })

            if prediction:
                total_predictions += 1
                if is_correct:
                    correct_predictions += 1

        total_rows = len(synthetic_df)
        accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
        coverage = total_predictions / total_rows if total_rows > 0 else 0

        return {
            'accuracy': accuracy,
            'coverage': coverage,
            'correct_predictions': correct_predictions,
            'total_predictions': total_predictions,
            'results': pd.DataFrame(results)
        }

# Baseline: fuzzy matcher built on the catalogue, scored on the synthetic corpus
rule_corrector = RuleBasedCorrector(vehicle_master)
rule_results = rule_corrector.evaluate_on_synthetic(synthetic_df, threshold=0.6)

# Headline metrics, emitted as one block of text
rule_summary_lines = [
    "📊 RULE-BASED FUZZY MATCHING RESULTS:",
    f"   Accuracy: {rule_results['accuracy']:.3f}",
    f"   Coverage: {rule_results['coverage']:.3f}",
    f"   Correct Predictions: {rule_results['correct_predictions']}",
    f"   Total Predictions: {rule_results['total_predictions']}",
]
print("\n".join(rule_summary_lines))

# Per-row preview (last expression of the cell)
print("\n🔍 Sample rule-based corrections:")
rule_results['results'].head(n=10)

## 3. Machine Learning Models (sklearn)

Train various ML models for pattern-based correction

In [None]:
class MLCorrector:
    """Character n-gram classifiers that map a misspelled string to a catalogue entry.

    Every known brand and model is one class. Training rows whose
    ``correct_text`` is not a single brand or model (for example combined
    "brand model" strings) are not used.
    """

    def __init__(self, vehicle_master_df):
        self.vehicle_master = vehicle_master_df
        self.brands = vehicle_master_df['brand'].unique().tolist()
        self.models = vehicle_master_df['model'].unique().tolist()
        self.all_targets = self.brands + self.models

        # Create label mappings
        self.target_to_label = {target: i for i, target in enumerate(self.all_targets)}
        self.label_to_target = {i: target for target, i in self.target_to_label.items()}

        # model name -> {'pipeline', 'train_score', 'test_score', 'cv_mean', 'cv_std'}
        self.models_dict = {}
        self.vectorizers = {}

    def extract_features(self, text):
        """Extract character-level and word-level features.

        Returns a list of string tokens: character 2- to 4-grams, whole words,
        the text length and the first/last character. Falsy input yields [].
        """
        if not text:
            return []

        text = str(text).lower()

        # Character n-grams (2-4)
        features = [
            f"char_{n}_{text[i:i+n]}"
            for n in range(2, 5)
            for i in range(len(text) - n + 1)
        ]

        # Word features
        features.extend(f"word_{word}" for word in text.split())

        # Length and first/last character
        features.append(f"len_{len(text)}")
        features.append(f"first_{text[0]}")
        features.append(f"last_{text[-1]}")

        return features

    def prepare_training_data(self, synthetic_df):
        """Return ``(X, y)``: input texts and integer labels for rows with a known target."""
        X = []
        y = []

        for input_text, correct_text in zip(synthetic_df['input_text'], synthetic_df['correct_text']):
            label = self.target_to_label.get(correct_text)
            if label is not None:
                X.append(input_text)
                y.append(label)

        return X, y

    def train_models(self, synthetic_df):
        """Train multiple ML models.

        Fits four vectorizer+classifier pipelines on an 80/20 split and records
        train, test and 5-fold cross-validation accuracy for each.

        Returns:
            dict mapping model name to its pipeline and scores (also stored in
            ``self.models_dict``).

        Raises:
            ValueError: if no row of *synthetic_df* has a known target.
        """
        print("🔧 Preparing training data...")
        X, y = self.prepare_training_data(synthetic_df)

        print(f"📊 Training data: {len(X)} samples, {len(set(y))} classes")

        if not X:
            raise ValueError(
                "No training samples: no 'correct_text' value is a known brand or model"
            )

        # A stratified split needs at least two samples per class and at least
        # one slot per class on both sides of the split; otherwise
        # train_test_split raises. Fall back to a plain random split then.
        _, class_counts = np.unique(y, return_counts=True)
        n_classes = len(class_counts)
        n_test = int(np.ceil(0.2 * len(y)))
        can_stratify = (
            class_counts.min() >= 2
            and n_test >= n_classes
            and len(y) - n_test >= n_classes
        )
        if not can_stratify:
            print("⚠️ Too few samples per class for a stratified split; using a random split.")

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y if can_stratify else None
        )

        # Define models to train
        models_to_train = {
            'random_forest': {
                'vectorizer': TfidfVectorizer(analyzer='char', ngram_range=(2, 4), max_features=10000),
                'model': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
            },
            'gradient_boosting': {
                'vectorizer': TfidfVectorizer(analyzer='char', ngram_range=(2, 4), max_features=5000),
                'model': GradientBoostingClassifier(n_estimators=100, random_state=42)
            },
            'svm': {
                'vectorizer': TfidfVectorizer(analyzer='char', ngram_range=(2, 3), max_features=5000),
                'model': SVC(kernel='rbf', probability=True, random_state=42)
            },
            'naive_bayes': {
                'vectorizer': CountVectorizer(analyzer='char', ngram_range=(2, 4), max_features=5000),
                'model': MultinomialNB()
            }
        }

        results = {}

        for model_name, config in models_to_train.items():
            print(f"\n🔧 Training {model_name}...")

            # Named model_pipeline so the transformers `pipeline` import is not shadowed.
            model_pipeline = Pipeline([
                ('vectorizer', config['vectorizer']),
                ('classifier', config['model'])
            ])

            # Train model
            model_pipeline.fit(X_train, y_train)

            # Evaluate
            train_score = model_pipeline.score(X_train, y_train)
            test_score = model_pipeline.score(X_test, y_test)

            # Cross-validation (fits fresh clones; the fitted pipeline is untouched)
            cv_scores = cross_val_score(model_pipeline, X_train, y_train, cv=5)

            results[model_name] = {
                'pipeline': model_pipeline,
                'train_score': train_score,
                'test_score': test_score,
                'cv_mean': cv_scores.mean(),
                'cv_std': cv_scores.std()
            }

            print(f"   Train Accuracy: {train_score:.3f}")
            print(f"   Test Accuracy: {test_score:.3f}")
            print(f"   CV Score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

        self.models_dict = results
        self.X_test = X_test
        self.y_test = y_test

        return results

    def predict_correction(self, input_text, model_name='random_forest', threshold=0.5):
        """Predict correction using trained model.

        Returns:
            ``(target, confidence)`` when the most probable class reaches
            *threshold*, ``(None, confidence)`` when it does not, and
            ``(None, 0.0)`` for an unknown model name or empty/non-string input.
        """
        if model_name not in self.models_dict:
            return None, 0.0
        if not isinstance(input_text, str) or not input_text:
            # The vectorizers only accept non-empty text.
            return None, 0.0

        model_pipeline = self.models_dict[model_name]['pipeline']

        # One inference pass: class probabilities for this input
        proba = model_pipeline.predict_proba([input_text])[0]
        best_idx = int(np.argmax(proba))
        best_confidence = float(proba[best_idx])

        if best_confidence >= threshold:
            # Take the label from the probability vector itself. A separate
            # predict() call can disagree with arg-max(predict_proba) (e.g. SVC
            # with probability=True), which would pair the reported confidence
            # with a different label.
            predicted_label = int(model_pipeline.classes_[best_idx])
            return self.label_to_target[predicted_label], best_confidence

        return None, best_confidence

# Fit every sklearn pipeline on the synthetic corpus
ml_corrector = MLCorrector(vehicle_master)
ml_results = ml_corrector.train_models(synthetic_df)

# Banner followed by one summary line per trained model
for banner in ("\n📊 ML MODEL TRAINING COMPLETE", "\n🏆 Model Performance Summary:"):
    print(banner)
for model_name, results in ml_results.items():
    test_acc, cv_acc = results['test_score'], results['cv_mean']
    print(f"   {model_name}: Test={test_acc:.3f}, CV={cv_acc:.3f}")

# Spot-check the random forest on a few hand-written misspellings
print("\n🔍 Sample ML predictions:")
test_inputs = ['toyot', 'hond', 'camr', 'civicy', 'nisssan']
for test_input in test_inputs:
    prediction, confidence = ml_corrector.predict_correction(test_input, 'random_forest')
    print(f"   '{test_input}' → '{prediction}' (confidence: {confidence:.3f})")


## 4. Transformer-Based Contextual Correction (Optional)

Use pre-trained transformers for contextual understanding

In [None]:
class TransformerCorrector:
    """Corrector that picks the catalogue entry whose embedding is closest to the input.

    The embedding model is loaded only when the module-level
    ``TRANSFORMERS_AVAILABLE`` flag is true. Without a model, every lookup
    returns ``(None, 0.0)``.

    Embeddings of ``all_targets`` are computed once and cached. Call
    ``setup_transformer()`` again (or reset ``_target_embeddings`` to None)
    after changing ``all_targets``.
    """

    def __init__(self, vehicle_master_df):
        self.vehicle_master = vehicle_master_df
        self.brands = vehicle_master_df['brand'].unique().tolist()
        self.models = vehicle_master_df['model'].unique().tolist()
        self.all_targets = self.brands + self.models

        # Always defined, so the "no model" state is explicit.
        self.model = None
        self.model_type = None
        self.tokenizer = None
        self._target_embeddings = None

        if TRANSFORMERS_AVAILABLE:
            self.setup_transformer()
        else:
            print("⚠️ Transformers not available. Skipping transformer-based correction.")

    def setup_transformer(self):
        """Setup transformer model for similarity.

        Prefers a SentenceTransformer model and falls back to plain DistilBERT
        when ``sentence_transformers`` is not installed. On any other failure
        the corrector is left without a model.
        """
        try:
            # Use a lightweight model for similarity
            model_name = 'sentence-transformers/all-MiniLM-L6-v2'

            try:
                from sentence_transformers import SentenceTransformer
                self.model = SentenceTransformer(model_name)
                self.model_type = 'sentence_transformer'
                print(f"✅ Loaded SentenceTransformer: {model_name}")
            except ImportError:
                # Fallback to basic transformers
                self.tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
                self.model = AutoModel.from_pretrained('distilbert-base-uncased')
                self.model_type = 'basic_transformer'
                print("✅ Loaded basic transformer: distilbert-base-uncased")

        except Exception as e:
            # Best effort: a failed download/load must not break the notebook.
            print(f"❌ Error loading transformer: {e}")
            self.model = None
            self.model_type = None

        # Cached target embeddings belong to the previous model, if any.
        self._target_embeddings = None

    def get_embeddings(self, texts):
        """Get embeddings for texts.

        Returns one embedding per text, or None when no model is loaded or the
        model raises.
        """
        if self.model is None:
            return None

        try:
            if self.model_type == 'sentence_transformer':
                return self.model.encode(texts)

            # Basic transformer approach: mean-pool the last hidden state
            embeddings = []
            for text in texts:
                inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True)
                with torch.no_grad():
                    outputs = self.model(**inputs)
                    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
                    embeddings.append(embedding)
            return np.array(embeddings)
        except Exception as e:
            print(f"❌ Error getting embeddings: {e}")
            return None

    def _get_target_embeddings(self):
        """Return the embeddings of ``all_targets``, computing them on first use."""
        if self._target_embeddings is None:
            # A failed computation leaves the cache empty, so it is retried later.
            self._target_embeddings = self.get_embeddings(self.all_targets)
        return self._target_embeddings

    def find_best_match(self, input_text, threshold=0.7):
        """Find best match using transformer embeddings.

        Returns:
            ``(target, score)`` when the best cosine similarity reaches
            *threshold*, ``(None, score)`` when it does not, and
            ``(None, 0.0)`` when no model or no targets are available or an
            error occurs.
        """
        if self.model is None or not self.all_targets:
            return None, 0.0

        try:
            input_embedding = self.get_embeddings([input_text])
            # The target list does not change between queries, so its
            # embeddings are reused instead of re-encoded on every call.
            target_embeddings = self._get_target_embeddings()

            if input_embedding is None or target_embeddings is None:
                return None, 0.0

            # Calculate cosine similarities
            from sklearn.metrics.pairwise import cosine_similarity
            similarities = cosine_similarity(input_embedding, target_embeddings)[0]

            # Find best match
            best_idx = int(np.argmax(similarities))
            best_score = float(similarities[best_idx])

            if best_score >= threshold:
                return self.all_targets[best_idx], best_score

            return None, best_score

        except Exception as e:
            print(f"❌ Error in transformer matching: {e}")
            return None, 0.0

# Build the embedding-based corrector (it reports by itself when no model can be loaded)
transformer_corrector = TransformerCorrector(vehicle_master)

if not TRANSFORMERS_AVAILABLE:
    print("⚠️ Transformer-based correction skipped (transformers not available)")
else:
    # Spot-check a few hand-written misspellings
    print("\n🔍 Testing transformer-based corrections:")
    test_inputs = ['toyot', 'hond', 'camr', 'civicy', 'nisssan']
    for test_input in test_inputs:
        prediction, confidence = transformer_corrector.find_best_match(test_input)
        print(f"   '{test_input}' → '{prediction}' (confidence: {confidence:.3f})")


## 5. Hybrid Model Ensemble

Combine rule-based, ML, and transformer approaches

In [None]:
class HybridCorrector:
    """Weighted ensemble of the rule-based, ML and (optional) transformer correctors.

    Each member votes for its prediction with ``confidence * weight``. The
    candidate with the highest total wins if its score, normalised by the
    weights of the members that are actually configured, reaches the
    confidence threshold.
    """

    def __init__(self, rule_corrector, ml_corrector, transformer_corrector=None):
        self.rule_corrector = rule_corrector
        self.ml_corrector = ml_corrector
        self.transformer_corrector = transformer_corrector

        # Weights for ensemble (can be tuned)
        self.weights = {
            'rule': 0.3,
            'ml': 0.5,
            'transformer': 0.2
        }

    def _rule_prediction(self, input_text, field_type):
        """Return the rule-based ``(prediction, confidence)`` for *field_type*."""
        if field_type == 'brand':
            return self.rule_corrector.correct_brand(input_text)
        if field_type == 'model':
            return self.rule_corrector.correct_model(input_text)

        # 'auto' and any other field type: the input may be either a brand or
        # a model, so try both and keep the more confident match.
        brand_pred, brand_conf = self.rule_corrector.correct_brand(input_text)
        model_pred, model_conf = self.rule_corrector.correct_model(input_text)
        if brand_conf > model_conf:
            return brand_pred, brand_conf
        return model_pred, model_conf

    def predict_correction(self, input_text, field_type='auto', confidence_threshold=0.6):
        """Predict correction using ensemble approach.

        Args:
            input_text: Text to correct.
            field_type: 'brand', 'model', or anything else (default 'auto') to
                let the rule-based member consider both brands and models.
            confidence_threshold: Minimum normalised ensemble score.

        Returns:
            ``(prediction, score, predictions)`` where *predictions* holds the
            individual 'rule', 'ml' and 'transformer' results. The first two
            elements are ``(None, 0.0)`` when the ensemble abstains.
        """
        predictions = {}

        # Rule-based prediction
        rule_pred, rule_conf = self._rule_prediction(input_text, field_type)
        predictions['rule'] = {'prediction': rule_pred, 'confidence': rule_conf}

        # ML prediction
        ml_pred, ml_conf = self.ml_corrector.predict_correction(input_text, 'random_forest')
        predictions['ml'] = {'prediction': ml_pred, 'confidence': ml_conf}

        # Transformer prediction (if available)
        if self.transformer_corrector and TRANSFORMERS_AVAILABLE:
            trans_pred, trans_conf = self.transformer_corrector.find_best_match(input_text)
            predictions['transformer'] = {'prediction': trans_pred, 'confidence': trans_conf}
        else:
            # 'available': False keeps this member's weight out of the normalisation.
            predictions['transformer'] = {
                'prediction': None, 'confidence': 0.0, 'available': False
            }

        # Ensemble decision
        return self._ensemble_decision(predictions, confidence_threshold)

    def _ensemble_decision(self, predictions, confidence_threshold):
        """Make ensemble decision from multiple predictions.

        Entries flagged ``'available': False`` are members that are not
        configured at all; their weight does not count towards the maximum
        achievable score. Members that are configured but abstain still count.
        """
        # Weighted voting over the members that produced a prediction
        candidate_scores = {}
        for method, pred_data in predictions.items():
            prediction = pred_data['prediction']
            confidence = pred_data['confidence']
            if not prediction or not confidence > 0:
                continue
            weight = self.weights.get(method, 0.1)
            candidate_scores[prediction] = candidate_scores.get(prediction, 0) + confidence * weight

        if not candidate_scores:
            return None, 0.0, predictions

        best_prediction = max(candidate_scores, key=candidate_scores.get)
        best_score = candidate_scores[best_prediction]

        # Normalize score to [0, 1]: divide by the score a unanimous, fully
        # confident vote of all configured members would reach.
        max_possible_score = sum(
            self.weights.get(method, 0.1)
            for method, pred_data in predictions.items()
            if pred_data.get('available', True)
        )
        if max_possible_score <= 0:
            return None, 0.0, predictions

        normalized_score = best_score / max_possible_score
        if normalized_score >= confidence_threshold:
            return best_prediction, normalized_score, predictions

        return None, 0.0, predictions

    def evaluate_on_synthetic(self, synthetic_df, confidence_threshold=0.6):
        """Evaluate hybrid approach on synthetic data.

        ``accuracy`` is the share of correct answers among the rows for which
        the ensemble made a prediction; ``coverage`` is the share of rows that
        received one. Both are 0 when there is nothing to evaluate.

        NOTE(review): every member predicts a single brand or model, so rows
        whose ``correct_text`` is a combined "brand model" string cannot be
        scored as correct — confirm whether such rows should be evaluated here.
        """
        results = []
        correct_predictions = 0
        total_predictions = 0

        for _, row in synthetic_df.iterrows():
            input_text = row['input_text']
            correct_text = row['correct_text']
            field_type = row['field_type']

            prediction, confidence, all_predictions = self.predict_correction(
                input_text, field_type, confidence_threshold
            )

            is_correct = prediction == correct_text if prediction else False

            results.append({
                'input': input_text,
                'correct': correct_text,
                'predicted': prediction,
                'confidence': confidence,
                'is_correct': is_correct,
                'field_type': field_type,
                'rule_pred': all_predictions['rule']['prediction'],
                'ml_pred': all_predictions['ml']['prediction'],
                'transformer_pred': all_predictions['transformer']['prediction']
            })

            if prediction:
                total_predictions += 1
                if is_correct:
                    correct_predictions += 1

        total_rows = len(synthetic_df)
        accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
        coverage = total_predictions / total_rows if total_rows > 0 else 0

        return {
            'accuracy': accuracy,
            'coverage': coverage,
            'correct_predictions': correct_predictions,
            'total_predictions': total_predictions,
            'results': pd.DataFrame(results)
        }

# Assemble the ensemble; the transformer member is handed over only when its libraries loaded
hybrid_corrector = HybridCorrector(
    rule_corrector,
    ml_corrector,
    None if not TRANSFORMERS_AVAILABLE else transformer_corrector
)

# Score the ensemble on the full synthetic corpus
print("🔧 Evaluating hybrid ensemble approach...")
hybrid_results = hybrid_corrector.evaluate_on_synthetic(synthetic_df, confidence_threshold=0.6)

# Headline metrics, emitted as one block of text
hybrid_summary_lines = [
    "\n📊 HYBRID ENSEMBLE RESULTS:",
    f"   Accuracy: {hybrid_results['accuracy']:.3f}",
    f"   Coverage: {hybrid_results['coverage']:.3f}",
    f"   Correct Predictions: {hybrid_results['correct_predictions']}",
    f"   Total Predictions: {hybrid_results['total_predictions']}",
]
print("\n".join(hybrid_summary_lines))

# Spot-check a few hand-written inputs with the default field type
print("\n🔍 Sample hybrid predictions:")
test_inputs = ['toyot', 'hond', 'camr', 'civicy', 'nisssan', 'perodua', 'myvi']
for test_input in test_inputs:
    prediction, confidence, all_preds = hybrid_corrector.predict_correction(test_input)
    print(f"   '{test_input}' → '{prediction}' (confidence: {confidence:.3f})")


## 6. Model Comparison and Evaluation

Compare all approaches and select the best performing model

In [None]:
# Compare all approaches
print("📊 MODEL COMPARISON SUMMARY")
print("=" * 50)


def _f1_from_accuracy_and_coverage(accuracy, coverage):
    """Return the F1 score implied by accuracy-on-attempted and coverage.

    ``accuracy`` (correct / attempted) plays the role of precision and
    ``accuracy * coverage`` (correct / all rows) the role of recall; F1 is
    their harmonic mean. With full coverage this equals the accuracy.
    """
    precision = accuracy
    recall = accuracy * coverage
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)


# NOTE: the rows are not measured on identical data. The ML scores come from
# the held-out 20% of the brand/model rows, while the rule-based and hybrid
# scores are computed on the complete synthetic set (including combined rows
# and the rows the ML models were trained on). Treat the ranking as indicative.
comparison_data = []

# Rule-based results
comparison_data.append({
    'Model': 'Rule-based (Fuzzy)',
    'Accuracy': rule_results['accuracy'],
    'Coverage': rule_results['coverage'],
    'F1-Score': _f1_from_accuracy_and_coverage(
        rule_results['accuracy'], rule_results['coverage']
    )
})

# ML model results
for model_name, results in ml_results.items():
    comparison_data.append({
        'Model': f'ML ({model_name})',
        'Accuracy': results['test_score'],
        'Coverage': 1.0,  # ML models always make predictions
        'F1-Score': _f1_from_accuracy_and_coverage(results['test_score'], 1.0)
    })

# Hybrid results
comparison_data.append({
    'Model': 'Hybrid Ensemble',
    'Accuracy': hybrid_results['accuracy'],
    'Coverage': hybrid_results['coverage'],
    'F1-Score': _f1_from_accuracy_and_coverage(
        hybrid_results['accuracy'], hybrid_results['coverage']
    )
})

comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.sort_values('F1-Score', ascending=False)

print(comparison_df.to_string(index=False))

# Visualize comparison
plt.figure(figsize=(12, 8))

# One bar chart per metric in the first three panels
for panel, metric in enumerate(('Accuracy', 'Coverage', 'F1-Score'), start=1):
    plt.subplot(2, 2, panel)
    plt.bar(comparison_df['Model'], comparison_df[metric])
    plt.title(f'Model {metric} Comparison')
    plt.xticks(rotation=45)
    plt.ylabel(metric)

# Performance by field type (share of rows answered correctly, abstentions count as wrong)
plt.subplot(2, 2, 4)
field_performance = hybrid_results['results'].groupby('field_type')['is_correct'].mean()
plt.bar(field_performance.index, field_performance.values)
plt.title('Hybrid Model Performance by Field Type')
plt.ylabel('Accuracy')

plt.tight_layout()
plt.show()

# Best model selection: first row after sorting by F1-Score, descending
best_model = comparison_df.iloc[0]
print(f"\n🏆 BEST PERFORMING MODEL: {best_model['Model']}")
print(f"   Accuracy: {best_model['Accuracy']:.3f}")
print(f"   Coverage: {best_model['Coverage']:.3f}")
print(f"   F1-Score: {best_model['F1-Score']:.3f}")


## 7. Correction Mapping Storage

Create and store correction mappings for real-time inference

In [None]:
class CorrectionMappingStorage:
    """Build correction lookup tables and persist them with the trained models.

    Attributes:
        model_path: Directory that receives every saved artifact.
        correction_mappings: Result of the last ``build_correction_mappings`` call.
        model_metadata: Metadata written by the last ``save_models_and_mappings`` call.
    """

    def __init__(self, model_path='.'):
        self.model_path = Path(model_path)
        self.correction_mappings = {}
        self.model_metadata = {}
        # Row count of the dataset last passed to build_correction_mappings;
        # reused as the "synthetic_samples" metadata value when saving.
        self._synthetic_samples = None

    @staticmethod
    def _resolve_input(value, global_name):
        """Return ``value``, or the notebook-level variable ``global_name`` if it is None.

        The fallback keeps the two-argument call to
        ``save_models_and_mappings`` working inside the notebook.

        Raises:
            ValueError: If ``value`` is None and no such variable exists.
        """
        if value is not None:
            return value
        try:
            return globals()[global_name]
        except KeyError:
            raise ValueError(
                f"{global_name} was not provided and no notebook-level "
                f"variable named {global_name!r} exists"
            ) from None

    def build_correction_mappings(self, synthetic_df, hybrid_corrector,
                                  confidence_threshold=0.7, prediction_threshold=0.5):
        """Build correction mappings from synthetic data.

        Every ``input_text`` is run through ``hybrid_corrector``; only rows
        whose prediction equals ``correct_text`` are stored.

        Args:
            synthetic_df: DataFrame with ``input_text`` and ``correct_text`` columns.
            hybrid_corrector: Object whose ``predict_correction(text,
                confidence_threshold=...)`` returns a 3-tuple starting with
                ``(prediction, confidence, ...)``.
            confidence_threshold: Minimum confidence for a row to land in
                ``direct_mappings``; lower-confidence hits go to ``fuzzy_mappings``.
            prediction_threshold: Threshold forwarded to ``predict_correction``.

        Returns:
            Dict with ``direct_mappings``, ``fuzzy_mappings`` and
            ``pattern_mappings``; also stored on ``self.correction_mappings``.
        """
        print("🔧 Building correction mappings...")

        mappings = {
            'direct_mappings': {},  # High-confidence direct mappings
            'fuzzy_mappings': {},   # Lower-confidence fuzzy mappings
            'pattern_mappings': {}  # Pattern-based mappings
        }

        high_confidence_count = 0
        medium_confidence_count = 0

        # Only two columns are needed, so iterate them directly instead of
        # materializing a Series per row with iterrows().
        pairs = zip(synthetic_df['input_text'], synthetic_df['correct_text'])
        for input_text, correct_text in pairs:
            prediction, confidence, _ = hybrid_corrector.predict_correction(
                input_text, confidence_threshold=prediction_threshold
            )

            if prediction != correct_text:
                continue

            entry = {
                'correction': correct_text,
                # Plain float: NumPy scalars such as float32 are not JSON serializable
                'confidence': float(confidence),
                'method': 'hybrid'
            }
            if confidence >= confidence_threshold:
                mappings['direct_mappings'][input_text] = entry
                high_confidence_count += 1
            else:
                mappings['fuzzy_mappings'][input_text] = entry
                medium_confidence_count += 1

        # Add pattern-based mappings
        self._build_pattern_mappings(mappings, synthetic_df)

        self.correction_mappings = mappings
        self._synthetic_samples = len(synthetic_df)

        print(f"📊 Correction mappings built:")
        print(f"   Direct mappings: {high_confidence_count}")
        print(f"   Fuzzy mappings: {medium_confidence_count}")
        print(f"   Pattern mappings: {len(mappings['pattern_mappings'])}")

        return mappings

    def _build_pattern_mappings(self, mappings, synthetic_df):
        """Fill ``mappings['pattern_mappings']`` with fixed OCR substitution rules.

        ``synthetic_df`` is accepted for interface symmetry but is not read.
        """
        # OCR error patterns: look-alike characters
        ocr_patterns = {
            'o': '0', '0': 'o', 'i': '1', '1': 'i', 'l': '1',
            's': '5', '5': 's', 'b': '6', '6': 'b'
        }

        mappings['pattern_mappings'] = {
            f'ocr_{old_char}_to_{new_char}': {
                'pattern_type': 'ocr_substitution',
                'from_char': old_char,
                'to_char': new_char
            }
            for old_char, new_char in ocr_patterns.items()
        }

    def save_models_and_mappings(self, hybrid_corrector, ml_corrector,
                                 hybrid_results=None, ml_results=None,
                                 synthetic_samples=None):
        """Save all models, mappings and metadata to ``self.model_path``.

        Args:
            hybrid_corrector: Provides ``weights`` and a ``rule_corrector``
                with ``brands``, ``models`` and ``vehicle_master``.
            ml_corrector: Provides ``models_dict`` mapping a model name to a
                dict that holds the fitted model under ``'pipeline'``.
            hybrid_results: Dict with ``'accuracy'`` and ``'coverage'``.
                Defaults to the notebook-level ``hybrid_results``.
            ml_results: Dict mapping model name to a dict with
                ``'test_score'``. Defaults to the notebook-level ``ml_results``.
            synthetic_samples: Number of synthetic training rows. Defaults to
                the size of the dataset last passed to
                ``build_correction_mappings``, then to ``len(synthetic_df)``
                from the notebook.

        Returns:
            Dict with the paths of the mappings, metadata and reference files.

        Raises:
            ValueError: If a required input is neither passed nor available at
                notebook level, or if ``ml_results`` is empty.
        """
        print("💾 Saving models and mappings...")

        # Resolve and validate every input before touching the disk, so a
        # missing value cannot leave a half-written artifact directory.
        hybrid_results = self._resolve_input(hybrid_results, 'hybrid_results')
        ml_results = self._resolve_input(ml_results, 'ml_results')
        if not ml_results:
            raise ValueError("ml_results is empty; cannot pick the best ML model")
        if synthetic_samples is None:
            synthetic_samples = self._synthetic_samples
        if synthetic_samples is None:
            synthetic_samples = len(self._resolve_input(None, 'synthetic_df'))

        self.model_path.mkdir(parents=True, exist_ok=True)

        # Save correction mappings
        mappings_file = self.model_path / 'correction_mappings.json'
        with open(mappings_file, 'w', encoding='utf-8') as f:
            json.dump(self.correction_mappings, f, indent=2)

        # Save ML models
        for model_name, model_data in ml_corrector.models_dict.items():
            model_file = self.model_path / f'ml_model_{model_name}.pkl'
            with open(model_file, 'wb') as f:
                pickle.dump(model_data['pipeline'], f)

        # Save model metadata
        best_ml_model = max(ml_results, key=lambda name: ml_results[name]['test_score'])
        metadata = {
            'model_info': {
                'training_date': pd.Timestamp.now().isoformat(),
                'synthetic_samples': int(synthetic_samples),
                'vehicle_brands': len(hybrid_corrector.rule_corrector.brands),
                'vehicle_models': len(hybrid_corrector.rule_corrector.models)
            },
            'performance': {
                'hybrid_accuracy': float(hybrid_results['accuracy']),
                'hybrid_coverage': float(hybrid_results['coverage']),
                'best_ml_model': best_ml_model
            },
            'model_weights': hybrid_corrector.weights
        }

        metadata_file = self.model_path / 'model_metadata.json'
        with open(metadata_file, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2)
        self.model_metadata = metadata

        # Save vehicle master data for reference
        reference_file = self.model_path / 'vehicle_reference.csv'
        hybrid_corrector.rule_corrector.vehicle_master.to_csv(reference_file, index=False)

        print(f"✅ Models and mappings saved to {self.model_path}")
        print(f"   - correction_mappings.json")
        print(f"   - ml_model_*.pkl files")
        print(f"   - model_metadata.json")
        print(f"   - vehicle_reference.csv")

        return {
            'mappings_file': str(mappings_file),
            'metadata_file': str(metadata_file),
            'reference_file': str(reference_file)
        }

# Build the mappings from the synthetic data, then persist every artifact
# (mappings, ML models, metadata, reference CSV) under MODEL_PATH.
storage = CorrectionMappingStorage(MODEL_PATH)
mappings = storage.build_correction_mappings(
    synthetic_df,
    hybrid_corrector,
    confidence_threshold=0.7,
)
saved_files = storage.save_models_and_mappings(hybrid_corrector, ml_corrector)

# Report where each artifact was written
print("\n📁 Saved files:")
for artifact_kind, artifact_path in saved_files.items():
    print(f"   {artifact_kind}: {artifact_path}")

## 8. Production Integration Guide

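Load the saved artifacts and apply them as a layered correction pipeline: direct mappings first, then fuzzy mappings, then the best ML model, with fuzzy matching against the vehicle reference data as the final fallback.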

In [None]:
class ProductionAutocorrect:
    """Layered text corrector backed by the artifacts saved during training.

    Lookup order in ``correct_text``: direct mappings, fuzzy mappings, the
    best ML model, then fuzzy matching against the reference brands/models.
    """

    # Column order of the frame returned by batch_correct (also used when empty)
    _BATCH_COLUMNS = ['input', 'corrected', 'confidence', 'method', 'was_corrected']
    # Method tags that mean "nothing was corrected"
    _UNCHANGED_METHODS = frozenset({'no_input', 'no_correction'})

    def __init__(self, model_path='.'):
        self.model_path = Path(model_path)
        self.load_models_and_mappings()

    def load_models_and_mappings(self):
        """Load trained models and mappings for production use.

        Reads ``correction_mappings.json``, ``model_metadata.json``, the
        pickle of the best ML model named in the metadata, and
        ``vehicle_reference.csv`` from ``self.model_path``.

        Raises:
            FileNotFoundError: If any of the artifact files is missing.
        """
        # Load correction mappings
        mappings_file = self.model_path / 'correction_mappings.json'
        with open(mappings_file, 'r', encoding='utf-8') as f:
            self.correction_mappings = json.load(f)

        # Load model metadata
        metadata_file = self.model_path / 'model_metadata.json'
        with open(metadata_file, 'r', encoding='utf-8') as f:
            self.metadata = json.load(f)

        # Load best ML model
        best_model_name = self.metadata['performance']['best_ml_model']
        model_file = self.model_path / f'ml_model_{best_model_name}.pkl'
        # SECURITY: unpickling executes arbitrary code. Only point model_path
        # at a directory whose contents this pipeline wrote itself.
        with open(model_file, 'rb') as f:
            self.ml_model = pickle.load(f)

        # Load vehicle reference data
        reference_file = self.model_path / 'vehicle_reference.csv'
        self.vehicle_reference = pd.read_csv(reference_file)
        self.brands = self.vehicle_reference['brand'].unique().tolist()
        self.models = self.vehicle_reference['model'].unique().tolist()

        print(f"✅ Production autocorrect loaded:")
        print(f"   Best ML model: {best_model_name}")
        print(f"   Direct mappings: {len(self.correction_mappings['direct_mappings'])}")
        print(f"   Fuzzy mappings: {len(self.correction_mappings['fuzzy_mappings'])}")
        print(f"   Vehicle brands: {len(self.brands)}")
        print(f"   Vehicle models: {len(self.models)}")

    @staticmethod
    def _label_to_text(label, all_targets):
        """Translate a model output label into correction text, or None.

        String labels are returned as-is. Integer labels are used as an index
        into ``all_targets``; negative or out-of-range indices yield None.
        """
        if isinstance(label, str):
            return str(label)
        is_index = isinstance(label, (int, np.integer)) and not isinstance(label, bool)
        if is_index and 0 <= label < len(all_targets):
            # NOTE(review): assumes integer labels were encoded as positions in
            # brands + models, as the original code did — confirm against training.
            return all_targets[int(label)]
        return None

    def correct_text(self, input_text, confidence_threshold=0.6):
        """Correct a single input string.

        Args:
            input_text: Raw user text. Falsy, NaN or whitespace-only values
                are returned unchanged.
            confidence_threshold: Minimum confidence (0-1) for accepting a
                fuzzy mapping, an ML prediction or a fuzzy-match fallback.

        Returns:
            Tuple ``(text, confidence, method)``. ``method`` is one of
            ``'no_input'``, ``'direct_mapping'``, ``'fuzzy_mapping'``,
            ``'ml_model'``, ``'fuzzy_fallback'`` or ``'no_correction'``. For
            ``'no_correction'`` the text is the stripped, lower-cased input.
        """
        is_nan = isinstance(input_text, float) and np.isnan(input_text)
        if is_nan or not input_text:
            return input_text, 0.0, 'no_input'

        normalized = str(input_text).strip().lower()
        if not normalized:
            # Whitespace-only input carries nothing to correct
            return input_text, 0.0, 'no_input'

        # 1. Check direct mappings first (fastest)
        direct = self.correction_mappings['direct_mappings'].get(normalized)
        if direct is not None:
            return direct['correction'], direct['confidence'], 'direct_mapping'

        # 2. Check fuzzy mappings
        fuzzy = self.correction_mappings['fuzzy_mappings'].get(normalized)
        if fuzzy is not None and fuzzy['confidence'] >= confidence_threshold:
            return fuzzy['correction'], fuzzy['confidence'], 'fuzzy_mapping'

        all_targets = self.brands + self.models

        # 3. Use ML model for prediction (best effort: on failure, report the
        #    error and fall through to fuzzy matching)
        try:
            ml_proba = self.ml_model.predict_proba([normalized])[0]
            ml_confidence = float(np.max(ml_proba))

            if ml_confidence >= confidence_threshold:
                ml_prediction = self.ml_model.predict([normalized])[0]
                predicted_text = self._label_to_text(ml_prediction, all_targets)
                if predicted_text is not None:
                    return predicted_text, ml_confidence, 'ml_model'
        except Exception as e:
            print(f"ML prediction error: {e}")

        # 4. Fallback to fuzzy matching (scores are on a 0-100 scale)
        best_match = process.extractOne(normalized, all_targets)
        if best_match and best_match[1] >= confidence_threshold * 100:
            return best_match[0], best_match[1] / 100.0, 'fuzzy_fallback'

        # 5. No correction found
        return normalized, 0.0, 'no_correction'

    def batch_correct(self, input_list, confidence_threshold=0.6):
        """Batch correction for multiple inputs.

        Returns:
            DataFrame with columns ``input``, ``corrected``, ``confidence``,
            ``method`` and ``was_corrected``. The columns are present even
            when ``input_list`` is empty. ``was_corrected`` is True only when
            a correction step produced text that differs from the raw input;
            the normalization applied to uncorrected text does not count.
        """
        results = []

        for input_text in input_list:
            corrected, confidence, method = self.correct_text(input_text, confidence_threshold)
            results.append({
                'input': input_text,
                'corrected': corrected,
                'confidence': confidence,
                'method': method,
                'was_corrected': (
                    method not in self._UNCHANGED_METHODS and corrected != input_text
                )
            })

        return pd.DataFrame(results, columns=self._BATCH_COLUMNS)

# Demo: load the saved artifacts and run a handful of sample corrections
print("🚀 PRODUCTION INTEGRATION EXAMPLE")
print("=" * 40)

# Loading fails with FileNotFoundError when the training cells have not been
# run yet; any other failure is reported without stopping the notebook.
try:
    prod_corrector = ProductionAutocorrect(MODEL_PATH)

    # Sample inputs
    test_inputs = [
        'toyot', 'hond', 'camr', 'civicy',
        'nisssan', 'myvi', 'perodua',
    ]

    # One-by-one corrections
    print("\n🔍 Production correction examples:")
    for test_input in test_inputs:
        corrected, confidence, method = prod_corrector.correct_text(test_input)
        print(f"   '{test_input}' → '{corrected}' (conf: {confidence:.3f}, method: {method})")

    # The same inputs through the batch API, summarized
    batch_results = prod_corrector.batch_correct(test_inputs)
    print(f"\n📊 Batch correction summary:")
    print(f"   Total inputs: {len(batch_results)}")
    print(f"   Corrections made: {batch_results['was_corrected'].sum()}")
    print(f"   Average confidence: {batch_results['confidence'].mean():.3f}")

except FileNotFoundError:
    print("⚠️ Model files not found. Please run the training sections first.")
except Exception as e:
    print(f"❌ Error loading production models: {e}")

## 9. Summary and Next Steps

Summary of the training run, the saved artifacts, and recommended next steps.

In [None]:
# Final report. Lines that read notebook state are printed one at a time so a
# missing variable interrupts the output at the same point as before; fixed
# text is grouped into tuples and printed in order.
print("🎯 HYBRID AUTOCORRECT MODEL TRAINING COMPLETE")
print("=" * 60)

print("📋 TRAINING SUMMARY:")
print(f"   • Synthetic dataset: {len(synthetic_df)} samples")
print(f"   • Vehicle brands: {len(vehicle_master['brand'].unique())}")
print(f"   • Vehicle models: {len(vehicle_master['model'].unique())}")
print(f"   • ML models trained: {len(ml_results)}")

print("🏆 BEST MODEL PERFORMANCE:")
# Highest test score wins; ties keep the first model in dict order
best_model_name = max(ml_results, key=lambda name: ml_results[name]['test_score'])
best_model_score = ml_results[best_model_name]['test_score']
print(f"   • Best ML model: {best_model_name} ({best_model_score:.3f} accuracy)")
print(f"   • Hybrid accuracy: {hybrid_results['accuracy']:.3f}")
print(f"   • Hybrid coverage: {hybrid_results['coverage']:.3f}")

static_sections = (
    (
        "💾 SAVED ARTIFACTS:",
        "   • correction_mappings.json - Direct and fuzzy correction mappings",
        "   • ml_model_*.pkl - Trained ML models (Random Forest, SVM, etc.)",
        "   • model_metadata.json - Training metadata and performance metrics",
        "   • vehicle_reference.csv - Reference vehicle data",
    ),
    (
        "🚀 PRODUCTION INTEGRATION:",
        "   1. Use ProductionAutocorrect class for real-time corrections",
        "   2. Adjust confidence thresholds based on your requirements",
        "   3. Monitor correction accuracy and retrain as needed",
        "   4. Consider A/B testing different model combinations",
    ),
    (
        "🔄 NEXT STEPS:",
        "   • Deploy models to production environment",
        "   • Set up monitoring for correction accuracy",
        "   • Collect real user corrections for model improvement",
        "   • Consider fine-tuning transformer models with domain data",
        "   • Implement feedback loop for continuous learning",
    ),
    (
        "✅ Training pipeline completed successfully!",
    ),
)
for section_lines in static_sections:
    for summary_line in section_lines:
        print(summary_line)