In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
from sklearn.base import BaseEstimator, TransformerMixin

# Custom transformer for date features
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Expand date columns into numeric calendar features.

    For every selected date column ``c`` the transformer emits ``c_year``,
    ``c_month``, ``c_day``, ``c_dayofweek`` and ``c_days_since_2000``.
    Values that cannot be parsed are coerced to NaT and therefore show up
    as NaN in each derived feature.

    Parameters
    ----------
    date_columns : list of str
        Names of the columns to treat as dates. Columns that are absent
        from the frame passed to ``fit`` are ignored.

    Attributes
    ----------
    feature_columns_ : list of str
        Set by ``fit``: the date columns that were present and held at
        least one parseable date. ``transform`` emits features for exactly
        these columns, so the output schema does not depend on the
        contents of the batch being transformed.
    """

    def __init__(self, date_columns):
        self.date_columns = date_columns

    @staticmethod
    def _parse(values):
        """Parse a column to datetimes, turning unparseable entries into NaT."""
        return pd.to_datetime(values, errors='coerce')

    def fit(self, X, y=None):
        """Record which date columns are usable in the training frame.

        Parameters
        ----------
        X : DataFrame
        y : ignored

        Returns
        -------
        self
        """
        self.feature_columns_ = [
            col for col in self.date_columns
            if col in X.columns and not self._parse(X[col]).isna().all()
        ]
        return self

    def transform(self, X):
        """Return a DataFrame of calendar features indexed like ``X``.

        After ``fit`` the set of output columns is fixed: a date column
        that is missing from ``X`` or entirely unparseable still yields its
        five feature columns (filled with NaN), so small prediction batches
        produce the same schema as the training data. When the transformer
        has not been fitted, columns are selected per call (present in
        ``X`` and not entirely NaT).
        """
        result = pd.DataFrame(index=X.index)
        fitted_columns = getattr(self, 'feature_columns_', None)
        is_fitted = fitted_columns is not None

        if is_fitted:
            columns = fitted_columns
        else:
            columns = [col for col in self.date_columns if col in X.columns]

        reference = pd.Timestamp('2000-01-01')
        for col in columns:
            if col in X.columns:
                dates = self._parse(X[col])
            else:
                # Keep the fitted schema even when the column is absent.
                dates = pd.Series(pd.NaT, index=X.index, dtype='datetime64[ns]')

            if not is_fitted and dates.isna().all():
                continue

            result[f'{col}_year'] = dates.dt.year
            result[f'{col}_month'] = dates.dt.month
            result[f'{col}_day'] = dates.dt.day
            result[f'{col}_dayofweek'] = dates.dt.dayofweek
            # Days elapsed since a fixed reference date.
            result[f'{col}_days_since_2000'] = (dates - reference).dt.days

        return result

# Custom transformer for text features
class TextFeatureExtractor(BaseEstimator, TransformerMixin):
    """Turn free-text columns into TF-IDF features, one vectorizer per column.

    Output columns are named ``<column>_<term>``.

    Parameters
    ----------
    text_columns : list of str
        Columns to vectorize. Columns absent from the frame passed to
        ``fit`` are ignored.
    max_features : int, default=1000
        Vocabulary size cap, per column.
    min_df : int or float, default=5
        Minimum document frequency passed to ``TfidfVectorizer``. Lower it
        for small corpora, where the default can prune every term.
    ngram_range : tuple of (int, int), default=(1, 2)
        N-gram range passed to ``TfidfVectorizer``.

    Attributes
    ----------
    vectorizers : dict
        Maps column name to its fitted ``TfidfVectorizer``. Rebuilt from
        scratch on every call to ``fit``.
    """

    def __init__(self, text_columns, max_features=1000, min_df=5, ngram_range=(1, 2)):
        self.text_columns = text_columns
        self.max_features = max_features
        self.min_df = min_df
        self.ngram_range = ngram_range
        self.vectorizers = {}

    def fit(self, X, y=None):
        """Fit one TF-IDF vectorizer per text column present in ``X``.

        Missing values are treated as empty documents.

        Returns
        -------
        self
        """
        # Build into a fresh dict so a refit never keeps vectorizers for
        # columns that are not part of the current fit.
        fitted = {}
        for col in self.text_columns:
            if col not in X.columns:
                continue
            vectorizer = TfidfVectorizer(
                max_features=self.max_features,
                stop_words='english',
                min_df=self.min_df,
                ngram_range=self.ngram_range,
            )
            vectorizer.fit(X[col].fillna(''))
            fitted[col] = vectorizer

        self.vectorizers = fitted
        return self

    def transform(self, X):
        """Return a (sparse-backed) DataFrame of TF-IDF features indexed like ``X``.

        Every fitted column contributes its features. If a fitted column is
        missing from ``X`` its rows are treated as empty documents, which
        gives all-zero features and keeps the output schema identical to
        the one seen during training.
        """
        frames = [pd.DataFrame(index=X.index)]

        for col, vectorizer in self.vectorizers.items():
            if col in X.columns:
                documents = X[col].fillna('')
            else:
                documents = pd.Series('', index=X.index)

            tfidf = vectorizer.transform(documents)
            names = [f'{col}_{term}' for term in vectorizer.get_feature_names_out()]
            frames.append(
                pd.DataFrame.sparse.from_spmatrix(tfidf, index=X.index, columns=names)
            )

        # One concat at the end instead of growing the frame per column.
        return pd.concat(frames, axis=1)



# Custom transformer for components features
class ComponentsFeatureExtractor(BaseEstimator, TransformerMixin):
    """Multi-hot encode the ``components`` column.

    ``components`` is expected to hold a string such as ``"A, B"`` or the
    string form of a list (``"['A', 'B']"``). Each distinct component seen
    during ``fit`` becomes a 0/1 column ``component_<name>``; a
    ``component_count`` column holds the number of components in the row.

    Attributes
    ----------
    component_encoder : dict or None
        Maps component name to its column position. Built in sorted order
        so the feature layout is the same on every run.
    """

    def __init__(self):
        self.component_encoder = None

    @staticmethod
    def _parse_components(value):
        """Split one raw ``components`` cell into a list of component names.

        Non-string values (including NaN/None) yield an empty list. Square
        brackets and single quotes are removed, the text is split on commas,
        and blank tokens (e.g. from ``"[]"`` or a trailing comma) are dropped.
        """
        if not isinstance(value, str):
            return []
        cleaned = value.replace('[', '').replace(']', '').replace("'", "")
        return [token for token in (part.strip() for part in cleaned.split(',')) if token]

    def fit(self, X, y=None):
        """Learn the component vocabulary from ``X['components']``.

        If the column is absent the encoder is left untouched.

        Returns
        -------
        self
        """
        if 'components' in X.columns:
            vocabulary = set()
            for value in X['components']:
                vocabulary.update(self._parse_components(value))
            # Sorted: set iteration order is not stable across interpreter runs.
            self.component_encoder = {
                comp: position for position, comp in enumerate(sorted(vocabulary))
            }
        return self

    def transform(self, X):
        """Return indicator columns plus ``component_count``, indexed like ``X``.

        Components not seen during ``fit`` get no indicator column but are
        still counted. If ``X`` has no ``components`` column an empty frame
        (index only) is returned.
        """
        result = pd.DataFrame(index=X.index)
        if 'components' not in X.columns:
            return result

        parsed_rows = [self._parse_components(value) for value in X['components']]

        if self.component_encoder:
            names = list(self.component_encoder)
            position_of = {comp: j for j, comp in enumerate(names)}
            # Fill by row position so duplicate index labels cannot leak a
            # flag into other rows, and build the frame in one allocation.
            flags = np.zeros((len(parsed_rows), len(names)), dtype=np.int64)
            for row, tokens in enumerate(parsed_rows):
                for token in tokens:
                    j = position_of.get(token)
                    if j is not None:
                        flags[row, j] = 1
            result = pd.DataFrame(
                flags,
                index=X.index,
                columns=[f'component_{comp}' for comp in names],
            )

        result['component_count'] = [len(tokens) for tokens in parsed_rows]
        return result

def _column_transformer_feature_names(preprocessor):
    """List the output column names of the fitted complaint ColumnTransformer, in order."""
    feature_names = []
    for name, trans, cols in preprocessor.transformers_:
        if name == 'remainder' or (isinstance(trans, str) and trans == 'drop'):
            continue
        if name == 'cat':
            # One output column per category learned by the one-hot encoder.
            for i, col in enumerate(cols):
                try:
                    cat_values = trans.named_steps['onehot'].categories_[i]
                except (AttributeError, IndexError):
                    # Encoder not fitted for this column (e.g. no columns selected).
                    continue
                feature_names.extend(f"{col}_{val}" for val in cat_values)
        else:
            # Numeric and binary columns keep their original names.
            feature_names.extend(cols)
    return feature_names


def preprocess_and_train_complaint_model(df, target_column='recall_status', test_size=0.4,
                                         random_state=42, selection_metric='accuracy'):
    """
    Preprocess and train a model using only complaint features.

    The data is split into train/test sets, all preprocessors are fitted on
    the training part only, and three classifiers (RandomForest,
    HistGradientBoosting, XGBoost) are cross-validated, fitted and scored.

    Parameters:
    -----------
    df : DataFrame containing the data
    target_column : Name of the target column
    test_size : Fraction of rows held out for testing (default 0.4)
    random_state : Seed used for the split, the CV folds and the models
    selection_metric : 'accuracy' (held-out test accuracy, the default) or
        'cv_accuracy' (mean cross-validation accuracy on the training set).
        Choosing by 'cv_accuracy' keeps the test set out of model selection,
        so the reported test accuracy stays an unbiased estimate.

    Returns:
    --------
    (best_model, preprocessors, results)
        best_model : the fitted model with the highest ``selection_metric``
        preprocessors : dict of fitted preprocessors with keys
            'main_preprocessor', 'date_extractor', 'text_extractor',
            'component_extractor'
        results : dict keyed by model name with 'model', 'accuracy',
            'cv_accuracy', 'cv_std', 'report' and 'confusion_matrix'

    Raises:
    -------
    ValueError if ``selection_metric`` is not one of the supported values.
    KeyError if ``target_column`` is not a column of ``df``.
    """
    if selection_metric not in ('accuracy', 'cv_accuracy'):
        raise ValueError(
            "selection_metric must be 'accuracy' or 'cv_accuracy', "
            f"got {selection_metric!r}"
        )

    # Define complaint features to keep
    complaint_features = [
        'odiNumber', 'crash', 'fire', 'numberOfInjuries', 'numberOfDeaths',
        'dateOfIncident', 'dateComplaintFiled', 'incident_filing_lag',
        'components', 'summary_complaint', 'products', 'Model', 'ModelYear'
    ]

    # Keep only complaint features and target. The explicit copy makes the
    # column assignments below operate on an independent frame.
    features_to_use = [col for col in complaint_features if col in df.columns]
    features_to_use.append(target_column)
    data = df[features_to_use].copy()

    # Convert date columns to datetime
    date_columns = ['dateOfIncident', 'dateComplaintFiled']
    for col in date_columns:
        if col in data.columns:
            data[col] = pd.to_datetime(data[col], errors='coerce')

    # Fill missing incident_filing_lag from the two dates, where possible.
    # All three columns must exist; otherwise this step is skipped.
    lag_inputs = {'incident_filing_lag', 'dateOfIncident', 'dateComplaintFiled'}
    if lag_inputs.issubset(data.columns):
        mask = (
            data['incident_filing_lag'].isna()
            & data['dateOfIncident'].notna()
            & data['dateComplaintFiled'].notna()
        )
        data.loc[mask, 'incident_filing_lag'] = (
            data.loc[mask, 'dateComplaintFiled'] - data.loc[mask, 'dateOfIncident']
        ).dt.days

    # Create additional features
    current_year = datetime.now().year
    if 'ModelYear' in data.columns:
        data['vehicle_age'] = current_year - data['ModelYear']

    # Create severity score
    # NOTE(review): severity_score is not listed in any feature group below,
    # so it is dropped before training. It is left out on purpose here:
    # adding it would make the fitted preprocessor require the column from
    # every caller at prediction time.
    if 'numberOfInjuries' in data.columns and 'numberOfDeaths' in data.columns:
        data['severity_score'] = data['numberOfInjuries'] + data['numberOfDeaths'] * 5

    # Define feature groups, restricted to columns that exist in the data
    numeric_features = [
        col for col in (
            'numberOfInjuries', 'numberOfDeaths', 'incident_filing_lag',
            'vehicle_age', 'ModelYear'
        ) if col in data.columns
    ]
    categorical_features = [col for col in ('Model',) if col in data.columns]
    text_features = [col for col in ('summary_complaint', 'products') if col in data.columns]
    date_features = [col for col in ('dateOfIncident', 'dateComplaintFiled') if col in data.columns]
    binary_features = [col for col in ('crash', 'fire') if col in data.columns]

    # Separate features and target
    X = data.drop(target_column, axis=1)
    y = data[target_column]

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Define preprocessing for numeric features
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Define preprocessing for categorical features
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Create column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
            ('bin', 'passthrough', binary_features)
        ],
        remainder='drop'
    )

    # Create specialized feature extractors
    date_extractor = DateFeatureExtractor(date_columns=date_features)
    text_extractor = TextFeatureExtractor(text_columns=text_features, max_features=300)
    component_extractor = ComponentsFeatureExtractor()

    # Fit every preprocessor on the training split only, then apply to both
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    X_train_dates = date_extractor.fit_transform(X_train)
    X_test_dates = date_extractor.transform(X_test)

    X_train_text = text_extractor.fit_transform(X_train)
    X_test_text = text_extractor.transform(X_test)

    X_train_components = component_extractor.fit_transform(X_train)
    X_test_components = component_extractor.transform(X_test)

    # Convert preprocessed data to DataFrames
    feature_names = _column_transformer_feature_names(preprocessor)
    X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed, index=X_train.index, columns=feature_names)
    X_test_preprocessed_df = pd.DataFrame(X_test_preprocessed, index=X_test.index, columns=feature_names)

    # Combine all features
    X_train_final = pd.concat([
        X_train_preprocessed_df, X_train_dates, X_train_text,
        X_train_components
    ], axis=1)

    X_test_final = pd.concat([
        X_test_preprocessed_df, X_test_dates, X_test_text,
        X_test_components
    ], axis=1)

    # Define models to try. XGBoost's obsolete `use_label_encoder` argument
    # is not passed: current releases ignore it and warn on every fit.
    models = {
        'RandomForest': RandomForestClassifier(n_estimators=100, random_state=random_state, n_jobs=-1),
        'HistGradientBoosting': HistGradientBoostingClassifier(random_state=random_state),
        'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=random_state, eval_metric='logloss')
    }

    # Train and evaluate models
    results = {}
    best_model = None
    best_score = None

    for name, model in models.items():
        print(f"Training {name}...")

        # Cross-validation
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
        cv_scores = cross_val_score(model, X_train_final, y_train, cv=cv, scoring='accuracy')
        print(f"Cross-validation scores: {cv_scores}")
        print(f"Mean CV accuracy: {cv_scores.mean():.4f}")

        # Train on the full training set
        model.fit(X_train_final, y_train)

        # Make predictions
        y_pred = model.predict(X_test_final)

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)

        results[name] = {
            'model': model,
            'accuracy': accuracy,
            'cv_accuracy': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'report': classification_report(y_test, y_pred),
            'confusion_matrix': confusion_matrix(y_test, y_pred)
        }

        print(f"{name} Test Accuracy: {accuracy:.4f}")

        # The first model is always a candidate, so a model is returned
        # even if every score is 0.
        score = results[name][selection_metric]
        if best_score is None or score > best_score:
            best_score = score
            best_model = model

    # Feature importance for the best model (if available)
    if hasattr(best_model, 'feature_importances_'):
        feature_importances = pd.DataFrame({
            'Feature': X_train_final.columns,
            'Importance': best_model.feature_importances_
        }).sort_values('Importance', ascending=False)

        print("\nTop 20 Important Features:")
        print(feature_importances.head(20))

    # Fitted preprocessing steps, needed to transform new data at prediction time
    preprocessors = {
        'main_preprocessor': preprocessor,
        'date_extractor': date_extractor,
        'text_extractor': text_extractor,
        'component_extractor': component_extractor
    }

    return best_model, preprocessors, results

def predict_with_complaint_model(model, preprocessors, new_data):
    """
    Make predictions using the trained complaint model.

    The derived inputs that training computes are recomputed here on a
    working copy, so raw complaint rows can be passed directly:
    ``vehicle_age`` is derived from ``ModelYear`` when the column is absent,
    and missing ``incident_filing_lag`` values are filled from the two date
    columns when both dates parse. ``new_data`` itself is never modified.

    Parameters:
    -----------
    model : trained model exposing ``predict`` and ``predict_proba``
    preprocessors : dict of fitted preprocessors with keys
        'main_preprocessor', 'date_extractor', 'text_extractor',
        'component_extractor'
    new_data : DataFrame with complaint data

    Returns:
    --------
    DataFrame: a copy of ``new_data`` with two added columns,
    'predicted_recall' (the predicted class) and 'recall_probability'
    (column 1 of ``predict_proba``).
    """
    # Extract preprocessors
    preprocessor = preprocessors['main_preprocessor']
    date_extractor = preprocessors['date_extractor']
    text_extractor = preprocessors['text_extractor']
    component_extractor = preprocessors['component_extractor']

    # Work on a copy so the derived columns never leak into the caller's frame.
    features = new_data.copy()

    # NOTE(review): like training, this uses the current calendar year, so
    # vehicle_age shifts if prediction runs in a later year than training.
    if 'vehicle_age' not in features.columns and 'ModelYear' in features.columns:
        features['vehicle_age'] = datetime.now().year - features['ModelYear']

    lag_inputs = {'incident_filing_lag', 'dateOfIncident', 'dateComplaintFiled'}
    if lag_inputs.issubset(features.columns):
        incident = pd.to_datetime(features['dateOfIncident'], errors='coerce')
        filed = pd.to_datetime(features['dateComplaintFiled'], errors='coerce')
        mask = features['incident_filing_lag'].isna() & incident.notna() & filed.notna()
        features.loc[mask, 'incident_filing_lag'] = (filed[mask] - incident[mask]).dt.days

    # Apply preprocessing
    X_preprocessed = preprocessor.transform(features)

    # Rebuild the column names of the preprocessor output
    feature_names = []
    for name, trans, cols in preprocessor.transformers_:
        if name == 'remainder' or (isinstance(trans, str) and trans == 'drop'):
            continue
        if name == 'cat':
            # One output column per category learned by the one-hot encoder.
            for i, col in enumerate(cols):
                try:
                    cat_values = trans.named_steps['onehot'].categories_[i]
                except (AttributeError, IndexError):
                    # Encoder not fitted for this column (e.g. no columns selected).
                    continue
                feature_names.extend(f"{col}_{val}" for val in cat_values)
        else:
            # Numeric and binary columns keep their original names.
            feature_names.extend(cols)

    # Convert to DataFrame
    X_preprocessed_df = pd.DataFrame(X_preprocessed, index=features.index, columns=feature_names)

    # Apply other extractors
    X_dates = date_extractor.transform(features)
    X_text = text_extractor.transform(features)
    X_components = component_extractor.transform(features)

    # Combine all features
    X_final = pd.concat([X_preprocessed_df, X_dates, X_text, X_components], axis=1)

    # Make predictions
    predictions = model.predict(X_final)
    prediction_proba = model.predict_proba(X_final)[:, 1]

    # Add predictions to a copy of the caller's original data
    result = new_data.copy()
    result['predicted_recall'] = predictions
    result['recall_probability'] = prediction_proba

    return result

import warnings

# Silence every warning for the rest of the process (libraries included).
warnings.filterwarnings("ignore")

# Example usage: train on the merged CSV, report each model, persist the
# winner together with its preprocessors, then score a five-row sample.
if __name__ == "__main__":
    # Load data
    df = pd.read_csv('merged_testing.csv')

    # Only use complaint features
    print("Using only complaint features to predict recall status...")

    # Train model
    best_model, preprocessors, results = preprocess_and_train_complaint_model(df)

    # Print one summary per model; each entry ends up on its own line
    for model_name, metrics in results.items():
        summary_lines = [
            f"\n{model_name} Results:",
            f"Test Accuracy: {metrics['accuracy']:.4f}",
            f"Cross-validation Accuracy: {metrics['cv_accuracy']:.4f} (±{metrics['cv_std']:.4f})",
            "Classification Report:",
            str(metrics['report']),
            "Confusion Matrix:",
            str(metrics['confusion_matrix']),
        ]
        print("\n".join(summary_lines))

    # Save model and preprocessors for future use
    import joblib
    for artifact, path in (
        (best_model, 'complaint_recall_model.pkl'),
        (preprocessors, 'complaint_preprocessors.pkl'),
    ):
        joblib.dump(artifact, path)

    print("\nModel saved successfully!")

    # Example of making predictions with new data
    print("\nExample of prediction with new complaint data:")

    # A small random sample of the original data stands in for new complaints
    new_complaints = df.sample(5)

    # The fitted preprocessor expects vehicle_age, so derive it as training did
    current_year = datetime.now().year
    new_complaints = new_complaints.assign(
        vehicle_age=current_year - new_complaints['ModelYear']
    )

    # Set the target aside (when present) so only features are passed on
    if 'recall_status' in new_complaints.columns:
        actual_status = new_complaints['recall_status']
        new_complaints = new_complaints.drop(columns='recall_status')

    # Make predictions
    predictions = predict_with_complaint_model(best_model, preprocessors, new_complaints)

    # Show predictions
    print(predictions.loc[:, ['predicted_recall', 'recall_probability']])

Using only complaint features to predict recall status...
Training RandomForest...




Cross-validation scores: [0.9745935  0.97761953 0.97049847 0.97660224 0.982706  ]
Mean CV accuracy: 0.9764




RandomForest Test Accuracy: 0.9715
Training HistGradientBoosting...




Cross-validation scores: [0.97560976 0.97456765 0.97049847 0.97761953 0.98067141]
Mean CV accuracy: 0.9758




HistGradientBoosting Test Accuracy: 0.9707
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-validation scores: [0.97357724 0.97558494 0.97151577 0.97761953 0.97863683]
Mean CV accuracy: 0.9754


Parameters: { "use_label_encoder" } are not used.



XGBoost Test Accuracy: 0.9748

Top 20 Important Features:
                            Feature  Importance
486      products_ford productmodel    0.295018
254         summary_complaint_model    0.046571
427                 products_450 sd    0.037845
339          summary_complaint_time    0.032946
428                    products_550    0.025414
168          summary_complaint_door    0.023427
425                    products_450    0.019502
169    summary_complaint_door latch    0.017787
250         summary_complaint_metal    0.017620
244       summary_complaint_manager    0.016575
76             summary_complaint_04    0.013302
235         summary_complaint_local    0.012531
279     summary_complaint_passenger    0.012241
303       summary_complaint_replace    0.012029
57            Model_transit connect    0.011791
183  summary_complaint_experiencing    0.011075
403                   products_2015    0.009180
515                products_mustang    0.008902
286     summary_complaint_pote



      predicted_recall  recall_probability
4135                 1            0.999770
1443                 1            0.993149
236                  1            0.977395
1637                 0            0.000913
2589                 1            0.956420
