In [None]:
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re

In [2]:
df = pd.read_csv('merged_testing.csv')

In [3]:
df.columns

Index(['odiNumber', 'crash', 'fire', 'numberOfInjuries', 'numberOfDeaths',
       'dateOfIncident', 'dateComplaintFiled', 'incident_filing_lag', 'vin',
       'components', 'summary_complaint', 'products', 'Model', 'ModelYear',
       'MODEL', 'YEAR', 'component_name', 'ODATE', 'CDATE',
       'days_taken_in_investigation', 'subject', 'summary_investigation',
       'similarity_score', 'recall_status'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6145 entries, 0 to 6144
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   odiNumber                    6145 non-null   int64  
 1   crash                        6145 non-null   bool   
 2   fire                         6145 non-null   bool   
 3   numberOfInjuries             6145 non-null   int64  
 4   numberOfDeaths               6145 non-null   int64  
 5   dateOfIncident               5971 non-null   object 
 6   dateComplaintFiled           6145 non-null   object 
 7   incident_filing_lag          5971 non-null   float64
 8   vin                          5450 non-null   object 
 9   components                   6145 non-null   object 
 10  summary_complaint            6145 non-null   object 
 11  products                     6145 non-null   object 
 12  Model                        6145 non-null   object 
 13  ModelYear         

In [None]:
# Data exploration and preprocessing
print(f"Dataset shape: {df.shape}")
print(f"Number of recalls: {df['recall_status'].sum()} out of {len(df)} ({df['recall_status'].mean()*100:.2f}%)")

# Handle dates
date_columns = ['dateOfIncident', 'dateComplaintFiled', 'ODATE', 'CDATE', 'ReportReceivedDate']

# Convert date columns to datetime format
for col in date_columns:
    if col in df.columns:
        # Handle various date formats and missing values
        df[col] = pd.to_datetime(df[col], errors='coerce')

In [None]:
# Create features from dates where available
if 'dateOfIncident' in df.columns and not df['dateOfIncident'].isna().all():
    df['month_of_incident'] = df['dateOfIncident'].dt.month
    df['year_of_incident'] = df['dateOfIncident'].dt.year
    df['season_of_incident'] = df['dateOfIncident'].dt.month.apply(lambda x: 
                                                                 1 if pd.notna(x) and x in [12, 1, 2] else  # Winter
                                                                 2 if pd.notna(x) and x in [3, 4, 5] else   # Spring
                                                                 3 if pd.notna(x) and x in [6, 7, 8] else   # Summer
                                                                 4 if pd.notna(x) and x in [9, 10, 11] else # Fall
                                                                 np.nan)  # Handle NaN

if 'dateComplaintFiled' in df.columns and not df['dateComplaintFiled'].isna().all():
    df['month_of_complaint'] = df['dateComplaintFiled'].dt.month
    df['year_of_complaint'] = df['dateComplaintFiled'].dt.year

# Extract useful features from text columns using TF-IDF
# First, let's clean the text data
text_columns = ['summary_complaint', 'summary_investigation', 'subject', 'components']

In [None]:
# Function to clean text
def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', ' ', text)  
    text = re.sub(r'\s+', ' ', text)     
    return text.strip()

for col in text_columns:
    if col in df.columns:
        df[f'{col}_cleaned'] = df[col].apply(clean_text)

In [None]:
#adding additional features
# Createing severity score based on injuries and deaths
df['severity_score'] = df['numberOfInjuries'] + df['numberOfDeaths'] * 5  # Deaths weighted more heavily

#calculating vehicle age at incident
current_year = datetime.now().year
if 'year_of_incident' in df.columns and 'ModelYear' in df.columns:
    df['vehicle_age_at_incident'] = df['year_of_incident'] - df['ModelYear']
    # Handle negative values (data errors)
    df.loc[df['vehicle_age_at_incident'] < 0, 'vehicle_age_at_incident'] = np.nan

# Create a feature for reliability of matching between complaint and investigation
# Lower similarity scores might indicate incorrect grouping
df['matching_reliability'] = df['similarity_score'].apply(
    lambda x: 0 if x <= 0.3 else 
              1 if x <= 0.6 else 
              2 if x <= 0.8 else 
              3
)

# Create a lag feature for how quickly a complaint was filed after incident
df['quick_complaint_filing'] = (df['incident_filing_lag'] <= 7).astype(int)  # 1 week threshold

In [None]:
# Define feature sets
numerical_features = ['numberOfInjuries', 'numberOfDeaths', 'similarity_score', 
                     'severity_score', 'ModelYear', 'YEAR']

# Add conditional features only if they exist
if 'incident_filing_lag' in df.columns and not df['incident_filing_lag'].isna().all():
    numerical_features.append('incident_filing_lag')
if 'vehicle_age_at_incident' in df.columns and not df['vehicle_age_at_incident'].isna().all():
    numerical_features.append('vehicle_age_at_incident')
if 'days_taken_in_investigation' in df.columns and not df['days_taken_in_investigation'].isna().all():
    numerical_features.append('days_taken_in_investigation')

categorical_features = ['crash', 'fire', 'component_name', 'matching_reliability', 'quick_complaint_filing']

# Add conditional categorical features only if they exist
if 'Model' in df.columns:
    categorical_features.append('Model')
if 'month_of_incident' in df.columns and not df['month_of_incident'].isna().all():
    categorical_features.append('month_of_incident')
if 'season_of_incident' in df.columns and not df['season_of_incident'].isna().all():
    categorical_features.append('season_of_incident')

# Selected text features to include
text_features = []
for col in ['summary_complaint_cleaned', 'components_cleaned']:
    if col in df.columns:
        text_features.append(col)

# Handle missing numerical values
for col in numerical_features:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())

In [None]:
# Focus on essential columns
essential_columns = ['recall_status']
for col in ['summary_complaint_cleaned', 'numberOfInjuries', 'numberOfDeaths', 'crash', 'fire', 'component_name']:
    if col in df.columns:
        essential_columns.append(col)

df_clean = df.dropna(subset=essential_columns, thresh=len(essential_columns)-1)

print(f"Shape after cleaning: {df_clean.shape}")

# Handle class imbalance
recall_count = df_clean['recall_status'].sum()
non_recall_count = len(df_clean) - recall_count
print(f"Class distribution: Recalls={recall_count}, Non-recalls={non_recall_count}")

# If severe imbalance, use class weights
class_weights = None
if recall_count / len(df_clean) < 0.2:  # If recalls are less than 20%
    class_weights = {0: 1, 1: non_recall_count / recall_count}
    print(f"Using class weights due to imbalance: {class_weights}")

In [None]:
# Feature preprocessing setup
preprocessors = []

# Only add transformers for features that exist in the dataframe
if numerical_features:
    numerical_features = [f for f in numerical_features if f in df_clean.columns]
    if numerical_features:
        preprocessors.append(
            ('num', Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
            ]), numerical_features)
        )

if categorical_features:
    categorical_features = [f for f in categorical_features if f in df_clean.columns]
    if categorical_features:
        preprocessors.append(
            ('cat', Pipeline([
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore'))
            ]), categorical_features)
        )

In [None]:
# For text features, we need a different approach to avoid the numpy array error
# We'll create TF-IDF features separately for each text column
for i, text_col in enumerate(text_features):
    if text_col in df_clean.columns:
        # Create a custom function to extract just this column
        def get_text_column(X, column_name=text_col):
            if isinstance(X, pd.DataFrame):
                return X[column_name].fillna('').values
            return X  # If it's already processed

        preprocessors.append(
            (f'text_{i}', Pipeline([
                ('selector', FunctionTransformer(get_text_column, validate=False)),
                ('tfidf', TfidfVectorizer(max_features=500, ngram_range=(1, 2)))
            ]), [text_col])  # We pass column name but it's ignored due to the custom extractor
        )

# Create the column transformer
preprocessor = ColumnTransformer(
    transformers=preprocessors,
    remainder='drop'  # This drops columns not specified in the transformers
)

In [None]:
# Split the data
X = df_clean.drop('recall_status', axis=1)
y = df_clean['recall_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42, stratify=y)
# Build the model pipeline
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(class_weight=class_weights, 
                                         n_estimators=100, 
                                         random_state=42))
])

In [None]:
# Train the model
print("Training the model...")
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]

print("\nModel Evaluation:")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print(f"\nROC AUC Score: {roc_auc_score(y_test, y_prob):.4f}")

# Function to get feature importance (simplified)
def get_feature_importance(clf, feature_names=None):
    try:
        rf_classifier = clf.named_steps['classifier']
        importances = rf_classifier.feature_importances_
        
        # If feature names is None, assign indices
        if feature_names is None:
            feature_names = [f"feature_{i}" for i in range(len(importances))]
        
        # Ensure feature_names and importances have same length
        feature_names = feature_names[:len(importances)]
        
        # Sort feature importances
        indices = np.argsort(importances)[::-1]
        
        # Limit to top features for readability
        top_n = min(30, len(feature_names))
        
        # Return sorted importance info
        return [(feature_names[i], importances[i]) for i in indices[:top_n]]
    except Exception as e:
        print(f"Error getting feature importance: {e}")
        return []

try:
    # Get feature importance (simplified version)
    feature_names = clf.named_steps['preprocessor'].get_feature_names_out()
    importance = get_feature_importance(clf, feature_names)
    
    # Print top 10 important features
    print("\nTop 10 Important Features:")
    for i, (feature, importance_value) in enumerate(importance[:10]):
        print(f"{i+1}. {feature}: {importance_value:.4f}")
    
    # Plot feature importance
    if importance:
        plt.figure(figsize=(12, 8))
        features, values = zip(*importance[:20])
        sns.barplot(x=list(values), y=list(features))
        plt.title('Top 20 Feature Importances for Recall Prediction')
        plt.tight_layout()
        plt.savefig('feature_importance.png')
        plt.close()
except Exception as e:
    print(f"Could not compute feature importance: {e}")

# Try different models to see if we can improve performance
print("\nTrying different models for comparison...")

# Define the models to try
models = {
    'RandomForest': RandomForestClassifier(class_weight=class_weights, n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Evaluate each model
for name, model in models.items():
    # Create a pipeline with the model
    model_pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    # Train the model
    model_pipe.fit(X_train, y_train)
    
    # Evaluate the model
    y_pred = model_pipe.predict(X_test)
    y_prob = model_pipe.predict_proba(X_test)[:, 1]
    
    print(f"\n{name} Performance:")
    print(f"Accuracy: {(y_pred == y_test).mean():.4f}")
    print(f"ROC AUC: {roc_auc_score(y_test, y_prob):.4f}")

# Function to predict on new data
def predict_recall_probability(new_data, model=clf):

    # Make a copy to avoid modifying the original
    result_df = new_data.copy()
    
    # Make predictions
    try:
        proba = model.predict_proba(new_data)[:, 1]
        result_df['recall_probability'] = proba
        
        # Adding a risk category
        result_df['risk_level'] = pd.cut(
            result_df['recall_probability'], 
            bins=[0, 0.3, 0.7, 1.0], 
            labels=['Low', 'Medium', 'High']
        )
        
    except Exception as e:
        print(f"Error in prediction: {e}")
        result_df['recall_probability'] = None
        result_df['risk_level'] = None
        
    return result_df

Dataset shape: (6145, 37)
Number of recalls: 4056 out of 6145 (66.00%)
Shape after cleaning: (6145, 37)
Class distribution: Recalls=4056, Non-recalls=2089
Training the model...

Model Evaluation:

Confusion Matrix:
[[ 829    7]
 [  26 1596]]

Classification Report:
              precision    recall  f1-score   support

       False       0.97      0.99      0.98       836
        True       1.00      0.98      0.99      1622

    accuracy                           0.99      2458
   macro avg       0.98      0.99      0.99      2458
weighted avg       0.99      0.99      0.99      2458


ROC AUC Score: 0.9978
Could not compute feature importance: Estimator selector does not provide get_feature_names_out. Did you mean to call pipeline[:-1].get_feature_names_out()?

Trying different models for comparison...

RandomForest Performance:
Accuracy: 0.9866
ROC AUC: 0.9978

GradientBoosting Performance:
Accuracy: 0.9813
ROC AUC: 0.9940

Model training and evaluation complete!
