In [6]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from xgboost import plot_importance

In [7]:
def clean_preprocess_data(df):
    """
    Impute, scale, and enrich the vehicle sensor frame.

    The frame passed in is modified in place and that same object is returned.

    Steps, in order:
      1. Fill missing values: column mean for engine_temperature,
         tire_pressure and vehicle_speed; column median for engine_rpm.
      2. Min-max scale engine_temperature, tire_pressure, engine_rpm,
         vehicle_speed and mileage with a MinMaxScaler fitted on this frame.
      3. Parse timestamp to datetime and derive hour, day_of_week and
         is_weekend (1 when day_of_week >= 5, otherwise 0).
      4. Add engine_temp_speed, the product of the (already scaled)
         engine_temperature and vehicle_speed columns.

    Parameters:
    df (DataFrame): Must contain the engine_temperature, tire_pressure,
        engine_rpm, vehicle_speed, mileage and timestamp columns.

    Returns:
    DataFrame: The same df object with the changes above applied.
    """
    # Imputation statistic per column; engine_rpm is the only one using the median.
    fill_statistic = {
        'engine_temperature': 'mean',
        'tire_pressure': 'mean',
        'engine_rpm': 'median',
        'vehicle_speed': 'mean',
    }
    for column, statistic in fill_statistic.items():
        readings = df[column]
        df[column] = readings.fillna(getattr(readings, statistic)())

    # NOTE(review): mileage is scaled here but is never imputed above, so any
    # missing mileage values are handed to the scaler as-is.
    # NOTE(review): the scaler is fitted on whatever frame is passed in; if that
    # is the full dataset before a train/test split, confirm this is intended.
    scaled_columns = ['engine_temperature', 'tire_pressure', 'engine_rpm', 'vehicle_speed', 'mileage']
    df[scaled_columns] = MinMaxScaler().fit_transform(df[scaled_columns])

    # Calendar features derived from the timestamp.
    # NOTE(review): the original author's comment marked this step as
    # "not required as it is redundant data" - confirm whether these
    # columns are actually used downstream.
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    stamps = df['timestamp']
    df['hour'] = stamps.dt.hour
    df['day_of_week'] = stamps.dt.dayofweek
    df['is_weekend'] = df['day_of_week'].apply(lambda day: int(day >= 5))

    # Interaction term built from the scaled temperature and speed.
    df['engine_temp_speed'] = df['engine_temperature'] * df['vehicle_speed']

    print("Data transformation successful!")
    return df


In [8]:
def generate_maintenance_analysis(data):
    """
    Summarise the dataset per maintenance_required value.

    Parameters:
    data (DataFrame): Must contain the maintenance_required, mileage,
        engine_temperature and tire_pressure columns.

    Returns:
    DataFrame: One row per distinct maintenance_required value with the
        columns maintenance_required, count, average_mileage,
        average_engine_temperature, average_tire_pressure and
        'proportion (%)' (that group's share of the summed counts, times 100).
    """
    # Output column -> (source column, aggregation), in output column order.
    named_aggregations = {
        'count': ('maintenance_required', 'size'),
        'average_mileage': ('mileage', 'mean'),
        'average_engine_temperature': ('engine_temperature', 'mean'),
        'average_tire_pressure': ('tire_pressure', 'mean'),
    }
    grouped = data.groupby('maintenance_required')
    summary = grouped.agg(**named_aggregations).reset_index()

    # Express each group's row count as a percentage of all counted rows.
    group_sizes = summary['count']
    summary['proportion (%)'] = (group_sizes / group_sizes.sum()) * 100

    return summary

In [9]:
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    """
    Fit Logistic Regression, Random Forest and XGBoost and score each on the test split.

    Every estimator is created with random_state=42 (the two tree ensembles
    with n_estimators=100), fitted on the training split, and scored on the
    test split with accuracy, F1 and ROC-AUC. ROC-AUC is computed from the
    predict_proba column at index 1.

    Parameters:
    X_train, y_train: Features and labels used for fitting.
    X_test, y_test: Features and labels used for scoring.

    Returns:
    DataFrame: One row per model with the columns "Model", "Accuracy",
        "F1 Score" and "ROC-AUC", in the order the models are listed below.
    """
    # All three estimators are instantiated up front, then fitted one by one.
    candidates = [
        ("Logistic Regression", LogisticRegression(random_state=42)),
        ("Random Forest", RandomForestClassifier(random_state=42, n_estimators=100)),
        ("XGBoost", XGBClassifier(random_state=42, n_estimators=100, use_label_encoder=False)),
    ]

    def _fit_and_score(label, estimator):
        # Fit on the training split, then score hard labels and class-1 probabilities.
        estimator.fit(X_train, y_train)
        predicted_labels = estimator.predict(X_test)
        class_one_scores = estimator.predict_proba(X_test)[:, 1]
        return {
            "Model": label,
            "Accuracy": accuracy_score(y_test, predicted_labels),
            "F1 Score": f1_score(y_test, predicted_labels),
            "ROC-AUC": roc_auc_score(y_test, class_one_scores),
        }

    return pd.DataFrame([_fit_and_score(label, estimator) for label, estimator in candidates])


def plot_results(results):
    """
    Show one bar chart per metric, side by side, with one bar per model.

    Parameters:
    results (DataFrame): Must contain a 'Model' column plus the 'Accuracy',
        'F1 Score' and 'ROC-AUC' columns.

    Returns:
    None. The figure is rendered with plt.show().
    """
    # Each metric gets its own panel, left to right in this order.
    metric_names = ('Accuracy', 'F1 Score', 'ROC-AUC')
    figure, panels = plt.subplots(1, len(metric_names), figsize=(18, 6))

    for panel, metric in zip(panels, metric_names):
        panel.bar(results['Model'], results[metric])
        panel.set_title(f"{metric} Comparison")
        panel.set_ylabel(metric)

    plt.tight_layout()
    plt.show()

In [10]:
def train_xgboost_model(X_test, X_train, y_test, y_train):
    """
    Train an XGBoost classifier, plot its feature importance, and return test-set predictions.

    WARNING: the parameters are ordered test-first (X_test, X_train, y_test,
    y_train). This differs from the (X_train, X_test, y_train, y_test) order
    returned by train_test_split, so pass the arguments by keyword to avoid
    silently training on the test split.

    Parameters:
    X_test (DataFrame): Features to predict on; copied, never modified.
    X_train: Features used to fit the model.
    y_test: Labels for X_test; must expose `.values` (e.g. a pandas Series).
        They are attached positionally, so the row order must match X_test.
    y_train: Labels used to fit the model.

    Returns:
    DataFrame: A copy of X_test with three extra columns: 'actual' (y_test),
        'predicted' (hard labels) and 'predicted_probability' (the
        predict_proba column at index 1).
    """
    # Train XGBoost model
    xgb_model = XGBClassifier(random_state=42)
    xgb_model.fit(X_train, y_train)

    # Plot feature importance.
    # plot_importance() creates its own figure when no Axes is supplied, which
    # previously left the 12x8 figure blank and drew the chart at the default
    # size. Creating the Axes here and passing it in keeps everything on one
    # correctly sized figure.
    _, ax = plt.subplots(figsize=(12, 8))
    plot_importance(xgb_model, ax=ax, importance_type='weight')  # Use 'gain' or 'cover' if preferred
    ax.set_title('XGBoost Feature Importance')
    plt.show()

    # Make predictions
    y_pred = xgb_model.predict(X_test)
    y_pred_prob = xgb_model.predict_proba(X_test)[:, 1]

    # Combine data for saving; work on a copy so the caller's X_test is untouched.
    results_df = X_test.copy()
    results_df['actual'] = y_test.values
    results_df['predicted'] = y_pred
    results_df['predicted_probability'] = y_pred_prob
    return results_df