In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime

# Silence library warnings so they do not clutter the rendered notebook.
# NOTE(review): this hides *every* warning category, including deprecations.
warnings.filterwarnings('ignore')

# The 'seaborn-v0_8-*' style sheets were introduced in matplotlib 3.6; older
# releases ship the same sheet under the legacy 'seaborn-darkgrid' name.
# Use whichever one this installation provides instead of raising OSError.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

print("Libraries imported successfully.")

## 1. Data Loading

In [None]:
DATA_PATH = '/kaggle/input/ride-hailing-trip-classification-dataset/'

# Read the three competition CSV files that sit directly under DATA_PATH.
train, test, sample_submission = (
    pd.read_csv(f"{DATA_PATH}{csv_name}")
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Report (rows, columns) for each frame as a quick sanity check.
for frame_label, frame in (('Train', train),
                           ('Test', test),
                           ('Sample submission', sample_submission)):
    print(f"{frame_label} shape: {frame.shape}")

## 2. Initial Data Inspection

In [None]:
# Preview the first 10 rows of the training data.
train.head(10)

In [None]:
# Column names, dtypes and non-null counts of the training data.
train.info()

In [None]:
# Descriptive statistics of the training data (numeric columns by default).
train.describe()

### 2.1 Missing Values Analysis

In [None]:
def analyze_missing_values(df, name='Dataset'):
    """Report, plot and return the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    name : str, default 'Dataset'
        Label used in the printed heading and the plot title.

    Returns
    -------
    pandas.DataFrame
        One row per column with at least one missing value, sorted by
        ``Missing_Count`` descending, with the columns ``Column``,
        ``Missing_Count`` and ``Missing_Percentage`` (rounded to 2 decimals).
        Empty when nothing is missing.
    """
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df) * 100).round(2)

    report = pd.DataFrame({
        'Column': df.columns,
        'Missing_Count': null_counts,
        'Missing_Percentage': null_share
    })
    report = report.sort_values('Missing_Count', ascending=False)

    # Keep only the columns that actually have gaps.
    missing = report[report['Missing_Count'] > 0]

    if len(missing) == 0:
        print(f"\n{name} - No missing values detected.")
        return missing

    print(f"\n{name} - Missing Values:")
    print(missing.to_string(index=False))

    # Horizontal bars: one per affected column, length = share of missing rows.
    plt.figure(figsize=(12, 6))
    plt.barh(missing['Column'], missing['Missing_Percentage'])
    plt.xlabel('Missing Percentage (%)')
    plt.title(f'{name} - Missing Values Distribution')
    plt.tight_layout()
    plt.show()

    return missing

# Run the missing-value report on both splits and keep the returned tables
# (rows only for columns with at least one missing value).
missing_train = analyze_missing_values(train, 'Training Set')
missing_test = analyze_missing_values(test, 'Test Set')

## 3. Target Variable Analysis

In [None]:
target_col = 'Trip_Label'

if target_col not in train.columns:
    print(f"\nTarget column '{target_col}' not found in training data.")
else:
    # Absolute and relative class frequencies, most frequent class first.
    target_counts = train[target_col].value_counts()
    target_pct = train[target_col].value_counts(normalize=True) * 100

    target_summary = pd.DataFrame({'Count': target_counts,
                                   'Percentage': target_pct.round(2)})

    print("\nTarget Variable Distribution:")
    print(target_summary)

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    bar_ax, pie_ax = axes

    # Left panel: class counts as bars.
    target_counts.plot(kind='bar', ax=bar_ax, color='steelblue')
    bar_ax.set_title('Trip Label Distribution (Count)', fontsize=14, fontweight='bold')
    bar_ax.set_xlabel('Trip Label')
    bar_ax.set_ylabel('Count')
    bar_ax.tick_params(axis='x', rotation=45)

    # Right panel: the same counts as shares of the whole.
    pie_ax.pie(target_counts, labels=target_counts.index, autopct='%1.1f%%', startangle=90)
    pie_ax.set_title('Trip Label Distribution (Percentage)', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.show()

    # Size of the largest class relative to the smallest one.
    imbalance_ratio = target_counts.max() / target_counts.min()
    print(f"\nClass Imbalance Ratio: {imbalance_ratio:.2f}:1")

## 4. Spatio-Temporal Features Analysis

In [None]:
spatial_cols = ['Pickup_Lat', 'Pickup_Long', 'Dropoff_Lat', 'Dropoff_Long', 
                'Pickup_Zone', 'Dropoff_Zone', 'Distance_KM']

print("Spatio-Temporal Features Summary:")
for col in spatial_cols:
    if col not in train.columns:
        continue

    # is_numeric_dtype recognises every numeric width (int32, float32,
    # nullable Int64, ...), not just float64/int64; booleans are kept on the
    # categorical branch because a mean/median of flags is not meaningful here.
    if pd.api.types.is_numeric_dtype(train[col]) and not pd.api.types.is_bool_dtype(train[col]):
        print(f"\n{col}:")
        print(f"  Min: {train[col].min()}")
        print(f"  Max: {train[col].max()}")
        print(f"  Mean: {train[col].mean():.2f}")
        print(f"  Median: {train[col].median():.2f}")
    else:
        # Non-numeric column: only the number of distinct values is reported.
        print(f"\n{col}: {train[col].nunique()} unique values")

### 4.1 Timestamp Analysis

In [None]:
if 'Timestamp' in train.columns:
    # Derive calendar features from the raw timestamp. These columns are
    # written onto `train` itself, so later cells can see them.
    train['Timestamp_parsed'] = pd.to_datetime(train['Timestamp'])
    train['Hour'] = train['Timestamp_parsed'].dt.hour
    train['DayOfWeek'] = train['Timestamp_parsed'].dt.dayofweek
    train['DayName'] = train['Timestamp_parsed'].dt.day_name()
    train['Month'] = train['Timestamp_parsed'].dt.month

    fig, axes = plt.subplots(2, 2, figsize=(16, 10))

    # Top-left: number of trips per hour of day.
    train['Hour'].value_counts().sort_index().plot(kind='bar', ax=axes[0, 0], color='coral')
    axes[0, 0].set_title('Trip Distribution by Hour of Day', fontweight='bold')
    axes[0, 0].set_xlabel('Hour')
    axes[0, 0].set_ylabel('Count')

    # Top-right: number of trips per weekday in calendar order.
    # reindex (rather than label-based selection) keeps this from raising
    # KeyError when the data happens to contain no trips on some weekday;
    # such days are shown with a count of 0.
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
    day_counts = train['DayName'].value_counts().reindex(day_order, fill_value=0)
    day_counts.plot(kind='bar', ax=axes[0, 1], color='skyblue')
    axes[0, 1].set_title('Trip Distribution by Day of Week', fontweight='bold')
    axes[0, 1].set_xlabel('Day')
    axes[0, 1].set_ylabel('Count')
    axes[0, 1].tick_params(axis='x', rotation=45)

    if target_col in train.columns:
        # Bottom-left: class mix (row-normalised to 100%) for every hour.
        hour_label = pd.crosstab(train['Hour'], train[target_col], normalize='index') * 100
        hour_label.plot(kind='bar', stacked=True, ax=axes[1, 0])
        axes[1, 0].set_title('Trip Label Distribution by Hour (%)', fontweight='bold')
        axes[1, 0].set_xlabel('Hour')
        axes[1, 0].set_ylabel('Percentage')
        axes[1, 0].legend(title='Trip Label', bbox_to_anchor=(1.05, 1), loc='upper left')

        # Bottom-right: class mix per weekday (0 = Monday ... 6 = Sunday).
        # Force all seven rows so the seven hard-coded tick labels below always
        # match the number of bars, even if a weekday is missing from the data.
        day_label = pd.crosstab(train['DayOfWeek'], train[target_col], normalize='index') * 100
        day_label = day_label.reindex(range(7), fill_value=0)
        day_label.plot(kind='bar', stacked=True, ax=axes[1, 1])
        axes[1, 1].set_title('Trip Label Distribution by Day of Week (%)', fontweight='bold')
        axes[1, 1].set_xlabel('Day of Week')
        axes[1, 1].set_ylabel('Percentage')
        axes[1, 1].set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], rotation=45)
        axes[1, 1].legend(title='Trip Label', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()

### 4.2 Distance and Location Analysis

In [None]:
if 'Distance_KM' in train.columns:
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    hist_ax, box_ax = axes

    # Left panel: overall distribution of trip distance.
    train['Distance_KM'].hist(bins=50, ax=hist_ax, edgecolor='black')
    hist_ax.set_title('Distance Distribution', fontweight='bold')
    hist_ax.set_xlabel('Distance (KM)')
    hist_ax.set_ylabel('Frequency')

    # Right panel: distance per class, only when the target is available.
    if target_col in train.columns:
        train.boxplot(column='Distance_KM', by=target_col, ax=box_ax)
        box_ax.set_title('Distance Distribution by Trip Label', fontweight='bold')
        box_ax.set_xlabel('Trip Label')
        box_ax.set_ylabel('Distance (KM)')
        # pandas adds its own figure-level title for grouped boxplots; drop it.
        plt.suptitle('')

    plt.tight_layout()
    plt.show()

    print("\nDistance Statistics:")
    distance_stats = (
        ('Mean', train['Distance_KM'].mean()),
        ('Median', train['Distance_KM'].median()),
        ('Min', train['Distance_KM'].min()),
        ('Max', train['Distance_KM'].max()),
    )
    for stat_name, stat_value in distance_stats:
        print(f"{stat_name}: {stat_value:.2f} KM")

In [None]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points given in degrees.

    Works element-wise on scalars, NumPy arrays or pandas Series.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point, in degrees.
    lat2, lon2 : latitude / longitude of the second point, in degrees.

    Returns
    -------
    Distance(s) in kilometres, using an Earth radius of 6371 km.
    """
    R = 6371  # Earth radius in km
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    delta_lat = np.radians(lat2 - lat1)
    delta_lon = np.radians(lon2 - lon1)

    a = np.sin(delta_lat/2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(delta_lon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt(1 - a) return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))

    return R * c


if all(col in train.columns for col in ['Pickup_Lat', 'Pickup_Long', 'Dropoff_Lat', 'Dropoff_Long']):
    # Straight-line distance between pickup and dropoff, stored on `train`.
    train['Haversine_Distance'] = haversine_distance(
        train['Pickup_Lat'], train['Pickup_Long'],
        train['Dropoff_Lat'], train['Dropoff_Long']
    )

    if 'Distance_KM' in train.columns:
        # Reported distance relative to the straight-line distance; the small
        # epsilon avoids division by zero when pickup and dropoff coincide.
        train['Distance_Ratio'] = train['Distance_KM'] / (train['Haversine_Distance'] + 1e-6)

        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        # Left panel: reported vs straight-line distance with the y = x line.
        axes[0].scatter(train['Haversine_Distance'], train['Distance_KM'], alpha=0.3)
        axes[0].plot([0, train['Distance_KM'].max()], [0, train['Distance_KM'].max()],
                     'r--', label='Perfect Match')
        axes[0].set_xlabel('Haversine Distance (KM)')
        axes[0].set_ylabel('Actual Distance (KM)')
        axes[0].set_title('Haversine vs Actual Distance', fontweight='bold')
        axes[0].legend()

        # Right panel: distribution of the ratio, with ratio = 1 marked.
        train['Distance_Ratio'].hist(bins=50, ax=axes[1], edgecolor='black')
        axes[1].set_title('Distance Ratio Distribution', fontweight='bold')
        axes[1].set_xlabel('Actual Distance / Haversine Distance')
        axes[1].set_ylabel('Frequency')
        axes[1].axvline(x=1, color='r', linestyle='--', label='Ratio = 1')
        axes[1].legend()

        plt.tight_layout()
        plt.show()

        print(f"\nDistance Ratio Statistics:")
        print(f"Mean: {train['Distance_Ratio'].mean():.2f}")
        print(f"Median: {train['Distance_Ratio'].median():.2f}")

### 4.3 Zone Analysis

In [None]:
if {'Pickup_Zone', 'Dropoff_Zone'}.issubset(train.columns):
    print(f"\nNumber of unique Pickup Zones: {train['Pickup_Zone'].nunique()}")
    print(f"Number of unique Dropoff Zones: {train['Dropoff_Zone'].nunique()}")

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # The 15 most frequent zones on each side of the trip.
    top_pickup = train['Pickup_Zone'].value_counts().head(15)
    top_dropoff = train['Dropoff_Zone'].value_counts().head(15)

    zone_panels = (
        (axes[0], top_pickup, 'Top 15 Pickup Zones', 'lightgreen'),
        (axes[1], top_dropoff, 'Top 15 Dropoff Zones', 'lightcoral'),
    )
    for zone_ax, zone_counts, zone_title, zone_color in zone_panels:
        zone_counts.plot(kind='barh', ax=zone_ax, color=zone_color)
        zone_ax.set_title(zone_title, fontweight='bold')
        zone_ax.set_xlabel('Count')
        # Put the most frequent zone at the top of the chart.
        zone_ax.invert_yaxis()

    plt.tight_layout()
    plt.show()

## 5. Telematics (Sensor) Data Analysis

In [None]:
sensor_cols = ['Accel_X', 'Accel_Y', 'Accel_Z', 'Gyro_Z', 'GPS_Accuracy_M']

# Add the overall acceleration magnitude only when all three axes are present.
if {'Accel_X', 'Accel_Y', 'Accel_Z'}.issubset(train.columns):
    accel_sq_sum = train['Accel_X']**2 + train['Accel_Y']**2 + train['Accel_Z']**2
    train['Accel_Magnitude'] = np.sqrt(accel_sq_sum)
    sensor_cols.append('Accel_Magnitude')

available_sensors = [name for name in sensor_cols if name in train.columns]

if available_sensors:
    print("Sensor Data Summary:")
    print(train[available_sensors].describe())

    # Grid of histograms, three per row; rows = ceil(n_sensors / 3).
    n_cols = 3
    n_rows = -(-len(available_sensors) // n_cols)

    # squeeze=False always yields a 2-D array, so one flatten covers every case.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4), squeeze=False)
    axes = axes.flatten()

    for sensor_ax, col in zip(axes, available_sensors):
        train[col].hist(bins=50, ax=sensor_ax, edgecolor='black')
        sensor_ax.set_title(f'{col} Distribution', fontweight='bold')
        sensor_ax.set_xlabel(col)
        sensor_ax.set_ylabel('Frequency')

    # Hide grid cells that have no sensor assigned.
    for sensor_ax in axes[len(available_sensors):]:
        sensor_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

### 5.1 Sensor Data by Trip Label

In [None]:
if available_sensors and target_col in train.columns:
    # Grid of per-class boxplots, two per row; rows = ceil(n_sensors / 2).
    n_cols = 2
    n_rows = -(-len(available_sensors) // n_cols)

    # squeeze=False always yields a 2-D array, so one flatten covers every case.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 5), squeeze=False)
    axes = axes.flatten()

    for box_ax, col in zip(axes, available_sensors):
        train.boxplot(column=col, by=target_col, ax=box_ax)
        box_ax.set_title(f'{col} by Trip Label', fontweight='bold')
        box_ax.set_xlabel('Trip Label')
        box_ax.set_ylabel(col)
        box_ax.tick_params(axis='x', rotation=45)
        # pandas adds a figure-level title for grouped boxplots; drop it.
        plt.suptitle('')

    # Hide grid cells that have no sensor assigned.
    for box_ax in axes[len(available_sensors):]:
        box_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 6. Transaction and Economics Analysis

In [None]:
transaction_cols = ['Est_Price_IDR', 'Surge_Multiplier', 'Promo_Code', 'Payment_Method']

available_transaction = [col for col in transaction_cols if col in train.columns]

if available_transaction:
    print("Transaction Features Summary:")
    for col in available_transaction:
        # is_numeric_dtype recognises every numeric width (int32, float32,
        # nullable Int64, ...), not just float64/int64; booleans stay on the
        # categorical branch so flag columns are summarised by value counts.
        if pd.api.types.is_numeric_dtype(train[col]) and not pd.api.types.is_bool_dtype(train[col]):
            print(f"\n{col}:")
            print(f"  Mean: {train[col].mean():.2f}")
            print(f"  Median: {train[col].median():.2f}")
            print(f"  Min: {train[col].min():.2f}")
            print(f"  Max: {train[col].max():.2f}")
        else:
            # Non-numeric column: cardinality plus the five most common values.
            print(f"\n{col}: {train[col].nunique()} unique values")
            print(train[col].value_counts().head())

In [None]:
if {'Est_Price_IDR', 'Distance_KM'}.issubset(train.columns):
    # Unit price; the small epsilon avoids division by zero for 0 km trips.
    train['Price_per_KM'] = train['Est_Price_IDR'] / (train['Distance_KM'] + 1e-6)

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    (price_ax, scatter_ax), (unit_ax, box_ax) = axes

    # Histograms of the raw price (top-left) and the unit price (bottom-left).
    hist_panels = (
        (price_ax, 'Est_Price_IDR', 'Price Distribution', 'Price (IDR)'),
        (unit_ax, 'Price_per_KM', 'Price per KM Distribution', 'Price per KM (IDR)'),
    )
    for hist_ax, hist_col, hist_title, hist_xlabel in hist_panels:
        train[hist_col].hist(bins=50, ax=hist_ax, edgecolor='black')
        hist_ax.set_title(hist_title, fontweight='bold')
        hist_ax.set_xlabel(hist_xlabel)
        hist_ax.set_ylabel('Frequency')

    # Top-right: price against distance.
    scatter_ax.scatter(train['Distance_KM'], train['Est_Price_IDR'], alpha=0.3)
    scatter_ax.set_title('Price vs Distance', fontweight='bold')
    scatter_ax.set_xlabel('Distance (KM)')
    scatter_ax.set_ylabel('Price (IDR)')

    # Bottom-right: unit price per class, only when the target is available.
    if target_col in train.columns:
        train.boxplot(column='Price_per_KM', by=target_col, ax=box_ax)
        box_ax.set_title('Price per KM by Trip Label', fontweight='bold')
        box_ax.set_xlabel('Trip Label')
        box_ax.set_ylabel('Price per KM (IDR)')
        box_ax.tick_params(axis='x', rotation=45)
        # pandas adds a figure-level title for grouped boxplots; drop it.
        plt.suptitle('')

    plt.tight_layout()
    plt.show()

In [None]:
if 'Surge_Multiplier' in train.columns:
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    count_ax, share_ax = axes

    # Left panel: trips per surge level, ordered by multiplier value.
    surge_counts = train['Surge_Multiplier'].value_counts().sort_index()
    surge_counts.plot(kind='bar', ax=count_ax, color='orange')
    count_ax.set_title('Surge Multiplier Distribution', fontweight='bold')
    count_ax.set_xlabel('Surge Multiplier')
    count_ax.set_ylabel('Count')

    # Right panel: class mix (row-normalised to 100%) per surge level.
    if target_col in train.columns:
        surge_label = pd.crosstab(train['Surge_Multiplier'], train[target_col], normalize='index') * 100
        surge_label.plot(kind='bar', stacked=True, ax=share_ax)
        share_ax.set_title('Trip Label Distribution by Surge Multiplier (%)', fontweight='bold')
        share_ax.set_xlabel('Surge Multiplier')
        share_ax.set_ylabel('Percentage')
        share_ax.legend(title='Trip Label', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()

In [None]:
categorical_transaction = ['Promo_Code', 'Payment_Method']

for col in categorical_transaction:
    if col not in train.columns:
        continue

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    count_ax, share_ax = axes

    # Left panel: frequency of every category.
    train[col].value_counts().plot(kind='bar', ax=count_ax, color='teal')
    count_ax.set_title(f'{col} Distribution', fontweight='bold')
    count_ax.set_xlabel(col)
    count_ax.set_ylabel('Count')
    count_ax.tick_params(axis='x', rotation=45)

    # Right panel: class mix (row-normalised to 100%) per category.
    if target_col in train.columns:
        cat_label = pd.crosstab(train[col], train[target_col], normalize='index') * 100
        cat_label.plot(kind='bar', stacked=True, ax=share_ax)
        share_ax.set_title(f'Trip Label Distribution by {col} (%)', fontweight='bold')
        share_ax.set_xlabel(col)
        share_ax.set_ylabel('Percentage')
        share_ax.tick_params(axis='x', rotation=45)
        share_ax.legend(title='Trip Label', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()

## 7. Device and Environment Features

In [None]:
device_env_cols = ['Device_FP', 'Car_Model', 'Weather', 'Traffic', 'Battery_Level', 'Signal_Strength']

available_device_env = [col for col in device_env_cols if col in train.columns]

if available_device_env:
    print("Device and Environment Features Summary:")
    for col in available_device_env:
        # is_numeric_dtype recognises every numeric width (int32, float32,
        # nullable Int64, ...), not just float64/int64; booleans stay on the
        # categorical branch so flag columns are summarised by value counts.
        if pd.api.types.is_numeric_dtype(train[col]) and not pd.api.types.is_bool_dtype(train[col]):
            print(f"\n{col}:")
            print(f"  Mean: {train[col].mean():.2f}")
            print(f"  Median: {train[col].median():.2f}")
            print(f"  Min: {train[col].min():.2f}")
            print(f"  Max: {train[col].max():.2f}")
        else:
            print(f"\n{col}: {train[col].nunique()} unique values")
            # Only list the individual values when there are few enough to read.
            if train[col].nunique() < 20:
                print(train[col].value_counts())

In [None]:
categorical_device = ['Weather', 'Traffic', 'Signal_Strength', 'Car_Model']

for col in categorical_device:
    # Skip absent columns and those with 50 or more distinct values.
    if col not in train.columns or train[col].nunique() >= 50:
        continue

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    count_ax, share_ax = axes

    # Left panel: the 15 most frequent categories, most frequent on top.
    top_values = train[col].value_counts().head(15)
    top_values.plot(kind='barh', ax=count_ax, color='purple')
    count_ax.set_title(f'{col} Distribution (Top 15)', fontweight='bold')
    count_ax.set_xlabel('Count')
    count_ax.invert_yaxis()

    # Right panel: class mix (row-normalised to 100%) within the 10 most
    # frequent categories.
    if target_col in train.columns:
        top_cats = train[col].value_counts().head(10).index
        filtered_data = train[train[col].isin(top_cats)]
        cat_label = pd.crosstab(filtered_data[col], filtered_data[target_col], normalize='index') * 100
        cat_label.plot(kind='barh', stacked=True, ax=share_ax)
        share_ax.set_title(f'Trip Label Distribution by {col} (%) - Top 10', fontweight='bold')
        share_ax.set_xlabel('Percentage')
        share_ax.invert_yaxis()
        share_ax.legend(title='Trip Label', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()

In [None]:
if 'Battery_Level' in train.columns:
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    hist_ax, box_ax = axes

    # Left panel: overall distribution of the battery level.
    train['Battery_Level'].hist(bins=30, ax=hist_ax, edgecolor='black', color='green')
    hist_ax.set_title('Battery Level Distribution', fontweight='bold')
    hist_ax.set_xlabel('Battery Level (%)')
    hist_ax.set_ylabel('Frequency')

    # Right panel: battery level per class, only when the target is available.
    if target_col in train.columns:
        train.boxplot(column='Battery_Level', by=target_col, ax=box_ax)
        box_ax.set_title('Battery Level by Trip Label', fontweight='bold')
        box_ax.set_xlabel('Trip Label')
        box_ax.set_ylabel('Battery Level (%)')
        box_ax.tick_params(axis='x', rotation=45)
        # pandas adds a figure-level title for grouped boxplots; drop it.
        plt.suptitle('')

    plt.tight_layout()
    plt.show()

## 8. Correlation Analysis

In [None]:
# 'number' selects every numeric width. Listing only float64/int64 silently
# drops columns such as Hour / DayOfWeek / Month, which the `.dt` accessor
# returns as int32 on recent pandas versions.
numeric_cols = train.select_dtypes(include='number').columns.tolist()

# Identifiers and the target itself are not features.
if 'Trip_ID' in numeric_cols:
    numeric_cols.remove('Trip_ID')
if target_col in numeric_cols:
    numeric_cols.remove(target_col)

if numeric_cols:
    correlation_matrix = train[numeric_cols].corr()

    plt.figure(figsize=(14, 10))
    sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', 
                center=0, square=True, linewidths=1)
    plt.title('Correlation Matrix of Numeric Features', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

    print("\nHighly Correlated Feature Pairs (|correlation| > 0.7):")
    # Walk the upper triangle only, so each unordered pair is reported once
    # and the diagonal (self-correlation) is skipped.
    high_corr = []
    for i in range(len(correlation_matrix.columns)):
        for j in range(i+1, len(correlation_matrix.columns)):
            if abs(correlation_matrix.iloc[i, j]) > 0.7:
                high_corr.append([
                    correlation_matrix.columns[i],
                    correlation_matrix.columns[j],
                    correlation_matrix.iloc[i, j]
                ])

    if high_corr:
        high_corr_df = pd.DataFrame(high_corr, columns=['Feature 1', 'Feature 2', 'Correlation'])
        print(high_corr_df.to_string(index=False))
    else:
        print("No highly correlated feature pairs found.")

## 9. Feature Importance (Initial Assessment)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Quick feature ranking: fit a random forest on the training data and plot
# its feature importances.
if target_col in train.columns:
    # Work on a copy so the encoding below does not alter `train`.
    analysis_df = train.copy()
    
    # Fitted encoder per encoded column, keyed by column name.
    label_encoders = {}
    categorical_cols = analysis_df.select_dtypes(include=['object']).columns.tolist()
    
    # The identifier and the target are not encoded as features.
    if 'Trip_ID' in categorical_cols:
        categorical_cols.remove('Trip_ID')
    if target_col in categorical_cols:
        categorical_cols.remove(target_col)
    
    # Integer-encode every remaining object column. Values are cast to str
    # first, so missing values become their own string category.
    for col in categorical_cols:
        le = LabelEncoder()
        analysis_df[col] = le.fit_transform(analysis_df[col].astype(str))
        label_encoders[col] = le
    
    # Everything except the identifier, the target, the raw/parsed timestamp
    # and the weekday name is used as a feature. Columns that earlier cells
    # added to `train` are part of the copy and therefore included.
    feature_cols = [col for col in analysis_df.columns if col not in ['Trip_ID', target_col, 
                                                                       'Timestamp', 'Timestamp_parsed',
                                                                       'DayName']]
    
    # Remaining missing values are replaced with the sentinel -999.
    X = analysis_df[feature_cols].fillna(-999)
    y = analysis_df[target_col]
    
    le_target = LabelEncoder()
    y_encoded = le_target.fit_transform(y)
    
    # NOTE(review): the model is fit on all training rows with no hold-out, so
    # the importances below are a rough ranking, not a validated result.
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, max_depth=10)
    rf_model.fit(X, y_encoded)
    
    # One row per feature, most important first.
    feature_importance = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': rf_model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    print("\nTop 20 Most Important Features (Random Forest):")
    print(feature_importance.head(20).to_string(index=False))
    
    plt.figure(figsize=(12, 8))
    top_features = feature_importance.head(20)
    plt.barh(range(len(top_features)), top_features['Importance'])
    plt.yticks(range(len(top_features)), top_features['Feature'])
    plt.xlabel('Feature Importance')
    plt.title('Top 20 Feature Importances (Random Forest)', fontweight='bold')
    # Put the most important feature at the top of the chart.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

## 10. Outlier Detection

In [None]:
def detect_outliers_iqr(df, column):
    """Count the values of ``df[column]`` outside the 1.5 * IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    column : str
        Name of a numeric column in ``df``.

    Returns
    -------
    tuple of (int, float)
        Number of outlying rows and their share of *all* rows in ``df``
        (missing values count towards the denominator), in percent.
        ``(0, 0.0)`` for an empty frame.
    """
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to inspect; avoids dividing by zero below.
        return 0, 0.0

    values = df[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Count on the boolean mask instead of materialising the filtered frame.
    # Missing values compare as False and are therefore never outliers.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    n_outliers = int(is_outlier.sum())
    return n_outliers, (n_outliers / n_rows) * 100

# 'number' selects every numeric width. Listing only float64/int64 silently
# drops columns such as Hour / DayOfWeek / Month, which the `.dt` accessor
# returns as int32 on recent pandas versions.
numeric_features = train.select_dtypes(include='number').columns.tolist()

# Identifiers and the target itself are not features (same exclusions as the
# correlation analysis).
if 'Trip_ID' in numeric_features:
    numeric_features.remove('Trip_ID')
if target_col in numeric_features:
    numeric_features.remove(target_col)

outlier_summary = []
for col in numeric_features:
    count, pct = detect_outliers_iqr(train, col)
    outlier_summary.append({
        'Feature': col,
        'Outlier_Count': count,
        'Outlier_Percentage': round(pct, 2)
    })

# Naming the columns explicitly keeps sort_values from raising KeyError when
# there are no numeric features and the record list is empty.
outlier_df = pd.DataFrame(
    outlier_summary,
    columns=['Feature', 'Outlier_Count', 'Outlier_Percentage']
).sort_values('Outlier_Percentage', ascending=False)

print("\nOutlier Analysis (IQR Method):")
print(outlier_df.head(15).to_string(index=False))

plt.figure(figsize=(12, 6))
top_outliers = outlier_df.head(15)
plt.barh(range(len(top_outliers)), top_outliers['Outlier_Percentage'])
plt.yticks(range(len(top_outliers)), top_outliers['Feature'])
plt.xlabel('Outlier Percentage (%)')
plt.title('Top 15 Features with Outliers', fontweight='bold')
# Put the feature with the largest outlier share at the top of the chart.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## 11. Data Quality Summary

In [None]:
print("="*80)
print("DATA QUALITY SUMMARY")
print("="*80)

print(f"\n1. Dataset Size:")
print(f"   Training samples: {len(train):,}")
print(f"   Test samples: {len(test):,}")
# Column count of `train` as it is now, i.e. including the target and any
# columns added to it by earlier cells.
print(f"   Total features: {train.shape[1]}")

if target_col in train.columns:
    print(f"\n2. Target Distribution:")
    for label, count in train[target_col].value_counts().items():
        pct = (count / len(train)) * 100
        print(f"   {label}: {count:,} ({pct:.2f}%)")

print(f"\n3. Missing Values:")
if len(missing_train) > 0:
    print(f"   Features with missing values: {len(missing_train)}")
    print(f"   Total missing cells: {missing_train['Missing_Count'].sum():,}")
else:
    print("   No missing values in training set")

print(f"\n4. Data Types:")
# 'number' counts every numeric width; listing only float64/int64 would leave
# out int32/float32 columns (e.g. the calendar features derived via `.dt`).
print(f"   Numeric features: {len(train.select_dtypes(include='number').columns)}")
print(f"   Categorical features: {len(train.select_dtypes(include=['object']).columns)}")

print(f"\n5. Cardinality:")
high_card = [col for col in train.select_dtypes(include=['object']).columns 
             if train[col].nunique() > 50]
if high_card:
    print(f"   High cardinality features (>50 unique): {', '.join(high_card)}")
else:
    print("   No high cardinality features detected")

print("\n" + "="*80)

## 12. Key Findings and Recommendations

**Key Insights from EDA:**

1. **Class Imbalance**: Review target distribution to determine if class weighting or sampling techniques are needed
2. **Feature Engineering Opportunities**:
   - Ratio of actual trip distance to Haversine (straight-line) distance (Navigation issues detection)
   - Price per kilometer (Fraud detection)
   - Accelerometer magnitude and extremes (Safety violations)
   - Temporal features (hour, day of week, rush hour indicators)
   - GPS accuracy thresholds (Navigation issues)

3. **Data Quality Considerations**:
   - Check for missing values and decide on imputation strategy
   - Identify and handle outliers appropriately
   - High cardinality features may need encoding strategies

4. **Modeling Strategy**:
   - Use Stratified K-Fold for cross-validation
   - Consider class weights in tree-based models
   - Gradient Boosted Trees (CatBoost, LightGBM, XGBoost) recommended
   - Threshold optimization for Macro F1-Score

5. **Feature Selection**:
   - Focus on top features identified by Random Forest
   - Consider feature interactions (e.g., surge multiplier with time features)
   - Sensor extremes (max, min, variance) for safety violations