In [1]:
# Install required packages if not already installed
%pip install opendatasets pandas numpy legacy-cgi

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: C:\Users\cbotc\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
#TroTro Multi-City Dataset Cleaning 
import opendatasets as od
import pandas as pd
import numpy as np
import os
import warnings
from datetime import datetime
import re
import shutil
from pathlib import Path

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

print("=== TroTro Multi-City Dataset Cleaning and Analysis ===")
print(f"Starting analysis at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# 1. Download dataset with error handling
def download_dataset():
    """Download the TroTro dataset from Kaggle"""
    try:
        dataset_url = 'https://www.kaggle.com/datasets/godfredaddaiamoako/trotro'
        print("Downloading dataset...")
        od.download(dataset_url)
        print("✓ Dataset downloaded successfully")
    except Exception as e:
        print(f"⚠ Error downloading dataset: {e}")
        print("Please ensure you have Kaggle credentials configured")
        return False
    return True

# 2. Enhanced data cleaning function
def clean_data(df, filename="", city=""):
    """Comprehensive data cleaning function"""
    print(f"\nCleaning data for {city}/{filename}...")
    original_shape = df.shape
    
    if df.empty:
        print(f"  ⚠ Warning: {filename} is empty, skipping...")
        return None
    
    # Create a copy to avoid modifying original
    df_clean = df.copy()
    
    # 1. Remove completely empty rows and columns
    df_clean = df_clean.dropna(how='all').dropna(axis=1, how='all')
    
    # 2. Clean column names
    df_clean.columns = df_clean.columns.str.strip().str.lower()
    df_clean.columns = df_clean.columns.str.replace(' ', '_').str.replace(r'[^\w]', '_', regex=True)
    
    # 3. Handle duplicates
    initial_rows = len(df_clean)
    df_clean = df_clean.drop_duplicates()
    duplicates_removed = initial_rows - len(df_clean)
    if duplicates_removed > 0:
        print(f"  - Removed {duplicates_removed} duplicate rows")
    
    # 4. Clean text columns
    text_columns = df_clean.select_dtypes(include=['object']).columns
    for col in text_columns:
        if col in df_clean.columns:
            # Strip whitespace and handle common issues
            df_clean[col] = df_clean[col].astype(str).str.strip()
            df_clean[col] = df_clean[col].replace(['nan', 'NaN', 'None', ''], np.nan)
            
            # Clean special characters and normalize text
            df_clean[col] = df_clean[col].str.replace(r'\s+', ' ', regex=True)
            
    # 5. Handle missing values intelligently
    missing_threshold = 0.7  # Drop columns with >70% missing data
    columns_to_drop = []
    for col in df_clean.columns:
        missing_pct = df_clean[col].isnull().sum() / len(df_clean)
        if missing_pct > missing_threshold:
            columns_to_drop.append(col)
            print(f"  - Dropped column '{col}' (>{missing_threshold*100}% missing)")
    
    if columns_to_drop:
        df_clean = df_clean.drop(columns=columns_to_drop)
    
    # 6. Fill remaining missing values based on data type
    for col in df_clean.columns:
        if df_clean[col].isnull().any():
            if df_clean[col].dtype in ['int64', 'float64']:
                # For numeric columns, use median
                df_clean[col] = df_clean[col].fillna(df_clean[col].median())
            else:
                # For categorical columns, use mode or 'Unknown'
                mode_val = df_clean[col].mode()
                if len(mode_val) > 0:
                    df_clean[col] = df_clean[col].fillna(mode_val[0])
                else:
                    df_clean[col] = df_clean[col].fillna('Unknown')
    
    # 7. Detect and handle potential date columns
    potential_date_cols = [col for col in df_clean.columns if 'date' in col.lower() or 'time' in col.lower()]
    for col in potential_date_cols:
        try:
            df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')
            print(f"  - Converted '{col}' to datetime")
        except:
            pass
    
    # 8. Clean numeric columns
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if len(df_clean[col].dropna()) > 0:  # Only if we have numeric data
            # Remove outliers using IQR method
            Q1 = df_clean[col].quantile(0.25)
            Q3 = df_clean[col].quantile(0.75)
            IQR = Q3 - Q1
            
            if IQR > 0:  # Only apply if there's variation in the data
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                
                outliers_before = len(df_clean[(df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)])
                df_clean[col] = df_clean[col].clip(lower=lower_bound, upper=upper_bound)
                if outliers_before > 0:
                    print(f"  - Capped {outliers_before} outliers in '{col}'")
    
    # 9. Add city identifier
    df_clean['city'] = city
    df_clean['source_file'] = filename
    
    print(f"  ✓ Cleaned data shape: {original_shape} → {df_clean.shape}")
    return df_clean

# 3. Data analysis function
def analyze_data(df, filename="", city=""):
    """Perform comprehensive data analysis"""
    print(f"\n--- Analysis for {city}/{filename} ---")
    if df is None or df.empty:
        print("No data to analyze")
        return df
        
    print(f"Shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    
    # Missing values analysis
    missing_data = df.isnull().sum()
    if missing_data.sum() > 0:
        print("\nMissing values:")
        print(missing_data[missing_data > 0])
    else:
        print("\n✓ No missing values!")
    
    # Data types
    print(f"\nData types:")
    print(df.dtypes.value_counts())
    
    # Numeric columns summary
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        print(f"\nNumeric columns summary:")
        print(df[numeric_cols].describe())
    
    # Categorical columns info
    categorical_cols = df.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        print(f"\nCategorical columns (top 3 each):")
        for col in categorical_cols[:3]:  # Limit to first 3 to avoid clutter
            print(f"\n{col}:")
            print(df[col].value_counts().head(3))
    
    # Display sample
    print(f"\nSample data (first 3 rows):")
    print(df.head(3))
    
    return df

# 4. Load data with multiple separator attempts
def load_data_file(file_path):
    """Try to load a data file with different separators"""
    file_extension = os.path.splitext(file_path)[1].lower()
    
    separators_to_try = [',', '\t', ';', '|', r'\s+']
    
    for sep in separators_to_try:
        try:
            if sep == r'\s+':
                df = pd.read_csv(file_path, sep=sep, engine='python')
            else:
                df = pd.read_csv(file_path, sep=sep)
            
            # Check if data loaded properly (more than just headers)
            if not df.empty and len(df.columns) > 1:
                print(f"  ✓ Loaded with separator '{sep}': {df.shape}")
                return df
        except Exception as e:
            continue
    
    print(f"  ❌ Could not load {file_path} with any separator")
    return None

# 5. Create output directories
def create_output_structure():
    """Create clean directory structure"""
    # Create main output directory
    output_dir = Path('cleaned_trotro_data')
    output_dir.mkdir(exist_ok=True)
    
    # Create subdirectories
    (output_dir / 'by_city').mkdir(exist_ok=True)
    (output_dir / 'combined').mkdir(exist_ok=True)
    (output_dir / 'reports').mkdir(exist_ok=True)
    
    return output_dir

# 6. Main execution function
def main():
    """Main execution function for multi-city cleaning"""
    
    # Download dataset
    if not download_dataset():
        print("Failed to download dataset. Please check your Kaggle credentials.")
        return
    
    # Find data directory structure
    base_data_dir = Path('trotro')
    if not base_data_dir.exists():
        # Try alternative directory names
        possible_dirs = [d for d in Path('.').iterdir() if d.is_dir() and 'trotro' in d.name.lower()]
        if possible_dirs:
            base_data_dir = possible_dirs[0]
        else:
            print("Could not find dataset directory")
            return
    
    # Look for the nested structure
    datasets_dir = base_data_dir / 'trotrolive-datasets'
    if not datasets_dir.exists():
        datasets_dir = base_data_dir
    
    print(f"Using data directory: {datasets_dir}")
    
    # Create output structure
    output_dir = create_output_structure()
    
    # Find all city directories
    city_dirs = [d for d in datasets_dir.iterdir() if d.is_dir()]
    print(f"Found {len(city_dirs)} city directories: {[d.name for d in city_dirs]}")
    
    if not city_dirs:
        print("No city directories found")
        return
    
    # Storage for all cleaned data
    all_city_data = {}
    all_files_data = []
    
    # Process each city
    for city_dir in city_dirs:
        city_name = city_dir.name
        print(f"\n{'='*60}")
        print(f"PROCESSING CITY: {city_name.upper()}")
        print(f"{'='*60}")
        
        # Find data files in city directory
        data_files = []
        for ext in ['*.txt', '*.csv']:
            data_files.extend(city_dir.glob(ext))
        
        if not data_files:
            print(f"No data files found in {city_name}")
            continue
            
        print(f"Found {len(data_files)} data files in {city_name}: {[f.name for f in data_files]}")
        
        # Storage for this city's data
        city_cleaned_data = {}
        
        # Process each file in the city
        for data_file in data_files:
            try:
                print(f"\n{'-'*40}")
                print(f"Processing: {city_name}/{data_file.name}")
                print(f"{'-'*40}")
                
                # Load data
                df = load_data_file(data_file)
                if df is None:
                    continue
                
                # Clean data
                df_clean = clean_data(df, data_file.name, city_name)
                if df_clean is None:
                    continue
                
                # Analyze cleaned data
                df_analyzed = analyze_data(df_clean, data_file.name, city_name)
                
                # Save individual cleaned file
                city_output_dir = output_dir / 'by_city' / city_name
                city_output_dir.mkdir(exist_ok=True)
                
                cleaned_filename = city_output_dir / f'cleaned_{data_file.stem}.csv'
                df_clean.to_csv(cleaned_filename, index=False)
                print(f"✓ Saved to: {cleaned_filename}")
                
                # Store for city-level combination
                city_cleaned_data[data_file.name] = df_clean
                
                # Store for global combination
                all_files_data.append(df_clean)
                
            except Exception as e:
                print(f"❌ Error processing {city_name}/{data_file.name}: {e}")
                continue
        
        # Combine all files for this city
        if city_cleaned_data:
            print(f"\n{'-'*40}")
            print(f"COMBINING DATA FOR {city_name.upper()}")
            print(f"{'-'*40}")
            
            try:
                # Find common columns across all files in this city
                all_columns = [set(df.columns) for df in city_cleaned_data.values()]
                common_columns = set.intersection(*all_columns) if all_columns else set()
                
                if len(common_columns) > 2:  # More than just city and source_file
                    print(f"Found {len(common_columns)} common columns for {city_name}")
                    
                    # Combine city data
                    city_combined = pd.concat([df[list(common_columns)] for df in city_cleaned_data.values()], 
                                            ignore_index=True)
                    
                    # Save city combined data
                    city_combined_file = output_dir / 'by_city' / f'{city_name}_combined.csv'
                    city_combined.to_csv(city_combined_file, index=False)
                    print(f"✓ City combined data saved: {city_combined_file}")
                    
                    # Store for global analysis
                    all_city_data[city_name] = city_combined
                    
                else:
                    print(f"⚠ Too few common columns to combine {city_name} data")
                    
            except Exception as e:
                print(f"❌ Error combining {city_name} data: {e}")
    
    # Create global combined dataset
    if all_files_data:
        print(f"\n{'='*60}")
        print("CREATING GLOBAL COMBINED DATASET")
        print(f"{'='*60}")
        
        try:
            # Find common columns across ALL files from ALL cities
            all_columns = [set(df.columns) for df in all_files_data]
            global_common_columns = set.intersection(*all_columns) if all_columns else set()
            
            if len(global_common_columns) > 2:
                print(f"Found {len(global_common_columns)} common columns across all cities")
                
                # Create global combined dataset
                global_combined = pd.concat([df[list(global_common_columns)] for df in all_files_data], 
                                          ignore_index=True)
                
                # Save global combined data
                global_combined_file = output_dir / 'combined' / 'all_cities_combined.csv'
                global_combined.to_csv(global_combined_file, index=False)
                print(f"✓ Global combined data saved: {global_combined_file}")
                
                # Generate summary report
                summary_report = f"""
TroTro Dataset Cleaning Summary Report
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

Cities Processed: {len(all_city_data)}
Total Files Processed: {len(all_files_data)}
Global Combined Dataset Shape: {global_combined.shape}

City Breakdown:
"""
                for city, data in all_city_data.items():
                    summary_report += f"- {city}: {data.shape[0]} records, {data.shape[1]} columns\n"
                
                summary_report += f"\nCommon Columns Across All Data:\n"
                for col in sorted(global_common_columns):
                    summary_report += f"- {col}\n"
                
                # Save summary report
                report_file = output_dir / 'reports' / 'cleaning_summary.txt'
                with open(report_file, 'w') as f:
                    f.write(summary_report)
                
                print(f"✓ Summary report saved: {report_file}")
                
            else:
                print("⚠ Too few common columns to create global combined dataset")
                
        except Exception as e:
            print(f"❌ Error creating global combined dataset: {e}")
    
    # Final summary
    print(f"\n{'='*60}")
    print("CLEANING COMPLETE!")
    print(f"{'='*60}")
    print(f"Output directory: {output_dir}")
    print(f"- Individual city files: {output_dir / 'by_city'}")
    print(f"- Combined datasets: {output_dir / 'combined'}")
    print(f"- Reports: {output_dir / 'reports'}")
    print(f"Finished at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Execute the main function
if __name__ == "__main__":
    main()

=== TroTro Multi-City Dataset Cleaning and Analysis ===
Starting analysis at: 2025-08-11 08:44:35
Downloading dataset...
Skipping, found downloaded files in ".\trotro" (use force=True to force download)
✓ Dataset downloaded successfully
Using data directory: trotro\trotrolive-datasets
Found 11 city directories: ['abidjan', 'accra', 'addisababa', 'addisababa-minibus', 'alexendria', 'bamako', 'freetown', 'kampala', 'kumasi', 'lagos', 'nairobi']

PROCESSING CITY: ABIDJAN
Found 15 data files in abidjan: ['agency.txt', 'calendar.txt', 'calendar_dates.txt', 'fare_attributes.txt', 'fare_rules.txt', 'feed_info.txt', 'frequencies.txt', 'levels.txt', 'pathways.txt', 'routes.txt', 'shapes.txt', 'stops.txt', 'stop_times.txt', 'transfers.txt', 'trips.txt']

----------------------------------------
Processing: abidjan/agency.txt
----------------------------------------
  ✓ Loaded with separator ',': (25, 8)

Cleaning data for abidjan/agency.txt...
  - Converted 'agency_timezone' to datetime
  ✓ Clea

In [None]:
# Streamlined TroTro Unified ML Pipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
import logging
from datetime import datetime
import joblib

# ML Libraries - Only the essentials
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (mean_squared_error, r2_score, accuracy_score, 
                           classification_report, confusion_matrix)
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Try XGBoost (optional)
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("XGBoost not available - using sklearn alternatives")

warnings.filterwarnings('ignore')

class StreamlinedTroTroML:
    """Streamlined TroTro ML Pipeline - Unified Model Approach"""
    
    def __init__(self, data_dir="cleaned_trotro_data"):
        self.data_dir = Path(data_dir)
        self.output_dir = Path('streamlined_trotro_results')
        self.output_dir.mkdir(exist_ok=True)
        
        # Setup logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('trotro_ml.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)
        
        # Initialize storage
        self.combined_data = None
        self.models = {}
        self.best_models = {}
        
        print("🚀 Streamlined TroTro ML Pipeline Initialized")
        print(f"📁 Data directory: {self.data_dir}")
        print(f"📊 Output directory: {self.output_dir}")
    
    def load_and_combine_data(self):
        """Load all city data and combine into unified dataset"""
        self.logger.info("Loading and combining all city data...")
        
        all_dataframes = []
        city_data_dir = self.data_dir / 'by_city'
        
        if not city_data_dir.exists():
            self.logger.error(f"City data directory not found: {city_data_dir}")
            return None
        
        # Load data from all cities
        for city_path in city_data_dir.iterdir():
            if city_path.is_dir():
                city_name = city_path.name
                self.logger.info(f"Loading data for {city_name}")
                
                # Find CSV files in city directory
                csv_files = list(city_path.glob('*.csv'))
                
                for csv_file in csv_files:
                    try:
                        # Clean encoding handling
                        df = pd.read_csv(csv_file, encoding='utf-8', low_memory=False)
                        if df.empty:
                            continue
                            
                        # Add city identifier
                        df['city'] = city_name
                        df['source_file'] = csv_file.stem
                        
                        # Clean column names
                        df.columns = df.columns.str.strip().str.lower()
                        df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)
                        df.columns = df.columns.str.replace(r'_+', '_', regex=True)
                        
                        all_dataframes.append(df)
                        self.logger.info(f"  ✅ Loaded {csv_file.name}: {df.shape}")
                        
                    except Exception as e:
                        self.logger.warning(f"  ❌ Failed to load {csv_file}: {e}")
        
        # Also check for combined files directly in by_city
        for csv_file in city_data_dir.glob('*_combined.csv'):
            try:
                df = pd.read_csv(csv_file, encoding='utf-8', low_memory=False)
                city_name = csv_file.stem.replace('_combined', '')
                df['city'] = city_name
                df['source_file'] = csv_file.stem
                
                df.columns = df.columns.str.strip().str.lower()
                df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)
                
                all_dataframes.append(df)
                self.logger.info(f"✅ Loaded combined file for {city_name}: {df.shape}")
                
            except Exception as e:
                self.logger.warning(f"❌ Failed to load {csv_file}: {e}")
        
        if not all_dataframes:
            self.logger.error("No data files loaded successfully!")
            return None
        
        # Combine all dataframes
        self.logger.info("Combining all datasets...")
        
        # Find common columns across all dataframes
        common_columns = set(all_dataframes[0].columns)
        for df in all_dataframes[1:]:
            common_columns = common_columns.intersection(set(df.columns))
        
        if len(common_columns) < 3:
            self.logger.warning("Few common columns found. Using all available data.")
            self.combined_data = pd.concat(all_dataframes, ignore_index=True, sort=False)
        else:
            # Use only common columns for consistency
            combined_dfs = [df[list(common_columns)] for df in all_dataframes]
            self.combined_data = pd.concat(combined_dfs, ignore_index=True)
        
        # Basic cleaning
        self.combined_data = self.combined_data.dropna(how='all')  # Remove empty rows
        self.combined_data = self.combined_data.loc[:, self.combined_data.notna().any()]  # Remove empty columns
        
        self.logger.info(f"✅ Combined dataset created: {self.combined_data.shape}")
        self.logger.info(f"📊 Cities included: {self.combined_data['city'].nunique()}")
        
        return self.combined_data
    
    def visualize_dataset(self):
        """Create comprehensive visualizations of the combined dataset"""
        if self.combined_data is None:
            self.logger.error("No data loaded for visualization")
            return
        
        self.logger.info("Creating dataset visualizations...")
        
        # Set up plotting style
        plt.style.use('default')
        sns.set_palette("husl")
        
        # Create visualization directory
        viz_dir = self.output_dir / 'visualizations'
        viz_dir.mkdir(exist_ok=True)
        
        # 1. Dataset Overview
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle('TroTro Dataset Overview', fontsize=16, fontweight='bold')
        
        # Dataset size by city
        city_counts = self.combined_data['city'].value_counts()
        axes[0, 0].bar(city_counts.index, city_counts.values, color='skyblue', edgecolor='navy')
        axes[0, 0].set_title('Records per City')
        axes[0, 0].set_xlabel('City')
        axes[0, 0].set_ylabel('Number of Records')
        axes[0, 0].tick_params(axis='x', rotation=45)
        
        # Missing data heatmap
        missing_data = self.combined_data.isnull().sum().sort_values(ascending=False)
        top_missing = missing_data.head(10)
        if len(top_missing) > 0:
            axes[0, 1].bar(range(len(top_missing)), top_missing.values, color='lightcoral')
            axes[0, 1].set_title('Top 10 Columns with Missing Data')
            axes[0, 1].set_xlabel('Columns')
            axes[0, 1].set_ylabel('Missing Values Count')
            axes[0, 1].set_xticks(range(len(top_missing)))
            axes[0, 1].set_xticklabels(top_missing.index, rotation=45, ha='right')
        
        # Data types distribution
        dtype_counts = self.combined_data.dtypes.value_counts()
        axes[1, 0].pie(dtype_counts.values, labels=dtype_counts.index, autopct='%1.1f%%')
        axes[1, 0].set_title('Data Types Distribution')
        
        # Column count by type
        numeric_cols = self.combined_data.select_dtypes(include=[np.number]).columns
        categorical_cols = self.combined_data.select_dtypes(include=['object', 'category']).columns
        
        col_types = {'Numeric': len(numeric_cols), 'Categorical': len(categorical_cols)}
        axes[1, 1].bar(col_types.keys(), col_types.values(), color=['lightgreen', 'orange'])
        axes[1, 1].set_title('Column Types Count')
        axes[1, 1].set_ylabel('Number of Columns')
        
        plt.tight_layout()
        plt.savefig(viz_dir / 'dataset_overview.png', dpi=300, bbox_inches='tight')
        plt.close()
        
        # 2. Numeric Features Analysis
        if len(numeric_cols) > 0:
            # Exclude city-related columns for better analysis
            analysis_numeric = [col for col in numeric_cols if col not in ['city', 'source_file']]
            
            if len(analysis_numeric) > 0:
                n_cols = min(4, len(analysis_numeric))
                n_rows = (len(analysis_numeric) + n_cols - 1) // n_cols
                
                fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows))
                fig.suptitle('Numeric Features Distribution', fontsize=16, fontweight='bold')
                
                if n_rows == 1:
                    axes = axes if n_cols > 1 else [axes]
                else:
                    axes = axes.flatten()
                
                for i, col in enumerate(analysis_numeric[:len(axes)]):
                    try:
                        data_to_plot = self.combined_data[col].dropna()
                        if len(data_to_plot) > 0:
                            axes[i].hist(data_to_plot, bins=30, alpha=0.7, color='skyblue', edgecolor='navy')
                            axes[i].set_title(f'{col}\n(n={len(data_to_plot)})')
                            axes[i].set_xlabel(col)
                            axes[i].set_ylabel('Frequency')
                    except Exception as e:
                        axes[i].text(0.5, 0.5, f'Error plotting {col}', 
                                   ha='center', va='center', transform=axes[i].transAxes)
                
                # Hide empty subplots
                for i in range(len(analysis_numeric), len(axes)):
                    axes[i].set_visible(False)
                
                plt.tight_layout()
                plt.savefig(viz_dir / 'numeric_features.png', dpi=300, bbox_inches='tight')
                plt.close()
        
        # 3. Categorical Features Analysis
        if len(categorical_cols) > 0:
            analysis_categorical = [col for col in categorical_cols 
                                  if col not in ['city', 'source_file'] and 
                                  self.combined_data[col].nunique() <= 20]
            
            if len(analysis_categorical) > 0:
                n_cols = min(3, len(analysis_categorical))
                n_rows = (len(analysis_categorical) + n_cols - 1) // n_cols
                
                fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
                fig.suptitle('Categorical Features Distribution', fontsize=16, fontweight='bold')
                
                if n_rows == 1:
                    axes = axes if n_cols > 1 else [axes]
                else:
                    axes = axes.flatten()
                
                for i, col in enumerate(analysis_categorical[:len(axes)]):
                    try:
                        value_counts = self.combined_data[col].value_counts().head(15)
                        axes[i].bar(range(len(value_counts)), value_counts.values, color='lightgreen')
                        axes[i].set_title(f'{col}\n(Unique: {self.combined_data[col].nunique()})')
                        axes[i].set_xlabel('Categories')
                        axes[i].set_ylabel('Count')
                        axes[i].set_xticks(range(len(value_counts)))
                        axes[i].set_xticklabels(value_counts.index, rotation=45, ha='right')
                    except Exception as e:
                        axes[i].text(0.5, 0.5, f'Error plotting {col}', 
                                   ha='center', va='center', transform=axes[i].transAxes)
                
                # Hide empty subplots
                for i in range(len(analysis_categorical), len(axes)):
                    axes[i].set_visible(False)
                
                plt.tight_layout()
                plt.savefig(viz_dir / 'categorical_features.png', dpi=300, bbox_inches='tight')
                plt.close()
        
        # 4. Correlation Analysis
        numeric_data = self.combined_data.select_dtypes(include=[np.number])
        if len(numeric_data.columns) > 1:
            plt.figure(figsize=(12, 10))
            correlation_matrix = numeric_data.corr()
            
            # Create mask for upper triangle
            mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
            
            sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm', 
                       center=0, square=True, fmt='.2f')
            plt.title('Feature Correlation Matrix', fontsize=16, fontweight='bold')
            plt.tight_layout()
            plt.savefig(viz_dir / 'correlation_matrix.png', dpi=300, bbox_inches='tight')
            plt.close()
        
        self.logger.info(f"✅ Visualizations saved to {viz_dir}")
    
    def identify_ml_targets(self):
        """Identify potential ML targets in the dataset"""
        if self.combined_data is None:
            self.logger.error("No data available for target identification")
            return []
        
        self.logger.info("Identifying potential ML targets...")
        
        targets = []
        
        # Transport-specific keywords for targets
        regression_keywords = ['fare', 'price', 'cost', 'duration', 'time', 'distance', 'speed']
        classification_keywords = ['route', 'destination', 'origin', 'station', 'vehicle_type', 'status']
        
        # Check each column
        for col in self.combined_data.columns:
            if col in ['city', 'source_file']:
                continue
            
            col_lower = col.lower()
            non_null_count = self.combined_data[col].notna().sum()
            total_count = len(self.combined_data)
            
            # Skip columns with too much missing data
            if non_null_count < total_count * 0.5:
                continue
            
            # Check for regression targets
            for keyword in regression_keywords:
                if keyword in col_lower:
                    if (col in self.combined_data.select_dtypes(include=[np.number]).columns and 
                        self.combined_data[col].nunique() > 10):
                        targets.append(('regression', col))
                        self.logger.info(f"  📈 Regression target found: {col}")
                        break
            
            # Check for classification targets
            for keyword in classification_keywords:
                if keyword in col_lower:
                    unique_count = self.combined_data[col].nunique()
                    if 2 <= unique_count <= 50:
                        targets.append(('classification', col))
                        self.logger.info(f"  📊 Classification target found: {col}")
                        break
        
        # If no specific targets found, use general heuristics
        if not targets:
            self.logger.info("No transport-specific targets found. Using general heuristics...")
            
            # General regression targets (numeric with enough variety)
            for col in self.combined_data.select_dtypes(include=[np.number]).columns:
                if col not in ['city', 'source_file'] and self.combined_data[col].nunique() > 10:
                    targets.append(('regression', col))
                    self.logger.info(f"  📈 General regression target: {col}")
            
            # General classification targets
            for col in self.combined_data.columns:
                if col not in ['city', 'source_file']:
                    unique_count = self.combined_data[col].nunique()
                    if 2 <= unique_count <= 20:
                        targets.append(('classification', col))
                        self.logger.info(f"  📊 General classification target: {col}")
        
        # Limit to top 5 most promising targets
        targets = targets[:5]
        self.logger.info(f"✅ Identified {len(targets)} ML targets")
        
        return targets
    
    def prepare_data(self, target_col, task_type):
        """Prepare data for ML with efficient preprocessing"""
        self.logger.info(f"Preparing data for {target_col} ({task_type})")
        
        # Create feature matrix
        X = self.combined_data.drop(columns=[target_col, 'source_file'], errors='ignore')
        y = self.combined_data[target_col].copy()
        
        # Remove samples with missing target
        mask = y.notna()
        X = X[mask]
        y = y[mask]
        
        if len(X) == 0:
            self.logger.error("No valid samples after removing missing targets")
            return None, None, None, None, None
        
        # Prepare target for classification
        if task_type == 'classification' and y.dtype == 'object':
            le = LabelEncoder()
            y = le.fit_transform(y.astype(str))
            self.target_encoder = le
        
        # Identify feature types
        numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
        categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
        
        # Create preprocessing pipeline
        preprocessors = []
        
        if numeric_features:
            numeric_pipeline = Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
            ])
            preprocessors.append(('numeric', numeric_pipeline, numeric_features))
        
        if categorical_features:
            categorical_pipeline = Pipeline([
                ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
                ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
            ])
            preprocessors.append(('categorical', categorical_pipeline, categorical_features))
        
        if preprocessors:
            self.preprocessor = ColumnTransformer(preprocessors, remainder='drop')
            X_processed = self.preprocessor.fit_transform(X)
        else:
            self.logger.warning("No valid features for preprocessing")
            return None, None, None, None, None
        
        # Split data
        test_size = min(0.3, max(0.1, 100 / len(X_processed)))
        stratify = y if task_type == 'classification' else None
        
        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X_processed, y, test_size=test_size, random_state=42, stratify=stratify
            )
        except ValueError:
            # Fallback without stratification
            X_train, X_test, y_train, y_test = train_test_split(
                X_processed, y, test_size=test_size, random_state=42
            )
        
        self.logger.info(f"✅ Data prepared: Train={X_train.shape}, Test={X_test.shape}")
        
        return X_train, X_test, y_train, y_test, (numeric_features + categorical_features)
    
    def train_models(self, X_train, X_test, y_train, y_test, task_type, target_col):
        """Train best-performing models efficiently"""
        self.logger.info(f"Training models for {target_col} ({task_type})")
        
        results = {}
        
        if task_type == 'regression':
            models = {
                'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
                'Linear Regression': LinearRegression(),
                'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
            }
            
            if XGBOOST_AVAILABLE:
                models['XGBoost'] = xgb.XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1)
            
            # Hyperparameter grids for key models
            param_grids = {
                'Random Forest': {
                    'n_estimators': [50, 100, 200],
                    'max_depth': [10, 20, None],
                    'min_samples_split': [2, 5, 10]
                },
                'Gradient Boosting': {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.05, 0.1, 0.2],
                    'max_depth': [3, 5, 7]
                }
            }
            
        else:  # classification
            models = {
                'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
                'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
                'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
            }
            
            if XGBOOST_AVAILABLE:
                models['XGBoost'] = xgb.XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1)
            
            param_grids = {
                'Random Forest': {
                    'n_estimators': [50, 100, 200],
                    'max_depth': [10, 20, None],
                    'min_samples_split': [2, 5, 10]
                },
                'Logistic Regression': {
                    'C': [0.1, 1.0, 10.0],
                    'penalty': ['l1', 'l2'],
                    'solver': ['liblinear', 'saga']
                }
            }
        
        # Train and tune models
        for name, model in models.items():
            try:
                self.logger.info(f"  Training {name}...")
                
                # Hyperparameter tuning for select models
                if name in param_grids and len(X_train) > 100:
                    grid_search = GridSearchCV(
                        model, param_grids[name], cv=3, 
                        scoring='r2' if task_type == 'regression' else 'accuracy',
                        n_jobs=-1
                    )
                    grid_search.fit(X_train, y_train)
                    best_model = grid_search.best_estimator_
                    self.logger.info(f"    Best params: {grid_search.best_params_}")
                else:
                    best_model = model
                    best_model.fit(X_train, y_train)
                
                # Make predictions
                train_pred = best_model.predict(X_train)
                test_pred = best_model.predict(X_test)
                
                # Evaluate
                if task_type == 'regression':
                    train_r2 = r2_score(y_train, train_pred)
                    test_r2 = r2_score(y_test, test_pred)
                    test_mse = mean_squared_error(y_test, test_pred)
                    
                    results[name] = {
                        'model': best_model,
                        'train_r2': train_r2,
                        'test_r2': test_r2,
                        'test_mse': test_mse,
                        'train_pred': train_pred,
                        'test_pred': test_pred
                    }
                    
                    self.logger.info(f"    {name} - R²: {test_r2:.4f}, MSE: {test_mse:.4f}")
                    
                else:  # classification
                    train_acc = accuracy_score(y_train, train_pred)
                    test_acc = accuracy_score(y_test, test_pred)
                    
                    results[name] = {
                        'model': best_model,
                        'train_accuracy': train_acc,
                        'test_accuracy': test_acc,
                        'train_pred': train_pred,
                        'test_pred': test_pred
                    }
                    
                    self.logger.info(f"    {name} - Accuracy: {test_acc:.4f}")
                
            except Exception as e:
                self.logger.error(f"    Error training {name}: {e}")
                continue
        
        return results
    
    def create_model_visualizations(self, results, task_type, target_col, X_test, y_test):
        """Create visualizations for model results"""
        viz_dir = self.output_dir / 'model_results'
        viz_dir.mkdir(exist_ok=True)
        
        # Model comparison
        fig, axes = plt.subplots(1, 2, figsize=(15, 6))
        fig.suptitle(f'Model Results - {target_col} ({task_type})', fontsize=16, fontweight='bold')
        
        model_names = list(results.keys())
        
        if task_type == 'regression':
            test_scores = [results[name]['test_r2'] for name in model_names]
            metric_name = 'R² Score'
        else:
            test_scores = [results[name]['test_accuracy'] for name in model_names]
            metric_name = 'Accuracy'
        
        # Performance comparison
        bars = axes[0].bar(model_names, test_scores, color='skyblue', edgecolor='navy')
        axes[0].set_title(f'Model {metric_name} Comparison')
        axes[0].set_ylabel(metric_name)
        axes[0].tick_params(axis='x', rotation=45)
        
        # Add value labels on bars
        for bar, score in zip(bars, test_scores):
            axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, 
                        f'{score:.3f}', ha='center', va='bottom')
        
        # Best model predictions vs actual
        best_model_name = max(model_names, key=lambda x: test_scores[model_names.index(x)])
        best_predictions = results[best_model_name]['test_pred']
        
        if task_type == 'regression':
            axes[1].scatter(y_test, best_predictions, alpha=0.6, color='green')
            axes[1].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 
                        'r--', linewidth=2, label='Perfect Prediction')
            axes[1].set_xlabel('Actual Values')
            axes[1].set_ylabel('Predicted Values')
            axes[1].legend()
        else:
            from sklearn.metrics import confusion_matrix
            cm = confusion_matrix(y_test, best_predictions)
            sns.heatmap(cm, annot=True, fmt='d', ax=axes[1], cmap='Blues')
            axes[1].set_xlabel('Predicted')
            axes[1].set_ylabel('Actual')
        
        axes[1].set_title(f'Best Model: {best_model_name}')
        axes[1].grid(alpha=0.3)
        
        plt.tight_layout()
        plt.savefig(viz_dir / f'{target_col}_{task_type}_results.png', dpi=300, bbox_inches='tight')
        plt.close()
        
        self.logger.info(f"✅ Model visualization saved for {target_col}")
    
    def save_best_model(self, results, task_type, target_col):
        """Save the best performing model"""
        if not results:
            return
        
        model_dir = self.output_dir / 'models'
        model_dir.mkdir(exist_ok=True)
        
        # Find best model
        if task_type == 'regression':
            best_model_name = max(results.keys(), key=lambda x: results[x]['test_r2'])
        else:
            best_model_name = max(results.keys(), key=lambda x: results[x]['test_accuracy'])
        
        best_model = results[best_model_name]['model']
        
        # Save model and preprocessor
        model_info = {
            'model': best_model,
            'preprocessor': self.preprocessor,
            'target_encoder': getattr(self, 'target_encoder', None),
            'task_type': task_type,
            'target_column': target_col,
            'best_model_name': best_model_name,
            'results': results[best_model_name]
        }
        
        model_file = model_dir / f'{target_col}_{task_type}_best_model.pkl'
        joblib.dump(model_info, model_file)
        
        self.best_models[target_col] = model_info
        self.logger.info(f"✅ Best model saved: {best_model_name} for {target_col}")
    
    def generate_summary_report(self):
        """Generate comprehensive summary report"""
        report_file = self.output_dir / 'summary_report.md'
        
        with open(report_file, 'w', encoding='utf-8') as f:
            f.write("# TroTro Unified ML Pipeline - Summary Report\n\n")
            f.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            
            # Dataset Overview
            f.write("## Dataset Overview\n\n")
            if self.combined_data is not None:
                f.write(f"- **Total Records**: {len(self.combined_data):,}\n")
                f.write(f"- **Total Features**: {len(self.combined_data.columns)}\n")
                f.write(f"- **Cities Included**: {self.combined_data['city'].nunique()}\n")
                f.write(f"- **City List**: {', '.join(self.combined_data['city'].unique())}\n\n")
                
                # Missing data summary
                missing_pct = (self.combined_data.isnull().sum().sum() / 
                             (len(self.combined_data) * len(self.combined_data.columns))) * 100
                f.write(f"- **Overall Missing Data**: {missing_pct:.2f}%\n\n")
            
            # Model Results
            f.write("## Model Results Summary\n\n")
            
            if self.best_models:
                f.write("| Target | Task Type | Best Model | Performance |\n")
                f.write("|--------|-----------|------------|-------------|\n")
                
                for target, model_info in self.best_models.items():
                    task_type = model_info['task_type']
                    best_model = model_info['best_model_name']
                    
                    if task_type == 'regression':
                        score = model_info['results']['test_r2']
                        metric = 'R²'
                    else:
                        score = model_info['results']['test_accuracy']
                        metric = 'Accuracy'
                    
                    f.write(f"| {target} | {task_type} | {best_model} | {score:.4f} ({metric}) |\n")
                
                f.write("\n")
            
            # Key Insights
            f.write("## Key Insights\n\n")
            
            if self.best_models:
                # Best performing model overall
                all_scores = []
                for model_info in self.best_models.values():
                    if model_info['task_type'] == 'regression':
                        all_scores.append((model_info['results']['test_r2'], 
                                         model_info['best_model_name'], 'regression'))
                    else:
                        all_scores.append((model_info['results']['test_accuracy'], 
                                         model_info['best_model_name'], 'classification'))
                
                if all_scores:
                    best_overall = max(all_scores, key=lambda x: x[0])
                    f.write(f"- **Best Overall Performance**: {best_overall[1]} "
                          f"({best_overall[2]}) with score {best_overall[0]:.4f}\n")
                
                # Algorithm popularity
                algorithm_counts = {}
                for model_info in self.best_models.values():
                    algo = model_info['best_model_name']
                    algorithm_counts[algo] = algorithm_counts.get(algo, 0) + 1
                
                most_successful = max(algorithm_counts.items(), key=lambda x: x[1])
                f.write(f"- **Most Successful Algorithm**: {most_successful[0]} "
                      f"(won {most_successful[1]} targets)\n")
            
            f.write("\n## Recommendations\n\n")
            f.write("1. **Data Quality**: Focus on improving data collection consistency across cities\n")
            f.write("2. **Feature Engineering**: Consider creating city-specific features or interactions\n")
            f.write("3. **Model Deployment**: Prioritize models with highest performance for production\n")
            f.write("4. **Monitoring**: Set up performance monitoring for deployed models\n\n")
            
            f.write("---\n")
            f.write("*Report generated by Streamlined TroTro ML Pipeline*\n")
        
        self.logger.info(f"✅ Summary report saved: {report_file}")
    
    def run_complete_pipeline(self):
        """Run the complete streamlined ML pipeline"""
        print("\n🚀 Starting Streamlined TroTro ML Pipeline")
        print("="*50)
        
        try:
            # Step 1: Load and combine data
            print("\n📊 Step 1: Loading and combining data...")
            if self.load_and_combine_data() is None:
                print("❌ Failed to load data. Exiting.")
                return
            
            # Step 2: Visualize dataset
            print("\n📈 Step 2: Creating dataset visualizations...")
            self.visualize_dataset()
            
            # Step 3: Identify ML targets
            print("\n🎯 Step 3: Identifying ML targets...")
            targets = self.identify_ml_targets()
            
            if not targets:
                print("❌ No ML targets identified. Exiting.")
                return
            
            # Step 4: Train models for each target
            print(f"\n🤖 Step 4: Training models for {len(targets)} targets...")
            
            for i, (task_type, target_col) in enumerate(targets, 1):
                print(f"\n  Target {i}/{len(targets)}: {target_col} ({task_type})")
                
                # Prepare data
                data_result = self.prepare_data(target_col, task_type)
                if data_result[0] is None:
                    print(f"    ❌ Data preparation failed for {target_col}")
                    continue
                
                X_train, X_test, y_train, y_test, features = data_result
                
                # Train models
                results = self.train_models(X_train, X_test, y_train, y_test, task_type, target_col)
                
                if results:
                    # Create visualizations
                    self.create_model_visualizations(results, task_type, target_col, X_test, y_test)
                    
                    # Save best model
                    self.save_best_model(results, task_type, target_col)
                    
                    print(f"    ✅ Completed {target_col}")
                else:
                    print(f"    ❌ No models trained successfully for {target_col}")
            
            # Step 5: Generate report
            print("\n📋 Step 5: Generating summary report...")
            self.generate_summary_report()
            
            # Final summary
            print(f"\n🎉 Pipeline completed successfully!")
            print(f"📁 Results saved in: {self.output_dir}")
            print(f"📊 Visualizations: {self.output_dir}/visualizations/")
            print(f"🤖 Models: {self.output_dir}/models/")
            print(f"📈 Model Results: {self.output_dir}/model_results/")
            print(f"📋 Summary Report: {self.output_dir}/summary_report.md")
            
            # Print quick results
            if self.best_models:
                print(f"\n📊 Quick Results Summary:")
                print("-" * 40)
                for target, model_info in self.best_models.items():
                    task_type = model_info['task_type']
                    best_model = model_info['best_model_name']
                    
                    if task_type == 'regression':
                        score = model_info['results']['test_r2']
                        metric = 'R²'
                    else:
                        score = model_info['results']['test_accuracy']
                        metric = 'Accuracy'
                    
                    print(f"{target:20} | {best_model:15} | {metric}: {score:.4f}")
            
        except Exception as e:
            self.logger.error(f"Pipeline failed: {e}")
            print(f"❌ Pipeline failed: {e}")
            import traceback
            traceback.print_exc()


# Usage and execution
if __name__ == "__main__":
    # Create and run the streamlined pipeline
    pipeline = StreamlinedTroTroML()
    pipeline.run_complete_pipeline()