In [None]:
# Install required packages if not already installed
%pip install opendatasets pandas numpy legacy-cgi

In [18]:
# Enhanced TroTro Dataset Cleaning and Analysis
import opendatasets as od
import pandas as pd
import numpy as np
import os
import warnings
from datetime import datetime
import re

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

print("=== TroTro Dataset Cleaning and Analysis ===")
print(f"Starting analysis at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# 1. Download dataset with error handling
def download_dataset():
    """Download the TroTro dataset from Kaggle"""
    try:
        dataset_url = 'https://www.kaggle.com/datasets/godfredaddaiamoako/trotro'
        print("Downloading dataset...")
        od.download(dataset_url)
        print("✓ Dataset downloaded successfully")
    except Exception as e:
        print(f"⚠ Error downloading dataset: {e}")
        print("Please ensure you have Kaggle credentials configured")
        return False
    return True

# 2. Enhanced data cleaning function
def clean_data(df, filename=""):
    """Comprehensive data cleaning function"""
    print(f"\nCleaning data for {filename}...")
    original_shape = df.shape
    
    # Create a copy to avoid modifying original
    df_clean = df.copy()
    
    # 1. Remove completely empty rows and columns
    df_clean = df_clean.dropna(how='all').dropna(axis=1, how='all')
    
    # 2. Clean column names
    df_clean.columns = df_clean.columns.str.strip().str.lower()
    df_clean.columns = df_clean.columns.str.replace(' ', '_').str.replace(r'[^\w]', '_', regex=True)
    
    # 3. Handle duplicates
    initial_rows = len(df_clean)
    df_clean = df_clean.drop_duplicates()
    duplicates_removed = initial_rows - len(df_clean)
    if duplicates_removed > 0:
        print(f"  - Removed {duplicates_removed} duplicate rows")
    
    # 4. Clean text columns
    text_columns = df_clean.select_dtypes(include=['object']).columns
    for col in text_columns:
        if col in df_clean.columns:
            # Strip whitespace and handle common issues
            df_clean[col] = df_clean[col].astype(str).str.strip()
            df_clean[col] = df_clean[col].replace(['nan', 'NaN', 'None', ''], np.nan)
            
            # Clean special characters and normalize text
            df_clean[col] = df_clean[col].str.replace(r'\s+', ' ', regex=True)
            
    # 5. Handle missing values intelligently
    missing_threshold = 0.7  # Drop columns with >70% missing data
    for col in df_clean.columns:
        missing_pct = df_clean[col].isnull().sum() / len(df_clean)
        if missing_pct > missing_threshold:
            print(f"  - Dropped column '{col}' (>{missing_threshold*100}% missing)")
            df_clean = df_clean.drop(columns=[col])
    
    # 6. Fill remaining missing values based on data type
    for col in df_clean.columns:
        if df_clean[col].isnull().any():
            if df_clean[col].dtype in ['int64', 'float64']:
                # For numeric columns, use median
                df_clean[col] = df_clean[col].fillna(df_clean[col].median())
            else:
                # For categorical columns, use mode or 'Unknown'
                mode_val = df_clean[col].mode()
                if len(mode_val) > 0:
                    df_clean[col] = df_clean[col].fillna(mode_val[0])
                else:
                    df_clean[col] = df_clean[col].fillna('Unknown')
    
    # 7. Detect and handle potential date columns
    potential_date_cols = [col for col in df_clean.columns if 'date' in col.lower() or 'time' in col.lower()]
    for col in potential_date_cols:
        try:
            df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')
            print(f"  - Converted '{col}' to datetime")
        except:
            pass
    
    # 8. Clean numeric columns
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        # Remove outliers using IQR method
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        outliers_before = len(df_clean[(df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)])
        df_clean[col] = df_clean[col].clip(lower=lower_bound, upper=upper_bound)
        if outliers_before > 0:
            print(f"  - Capped {outliers_before} outliers in '{col}'")
    
    print(f"  ✓ Cleaned data shape: {original_shape} → {df_clean.shape}")
    return df_clean

# 3. Data analysis function
def analyze_data(df, filename=""):
    """Perform comprehensive data analysis"""
    print(f"\n--- Analysis for {filename} ---")
    print(f"Shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    
    # Missing values analysis
    missing_data = df.isnull().sum()
    if missing_data.sum() > 0:
        print("\nMissing values:")
        print(missing_data[missing_data > 0])
    else:
        print("\n✓ No missing values!")
    
    # Data types
    print(f"\nData types:")
    print(df.dtypes.value_counts())
    
    # Numeric columns summary
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        print(f"\nNumeric columns summary:")
        print(df[numeric_cols].describe())
    
    # Categorical columns info
    categorical_cols = df.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        print(f"\nCategorical columns (top 5 each):")
        for col in categorical_cols[:5]:  # Limit to first 5 to avoid clutter
            print(f"\n{col}:")
            print(df[col].value_counts().head())
    
    # Display sample
    print(f"\nSample data (first 5 rows):")
    print(df.head())
    
    return df

# 4. Main execution
def main():
    """Main execution function"""
    
    # Download dataset
    if not download_dataset():
        print("Failed to download dataset. Please check your Kaggle credentials.")
        return
    
    # ------------------------------------------
    # 1. FIND DATA DIRECTORY
    # ------------------------------------------
    data_dir = './trotro/trotrolive-datasets'
    if not os.path.exists(data_dir):
        possible_dirs = [d for d in os.listdir('.') if 'trotro' in d.lower()]
        if possible_dirs:
            data_dir = possible_dirs[0]
        else:
            print("Could not find dataset directory")
            exit()
    
    print(f"Using data directory: {data_dir}")
    
    # ------------------------------------------
    # 2. FIND TXT FILES (RECURSIVE)
    # ------------------------------------------
    try:
        data_files = []
        for root, _, files in os.walk(data_dir):
            for f in files:
                if f.endswith('.txt'):
                    rel_path = os.path.relpath(os.path.join(root, f), data_dir)
                    data_files.append(rel_path)
        print(f"Found {len(data_files)} TXT files.")
    except Exception as e:
        print(f"Error accessing data directory: {e}")
        exit()
    
    if not data_files:
        print("No TXT files found in the directory")
        exit()
    
    # ------------------------------------------
    # 3. CLEAN AND SAVE CSVs (MIRROR FOLDER STRUCTURE)
    # ------------------------------------------
    for file in data_files:
        try:
            file_path = os.path.join(data_dir, file)
            print(f"\n{'='*50}")
            print(f"Processing: {file}")
            print(f"{'='*50}")
    
            # Try multiple separators — validate that parsing actually worked
            possible_separators = ["\t", ",", "|", r"\s+"]
            df = None
            for sep in possible_separators:
                try:
                    if sep == r"\s+":
                        temp_df = pd.read_csv(file_path, sep=sep, engine="python")
                    else:
                        temp_df = pd.read_csv(file_path, sep=sep)
    
                    # Check if we got multiple columns
                    if temp_df.shape[1] > 1:
                        df = temp_df
                        print(f"Loaded with separator {repr(sep)}: {df.shape}")
                        break
                except Exception as e:
                    continue
    
            if df is None or df.empty:
                print(f"⚠ Warning: {file} could not be parsed into multiple columns")
                continue
    
            # Clean
            df_clean = clean_data(df, file)
    
            # Analyze
            analyze_data(df_clean, file)
    
            # Build output path mirroring original folder structure
            cleaned_filename = f'cleaned_{os.path.basename(file).replace(".txt", ".csv")}'
            output_path = os.path.join("cleaned_data", os.path.dirname(file), cleaned_filename)
    
            # Ensure folder exists
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
    
            # Save
            df_clean.to_csv(output_path, index=False)
            print(f"✓ Saved cleaned data to: {output_path}")
    
        except Exception as e:
            print(f"❌ Error processing {file}: {e}")
            continue

    
    # ------------------------------------------
    # 4. COMBINE CLEANED CSV FILES BY BASE NAME
    # ------------------------------------------
    print(f"\n{'='*50}")
    print("ATTEMPTING TO COMBINE CLEANED DATASETS BY FILE NAME")
    print(f"{'='*50}")
    
    grouped_cleaned = {}
    cleaned_root = "cleaned_data"
    
    if not os.path.exists(cleaned_root):
        print("No cleaned_data directory found, skipping combine step")
    else:
        total_found = 0
        for root, _, files in os.walk(cleaned_root):
            for f in files:
                if not f.lower().endswith(".csv"):
                    continue
                total_found += 1
                file_path = os.path.join(root, f)
    
                # Match back to original TXT filename
                filename = f
                if filename.startswith("cleaned_"):
                    filename = filename[len("cleaned_"):]
                base_name = os.path.splitext(filename)[0] + ".txt"
    
                try:
                    df = pd.read_csv(file_path, sep=",")  # Ensure correct delimiter
                    grouped_cleaned.setdefault(base_name, []).append(df)
                except Exception as e:
                    print(f"❌ Error loading cleaned CSV {file_path}: {e}")
    
        print(f"Found {total_found} cleaned CSV files across cleaned_data; {len(grouped_cleaned)} groups")
    
        # ------------------------------------------
        # 5. COMBINE PER FILE GROUP
        # ------------------------------------------
        for base_name, df_list in grouped_cleaned.items():
            try:
                if len(df_list) > 1:
                    all_columns = [set(df.columns) for df in df_list]
                    common_columns = set.intersection(*all_columns)
                else:
                    common_columns = set(df_list[0].columns)
    
                if not common_columns:
                    print(f"⚠ No common columns found for '{base_name}' - skipping")
                    continue
    
                if len(df_list) > 1:
                    print(f"\nCombining {len(df_list)} files for '{base_name}' with {len(common_columns)} common columns")
                else:
                    print(f"\nOnly one cleaned file for '{base_name}', saving as combined output")
    
                # Keep column order from first DF
                reference_df = df_list[0]
                common_cols_ordered = [c for c in reference_df.columns if c in common_columns]
    
                combined_dfs = [df[common_cols_ordered].copy() for df in df_list]
                combined_data = pd.concat(combined_dfs, ignore_index=True)
    
                # Analyze
                analyze_data(combined_data, f"Combined {base_name}")
    
                # Save
                os.makedirs("combined_data", exist_ok=True)
                combined_filename = os.path.splitext(base_name)[0] + ".csv"
                combined_path = os.path.join("combined_data", combined_filename)
                combined_data.to_csv(combined_path, index=False)
                print(f"✓ Saved combined data to: {combined_path}")
    
            except Exception as e:
                print(f"❌ Error combining datasets for '{base_name}': {e}")
    
    print(f"\n{'='*50}")
    print("DATA CLEANING COMPLETE!")
    print(f"Finished at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*50}")

# Execute the main function
if __name__ == "__main__":
    main()

=== TroTro Dataset Cleaning and Analysis ===
Starting analysis at: 2025-08-11 02:56:13
Downloading dataset...
Skipping, found downloaded files in ".\trotro" (use force=True to force download)
✓ Dataset downloaded successfully
Using data directory: ./trotro/trotrolive-datasets
Found 110 TXT files.

Processing: abidjan\agency.txt
Loaded with separator ',': (25, 8)

Cleaning data for abidjan\agency.txt...
  - Converted 'agency_timezone' to datetime
  ✓ Cleaned data shape: (25, 8) → (25, 4)

--- Analysis for abidjan\agency.txt ---
Shape: (25, 4)
Memory usage: 0.01 MB

Missing values:
agency_timezone    25
dtype: int64

Data types:
object            3
datetime64[ns]    1
Name: count, dtype: int64

Categorical columns (top 5 each):

agency_id:
agency_id
Gbaka d'Abobo           1
Gbaka d'Adjamé          1
Gbaka d'Attécoubé       1
Gbaka de Bingerville    1
Gbaka de Cocody         1
Name: count, dtype: int64

agency_name:
agency_name
Woro-woro d'Attécoubé    3
Gbaka d'Adjamé           1
Gbaka 