In [2]:
# Install the packages this notebook depends on (pip skips any that are already installed)
%pip install opendatasets pandas numpy legacy-cgi

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Enhanced TroTro Dataset Cleaning and Analysis
import opendatasets as od
import pandas as pd
import numpy as np
import os
import warnings
from datetime import datetime
import re

# Suppress warnings for cleaner output.
# NOTE(review): with no category/module argument this filter hides every warning raised
# after this cell runs (pandas deprecation and dtype warnings included) — confirm that
# is intended, since it can mask real data-loading problems.
warnings.filterwarnings('ignore')

# Banner and start timestamp so the captured cell output records when the run began.
print("=== TroTro Dataset Cleaning and Analysis ===")
print(f"Starting analysis at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# 1. Download dataset with error handling
def download_dataset():
    """Fetch the TroTro dataset from Kaggle through ``opendatasets``.

    Any exception raised while downloading is caught, reported on stdout
    together with a hint about Kaggle credentials, and turned into a
    ``False`` return value instead of propagating.

    Returns:
        bool: ``True`` if ``od.download`` finished without raising,
        ``False`` otherwise.
    """
    kaggle_url = 'https://www.kaggle.com/datasets/godfredaddaiamoako/trotro'
    try:
        print("Downloading dataset...")
        od.download(kaggle_url)
        print("✓ Dataset downloaded successfully")
    except Exception as err:
        print(f"⚠ Error downloading dataset: {err}")
        print("Please ensure you have Kaggle credentials configured")
        succeeded = False
    else:
        succeeded = True
    return succeeded

# 2. Enhanced data cleaning function
# Text forms treated as missing once a column has been cast to str
# ('nan' / 'None' are what ``astype(str)`` produces for NaN / None).
_NULL_TOKENS = ('nan', 'NaN', 'None', '')


def _is_text_column(series):
    """Return True for object- or string-typed columns."""
    return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)


def _normalise_text_columns(df_clean):
    """Strip/collapse whitespace in text columns and restore missing markers (in place)."""
    for col in df_clean.columns:
        column = df_clean[col]
        if not _is_text_column(column):
            continue
        # Remember real missing values first: astype(str) would turn them into text.
        originally_missing = column.isna()
        text = column.astype(str).str.strip().str.replace(r'\s+', ' ', regex=True)
        # mask() keeps the column's dtype, so later string handling never sees a
        # column that was silently downcast to float.
        df_clean[col] = text.mask(originally_missing | text.isin(_NULL_TOKENS))


def _fill_missing_values(df_clean):
    """Fill gaps in place: median for real-valued numpy columns, else mode or 'Unknown'."""
    for col in df_clean.columns:
        column = df_clean[col]
        if not column.isna().any():
            continue
        dtype = column.dtype
        # kind 'i'/'u'/'f' covers every integer/float width, not only int64/float64.
        if isinstance(dtype, np.dtype) and dtype.kind in 'iuf':
            fill_value = column.median()
        else:
            modes = column.mode()
            fill_value = modes.iloc[0] if len(modes) > 0 else 'Unknown'
        df_clean[col] = column.fillna(fill_value)


def _convert_date_columns(df_clean):
    """Convert text columns named like dates/times to datetime when that is lossless (in place)."""
    for col in df_clean.columns:
        lowered = col.lower()
        if 'date' not in lowered and 'time' not in lowered:
            continue
        column = df_clean[col]
        # Numeric columns (flags, counts, durations) would be read as offsets from
        # the 1970 epoch, and datetime columns need no work.
        if pd.api.types.is_numeric_dtype(column) or pd.api.types.is_datetime64_any_dtype(column):
            continue
        try:
            parsed = pd.to_datetime(column, errors='coerce')
        except (ValueError, TypeError) as exc:
            print(f"  - Could not parse '{col}' as datetime: {exc}")
            continue
        # errors='coerce' turns unparseable values into NaT; adopting such a result
        # would reintroduce missing values after they have just been filled.
        if parsed.notna().sum() < column.notna().sum():
            print(f"  - Left '{col}' unchanged (not every value parses as a datetime)")
            continue
        df_clean[col] = parsed
        print(f"  - Converted '{col}' to datetime")


def _cap_outliers(df_clean, iqr_multiplier):
    """Clip numeric columns to [Q1 - k*IQR, Q3 + k*IQR] in place, skipping zero-IQR columns."""
    for col in df_clean.select_dtypes(include=[np.number]).columns:
        column = df_clean[col]
        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)
        iqr = q3 - q1
        # IQR == 0 means at least half the values are identical; both bounds would
        # equal that value and clipping would erase every other value (e.g. 0/1 flags).
        # NaN happens for empty columns.
        if pd.isna(iqr) or iqr == 0:
            continue
        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr
        outlier_count = int(((column < lower_bound) | (column > upper_bound)).sum())
        if outlier_count > 0:
            df_clean[col] = column.clip(lower=lower_bound, upper=upper_bound)
            print(f"  - Capped {outlier_count} outliers in '{col}'")


def clean_data(df, filename="", missing_threshold=0.7, iqr_multiplier=1.5):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. Drop rows and columns that are entirely empty.
      2. Normalise column names: cast to str, strip, lower-case, and replace
         spaces and non-word characters with underscores.
      3. Normalise text columns: strip, collapse whitespace runs to one space,
         and treat 'nan' / 'NaN' / 'None' / '' as missing.
      4. Drop duplicate rows. This runs after step 3 so rows that differ only
         in whitespace count as duplicates.
      5. Drop columns whose share of missing values exceeds ``missing_threshold``.
      6. Fill remaining gaps: median for integer/float columns, otherwise the
         most frequent value, or 'Unknown' if there is none.
      7. Convert text columns whose name contains 'date' or 'time' to datetime,
         but only when every value parses.
      8. Clip numeric columns to ``iqr_multiplier`` times the IQR beyond the
         quartiles; columns with a zero IQR are left untouched.

    Progress messages are printed for each step that changes the data.

    Args:
        df: Input ``pandas.DataFrame``.
        filename: Label used only in the printed messages.
        missing_threshold: Fraction of missing values above which a column is
            dropped (default 0.7).
        iqr_multiplier: Width of the outlier fence in IQR units (default 1.5).

    Returns:
        pandas.DataFrame: The cleaned frame.
    """
    print(f"\nCleaning data for {filename}...")
    original_shape = df.shape

    # 1. Remove completely empty rows and columns; copy so the caller's frame is untouched.
    df_clean = df.dropna(how='all').dropna(axis=1, how='all').copy()

    # 2. Clean column names (astype(str) keeps .str usable for numeric headers).
    df_clean.columns = (
        df_clean.columns.astype(str)
        .str.strip()
        .str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace(r'[^\w]', '_', regex=True)
    )

    # 3. Clean text columns.
    _normalise_text_columns(df_clean)

    # 4. Handle duplicates.
    rows_before = len(df_clean)
    df_clean = df_clean.drop_duplicates()
    duplicates_removed = rows_before - len(df_clean)
    if duplicates_removed > 0:
        print(f"  - Removed {duplicates_removed} duplicate rows")

    # 5. Drop columns that are mostly missing. isna().mean() is NaN for a frame with
    #    no rows, and NaN > threshold is False, so nothing is dropped in that case.
    missing_share = df_clean.isna().mean()
    sparse_columns = missing_share[missing_share > missing_threshold].index.tolist()
    for col in sparse_columns:
        print(f"  - Dropped column '{col}' (>{missing_threshold*100}% missing)")
    df_clean = df_clean.drop(columns=sparse_columns)

    # 6-8. Impute, convert dates, cap outliers.
    _fill_missing_values(df_clean)
    _convert_date_columns(df_clean)
    _cap_outliers(df_clean, iqr_multiplier)

    print(f"  ✓ Cleaned data shape: {original_shape} → {df_clean.shape}")
    return df_clean

# 3. Data analysis function
def analyze_data(df, filename=""):
    """Print a summary report for ``df`` and hand the frame back unchanged.

    The report covers, in order: shape, deep memory usage in MB, per-column
    missing-value counts (or a note that there are none), a count of columns
    per dtype, ``describe()`` for numeric columns, the top value counts for
    up to five object columns, and the first five rows.

    Args:
        df: The ``pandas.DataFrame`` to describe.
        filename: Label used in the report header.

    Returns:
        The same ``df`` object that was passed in.
    """
    print(f"\n--- Analysis for {filename} ---")
    print(f"Shape: {df.shape}")
    total_bytes = df.memory_usage(deep=True).sum()
    print(f"Memory usage: {total_bytes / 1024**2:.2f} MB")

    # Missing values: list only the columns that actually have gaps.
    null_counts = df.isnull().sum()
    has_missing = null_counts.sum() > 0
    if has_missing:
        print("\nMissing values:")
        print(null_counts[null_counts > 0])
    else:
        print("\n✓ No missing values!")

    # How many columns there are of each dtype.
    print("\nData types:")
    print(df.dtypes.value_counts())

    # Descriptive statistics for the numeric columns, if any.
    numeric_names = df.select_dtypes(include=[np.number]).columns
    if len(numeric_names) > 0:
        print("\nNumeric columns summary:")
        print(df[numeric_names].describe())

    # Most frequent values for at most the first five object columns.
    object_names = df.select_dtypes(include=['object']).columns
    if len(object_names) > 0:
        print("\nCategorical columns (top 5 each):")
        for name in object_names[:5]:
            print(f"\n{name}:")
            print(df[name].value_counts().head())

    print("\nSample data (first 5 rows):")
    print(df.head())

    return df

# 4. Main execution
# Candidate field separators, in priority order for ties, with the label used in log output.
_DELIMITERS = (('\t', 'tab'), (',', 'comma'), ('|', 'pipe'), (';', 'semicolon'))


def _is_txt(name):
    """Return True for file names ending in .txt, ignoring case."""
    return name.lower().endswith('.txt')


def _locate_data_dir(preferred, keyword='trotro'):
    """Return the first directory holding .txt files, searching candidate roots recursively.

    Candidate roots are ``preferred`` (if it is a directory) followed by every
    directory in the working directory whose name contains ``keyword``.
    Returns the first candidate root when none of them contains a .txt file,
    or ``None`` when there is no candidate directory at all.
    """
    roots = [preferred] if os.path.isdir(preferred) else []
    # isdir() filters out plain files such as this notebook's own CSV outputs,
    # whose names also contain the keyword.
    roots.extend(sorted(
        entry for entry in os.listdir('.')
        if keyword in entry.lower() and os.path.isdir(entry)
    ))
    for root in roots:
        for current, subdirs, files in os.walk(root):
            subdirs.sort()  # deterministic traversal order
            if any(_is_txt(name) for name in files):
                return current
    return roots[0] if roots else None


def _read_delimited(file_path):
    """Load a delimited text file, choosing the separator from its header line.

    Returns a ``(DataFrame, label)`` tuple where ``label`` names the separator used.
    """
    # read_csv does not raise when given the wrong separator (it returns a single
    # wide column), so the separator has to be chosen before parsing.
    with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as handle:
        header = handle.readline().rstrip('\r\n')
    sep, label = max(_DELIMITERS, key=lambda pair: header.count(pair[0]))
    if header.count(sep) == 0:
        if any(ch.isspace() for ch in header):
            # No explicit delimiter but whitespace present: whitespace-separated columns.
            return pd.read_csv(file_path, sep=r'\s+', engine='python'), 'space'
        sep, label = _DELIMITERS[0]  # single-column file; any separator works
    return pd.read_csv(file_path, sep=sep), label


def main():
    """Download the dataset, then clean, analyse and save every .txt file in it.

    For each text file found in the dataset directory the function loads it,
    runs ``clean_data`` and ``analyze_data``, and writes ``cleaned_<name>.csv``
    to the working directory. A failure on one file is reported and does not
    stop the others. When more than one file was cleaned and they share
    columns, the shared columns are stacked (with a ``source_file`` column)
    into ``combined_trotro_data.csv``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Download dataset
    if not download_dataset():
        print("Failed to download dataset. Please check your Kaggle credentials.")
        return

    # Find data directory (the archive layout may nest the files one level down)
    data_dir = _locate_data_dir('trotro/trotrolive-datasets')
    if data_dir is None:
        print("Could not find dataset directory")
        return

    print(f"Using data directory: {data_dir}")

    # Find TXT files
    try:
        data_files = sorted(name for name in os.listdir(data_dir) if _is_txt(name))
        print(f"Found {len(data_files)} TXT files: {data_files}")
    except OSError as e:
        print(f"Error accessing data directory: {e}")
        return

    if not data_files:
        print("No TXT files found in the directory")
        return

    # Process each file
    cleaned_dataframes = {}

    for file in data_files:
        try:
            file_path = os.path.join(data_dir, file)
            print(f"\n{'='*50}")
            print(f"Processing: {file}")
            print(f"{'='*50}")

            df, separator_label = _read_delimited(file_path)
            print(f"Original data loaded with {separator_label} separator: {df.shape}")

            # Check if data loaded properly
            if df.empty:
                print(f"⚠ Warning: {file} appears to be empty or couldn't be parsed")
                continue

            # Clean, then report on the cleaned data
            df_clean = clean_data(df, file)
            analyze_data(df_clean, file)

            # Save as CSV; splitext swaps only the real extension
            cleaned_filename = f'cleaned_{os.path.splitext(file)[0]}.csv'
            df_clean.to_csv(cleaned_filename, index=False)
            print(f"✓ Saved cleaned data to: {cleaned_filename}")

            # Store for potential combination
            cleaned_dataframes[file] = df_clean

        except Exception as e:
            # Best-effort batch: report the failure and carry on with the next file.
            print(f"❌ Error processing {file}: {e}")
            continue

    # Combine data if multiple files exist and have similar structure
    if len(cleaned_dataframes) > 1:
        print(f"\n{'='*50}")
        print("ATTEMPTING TO COMBINE DATASETS")
        print(f"{'='*50}")

        try:
            column_sets = [set(frame.columns) for frame in cleaned_dataframes.values()]
            # sorted() gives the combined file a stable column order across runs
            common_columns = sorted(set.intersection(*column_sets))

            if common_columns:
                print(f"Found {len(common_columns)} common columns")

                # Combine datasets using common columns
                combined_dfs = []
                for filename, frame in cleaned_dataframes.items():
                    subset = frame[common_columns].copy()
                    subset['source_file'] = filename
                    combined_dfs.append(subset)

                combined_data = pd.concat(combined_dfs, ignore_index=True)

                # Analyze combined data
                print("\nCombined dataset analysis:")
                analyze_data(combined_data, "Combined Dataset")

                # Save combined data
                combined_data.to_csv('combined_trotro_data.csv', index=False)
                print("✓ Saved combined data to: combined_trotro_data.csv")

            else:
                print("⚠ No common columns found - cannot combine datasets")

        except Exception as e:
            print(f"❌ Error combining datasets: {e}")

    print(f"\n{'='*50}")
    print("DATA CLEANING COMPLETE!")
    print(f"Finished at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*50}")

# Execute the main function.
# In a notebook cell __name__ is "__main__", so the pipeline runs whenever this cell is
# executed; the guard only prevents it from running if the code is imported as a module.
if __name__ == "__main__":
    main()

=== TroTro Dataset Cleaning and Analysis ===
Starting analysis at: 2025-08-10 12:06:00
Downloading dataset...
Skipping, found downloaded files in ".\trotro" (use force=True to force download)
✓ Dataset downloaded successfully
Using data directory: trotro
Found 0 TXT files: []
No TXT files found in the directory
