In [2]:
import re
import os
import io
import pandas as pd
import numpy as np
from datetime import datetime
from tabulate import tabulate

# ------------------- CONFIG --------------------
# Accumulator: one cleaned DataFrame per workbook is appended here by the
# scan loop and the pieces are concatenated at the end of the run.
all_data_frames = list()

# Root of the directory tree that is walked for Excel workbooks.
source_folder = r"C:\Users\Ravi Pal\my_projects\project_p767\Taxi_management_db\data\manual_operation_data"

# Output directory. It is created up front but nothing in the visible code
# writes to it afterwards.
# NOTE(review): "manul_files" looks like a typo for "manual_files" — confirm
# against the real directory layout before renaming.
destination_folder = r"C:\Users\Ravi Pal\my_projects\project_p767\Taxi_management_db\data\manul_files"

# exist_ok=True keeps re-runs from failing when the folder is already there.
os.makedirs(destination_folder, exist_ok=True)

if __name__ == "__main__":
    # Scan `source_folder` recursively for Excel workbooks, clean each sheet,
    # tag every row with its source file, and merge the pieces into
    # `master_df`. Cleaned frames are collected in `all_data_frames`.

    # Marker text: a column-4 cell containing it is copied into the
    # 'reporting_location' column; every other row gets an empty string.
    target_str = "EMPLOYEE ADDRESS"
    excel_suffixes = ('.xls', '.xlsx')

    # Workbooks discovered, whether or not they end up merged.
    files_found = 0
    print(f"Scanning folder: {source_folder}...\n")

    for root, dirs, files in os.walk(source_folder):
        # Sort in place so the traversal (and therefore the row order of the
        # merged frame) does not depend on filesystem enumeration order.
        dirs.sort()
        for file in sorted(files):
            # Excel leaves "~$<name>" lock files next to open workbooks;
            # they are not readable workbooks, so skip them outright.
            if file.startswith('~$'):
                continue
            # Compare case-insensitively so ".XLSX" / ".Xls" are not missed.
            if not file.lower().endswith(excel_suffixes):
                continue

            files_found += 1
            file_path = os.path.join(root, file)

            # Best-effort per file: one unreadable workbook must not abort
            # the whole run, so the failure is reported and the loop goes on.
            try:
                # --- Step A: Read the dirty Excel file ---
                # header=None ensures we grab every row as raw data
                raw_df = pd.read_excel(file_path, header=None)

                # If the file is effectively empty, skip it to avoid errors
                if raw_df.empty:
                    print(f"Skipping empty file: {file}")
                    continue

                # --- Step B: "Convert" to CSV in memory ---
                csv_buffer = io.StringIO()
                raw_df.to_csv(csv_buffer, index=False, header=False)
                csv_buffer.seek(0)  # Rewind buffer before reading it back

                # --- Step C: Read it back as a clean CSV ---
                # The round trip re-parses every cell through read_csv.
                try:
                    df = pd.read_csv(csv_buffer, header=None)
                except pd.errors.EmptyDataError:
                    print(f"Skipping file (no columns found): {file}")
                    continue

                # --- Step D: Cleaning logic ---
                # 1. Turn blank / whitespace-only cells into NaN, then drop
                #    rows that are entirely empty.
                # NOTE(review): recent pandas releases appear to emit a
                # FutureWarning about silent downcasting from this replace
                # call — confirm, and opt in to the new behaviour if so.
                df = df.replace(r'^\s*$', np.nan, regex=True)
                df = df.dropna(how='all')
                df = df.reset_index(drop=True)

                # 2. 'reporting_location': only when column 4 exists, so
                #    narrower sheets do not raise a KeyError.
                if 4 in df.columns:
                    df["reporting_location"] = df[4].astype(str).apply(
                        lambda x: x if target_str in x else ""
                    )
                else:
                    df["reporting_location"] = ""

                # 3. Add source filename (useful for debugging later)
                df['source_file'] = file

                # --- Step E: Collect the result ---
                all_data_frames.append(df)
                print(f"Processed: {file} | Rows: {len(df)}")

            except Exception as e:
                print(f"Error processing {file}: {e}")

    # --- Step F: Combine everything ---
    if all_data_frames:
        master_df = pd.concat(all_data_frames, ignore_index=True)
        print("-" * 40)
        print("Processing complete.")
        # Report frames actually merged; `files_found` also counts workbooks
        # that were skipped as empty or failed to parse.
        print(f"Total files merged: {len(all_data_frames)}")
        not_merged = files_found - len(all_data_frames)
        if not_merged > 0:
            print(f"Files skipped or failed: {not_merged}")
        print(f"Total combined rows: {len(master_df)}")

        # Optional: Print first few rows to verify
        # print(tabulate(master_df.head(), headers='keys', tablefmt='psql'))
    else:
        print("No valid data collected from any files.")

Scanning folder: C:\Users\Ravi Pal\my_projects\project_p767\Taxi_management_db\data\manual_operation_data...

Processed: TRG PICKUP 01-12-2025.xlsx | Rows: 81
Processed: TRG PICKUP 02-12-2025.xlsx | Rows: 46
Processed: TRG PICKUP 03-12-2025.xlsx | Rows: 81
Processed: TRG PICKUP 04-12-2025.xlsx | Rows: 106
Processed: TRG PICKUP 05-12-2025.xlsx | Rows: 60
Processed: TRG PICKUP 06-12-2025.xlsx | Rows: 34
Processed: TRG PICKUP 08-12-2025.xlsx | Rows: 96
----------------------------------------
Processing complete.
Total files merged: 7
Total combined rows: 504


  df = df.replace(r'^\s*$', np.nan, regex=True)
