In [2]:
import pandas as pd
import re
import os
import logging
from datetime import datetime

# Route every record at DEBUG level and above to 'flood_data_extraction.log'
# (a relative path, so it is resolved against the current working directory).
# Each line is written as "<timestamp> - <level name> - <message>".
logging.basicConfig(filename='flood_data_extraction.log', level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')

def clean_text(text):
    """Collapse each run of whitespace in *text* to one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def extract_date(filename):
    """Return the date embedded in *filename* as an ISO ``YYYY-MM-DD`` string.

    The first ``DD.MM.YYYY`` token found in the name is used.

    Args:
        filename: File name (or any string) to search.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or ``None`` when the name holds
        no ``DD.MM.YYYY`` token or the token is not a real calendar date
        (for example ``31.02.2024``).
    """
    match = re.search(r'(\d{2}\.\d{2}\.\d{4})', filename)
    if match is None:
        return None

    try:
        parsed = datetime.strptime(match.group(1), '%d.%m.%Y')
    except ValueError:
        # The digits have the right shape but do not form a valid date.
        # Report "no date" instead of raising so one badly named file cannot
        # abort a whole batch run.
        return None

    return parsed.strftime('%Y-%m-%d')

def parse_complex_cell(cell):
    """Turn a ``(name | count)`` style cell into a ``{name: count}`` dict.

    Args:
        cell: A cell value of any type.

    Returns:
        A dict mapping each stripped name to its integer count when *cell* is
        a string containing at least one ``(name | digits)`` group. Any other
        value is returned unchanged, including strings that contain ``|`` but
        no parsable group, so their text is not lost.
    """
    if not (isinstance(cell, str) and '|' in cell):
        return cell

    pairs = re.findall(r'\(([^|]+)\s*\|\s*(\d+)\)', cell)
    if not pairs:
        # A pipe without any "(name | count)" group: keep the original text
        # rather than replacing it with an empty dict.
        return cell

    return {name.strip(): int(count) for name, count in pairs}

def process_csv(csv_file):
    """Read a header-less CSV and group its rows by section title.

    A row whose first cell is a non-numeric string starts a new section; the
    cleaned text of that cell becomes the section key. Every later row with a
    non-missing second cell is appended to the current section as a list of
    cleaned, parsed cells (missing cells become empty strings).

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        A dict mapping section title to a list of rows, or ``None`` if the
        file could not be read or an error occurred while walking its rows
        (the error is written to the log).
    """
    try:
        frame = pd.read_csv(csv_file, header=None, skip_blank_lines=True)
    except FileNotFoundError:
        logging.error(f"CSV file not found: {csv_file}")
        return None
    except pd.errors.EmptyDataError:
        logging.error(f"CSV file is empty: {csv_file}")
        return None
    except Exception as e:
        logging.error(f"Error reading CSV file {csv_file}: {str(e)}")
        return None

    sections = {}
    active_section = None

    try:
        for _, record in frame.iterrows():
            first_cell = record[0]

            # A textual, non-numeric first cell marks the start of a section.
            if pd.notna(first_cell) and isinstance(first_cell, str) and not first_cell.isdigit():
                active_section = clean_text(first_cell)
                sections[active_section] = []
                continue

            # Ignore rows before any section and rows with no second cell.
            if not (active_section and pd.notna(record[1])):
                continue

            parsed_cells = []
            for cell in record:
                text = clean_text(str(cell)) if pd.notna(cell) else ''
                parsed_cells.append(parse_complex_cell(text))
            sections[active_section].append(parsed_cells)
    except Exception as e:
        logging.error(f"Error processing data in {csv_file}: {str(e)}")
        return None

    logging.info(f"Successfully processed CSV file: {csv_file}")
    return sections

def extract_with_flexible_columns(data, section_name, default_headers):
    """Build a DataFrame for one section, tolerating a column-count mismatch.

    The section's first row is discarded and the remaining rows form the
    table. Surplus columns beyond ``len(default_headers)`` are dropped; when
    there are fewer columns than headers, only the leading headers are used.

    Args:
        data: Mapping of section name to a list of rows.
        section_name: Key of the section to extract.
        default_headers: Column names to apply, in order.

    Returns:
        The labelled DataFrame, or an empty DataFrame when the section is
        absent or has no rows.
    """
    rows = data.get(section_name, [])
    if not rows:
        return pd.DataFrame()

    table = pd.DataFrame(rows[1:])

    usable = min(len(table.columns), len(default_headers))
    if len(table.columns) > usable:
        table = table.iloc[:, :usable]

    table.columns = default_headers[:usable]
    return table

def extract_bridges_damaged(data):
    """Return the 'Infrastructure Damaged - Bridge' section of *data* as a DataFrame."""
    section = 'Infrastructure Damaged - Bridge'
    columns = [
        'District', 'Number', 'Revenue Circle', 'Bridge Name', 'Department',
        'Village', 'Location', 'Longitude', 'Latitude', 'Remarks',
    ]
    return extract_with_flexible_columns(data, section, columns)

def extract_embankment_affected(data):
    """Return the 'Infrastructure Damaged - Embankment Affected' section of *data* as a DataFrame."""
    section = 'Infrastructure Damaged - Embankment Affected'
    columns = [
        'District', 'Number', 'Revenue Circle', 'Embankment Affected Name',
        'Department', 'Village', 'Location', 'Longitude', 'Latitude', 'Remarks',
    ]
    return extract_with_flexible_columns(data, section, columns)

def extract_embankment_breached(data):
    """Return the 'Infrastructure Damaged - Embankment Breached' section of *data* as a DataFrame."""
    section = 'Infrastructure Damaged - Embankment Breached'
    columns = [
        'District', 'Number', 'Revenue Circle', 'Embankment Breached Name',
        'Department', 'Village', 'Location', 'Longitude', 'Latitude', 'Remarks',
    ]
    return extract_with_flexible_columns(data, section, columns)

def extract_roads_damaged(data):
    """Return the 'Infrastructure Damaged - Road' section of *data* as a DataFrame."""
    section = 'Infrastructure Damaged - Road'
    columns = [
        'District', 'Number', 'Revenue Circle', 'Road Name', 'Department',
        'Village', 'Location', 'Longitude', 'Latitude', 'Remarks',
    ]
    return extract_with_flexible_columns(data, section, columns)

def extract_human_life_lost(data):
    """Return the 'Human Lives Lost - Confirmed' section of *data* as a DataFrame."""
    section = 'Human Lives Lost - Confirmed'
    columns = [
        'District', 'Total', 'Flood Death', 'General Drowning (Non Flood)',
        'Male', 'Female', 'Children Male', 'Children Female', 'Others',
        'Revenue Circlewise',
    ]
    return extract_with_flexible_columns(data, section, columns)

def extract_population_affected(data):
    """Return the 'Population And Crop Area Affected' section of *data* as a DataFrame."""
    section = 'Population And Crop Area Affected'
    columns = [
        'District', 'Male', 'Female', 'Children', 'Total Population',
        'Total Crop Area (in Hect.)', 'Population and Crop Area Details',
    ]
    return extract_with_flexible_columns(data, section, columns)

def extract_villages_affected(data):
    """Return the 'Villages Affected' section of *data* as a DataFrame."""
    section = 'Villages Affected'
    columns = ['District', 'Total', 'Revenue Circle']
    return extract_with_flexible_columns(data, section, columns)

def extract_animals_affected(data):
    """Return the 'Animals Affected' section of *data* as a DataFrame."""
    section = 'Animals Affected'
    columns = ['District', 'Total', 'Big', 'Small', 'Poultry']
    return extract_with_flexible_columns(data, section, columns)

def get_csv_files(directory):
    """List the entries of *directory* named ``DRIMS*.csv``.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        The matching entry names (not full paths), in the order reported by
        ``os.listdir``. An empty list is returned, and the problem logged,
        if the directory is missing, unreadable, or any other error occurs.
    """
    try:
        matches = []
        for entry in os.listdir(directory):
            if entry.startswith('DRIMS') and entry.endswith('.csv'):
                matches.append(entry)
        return matches
    except FileNotFoundError:
        logging.error(f"Directory not found: {directory}")
        return []
    except PermissionError:
        logging.error(f"Permission denied to access directory: {directory}")
        return []
    except Exception as e:
        logging.error(f"Error accessing directory {directory}: {str(e)}")
        return []

def _log_and_print(level, message):
    """Write *message* to the log at *level* and echo it on stdout."""
    logging.log(level, message)
    print(message)


def main(csv_directory, output_directory):
    """Extract every known section from each DRIMS CSV into its own CSV file.

    For each matching file in *csv_directory*, the report date is taken from
    the file name, the file is split into sections, and each non-empty
    section is written to ``<output_directory>/<SECTION>_<date>.csv``. Files
    with no date in their name or that fail to parse are skipped; a failure
    in one section does not stop the remaining sections or files. Progress
    and problems are both logged and printed.

    Args:
        csv_directory: Directory holding the input ``DRIMS*.csv`` files.
        output_directory: Directory for the per-section output files; it is
            created if it does not exist.

    Returns:
        None.
    """
    csv_files = get_csv_files(csv_directory)

    if not csv_files:
        _log_and_print(logging.ERROR, f"No matching CSV files found in the directory: {csv_directory}")
        return

    _log_and_print(logging.INFO, f"Found {len(csv_files)} CSV files to process.")

    if not os.path.exists(output_directory):
        try:
            # exist_ok avoids a spurious failure if the directory appears
            # between the existence check above and this call.
            os.makedirs(output_directory, exist_ok=True)
        except OSError as e:
            _log_and_print(logging.ERROR, f"Failed to create output directory: {output_directory}. Error: {e}")
            return
        _log_and_print(logging.INFO, f"Created output directory: {output_directory}")

    # Output-file prefix -> extractor. Built once; it does not vary per file.
    extractions = {
        'BRIDGES_DAMAGED': extract_bridges_damaged,
        'EMBANKMENT_AFFECTED': extract_embankment_affected,
        'EMBANKMENT_BREACHED': extract_embankment_breached,
        'ROADS_DAMAGED': extract_roads_damaged,
        'HUMAN_LIFE_LOST': extract_human_life_lost,
        'POPULATION_AFFECTED': extract_population_affected,
        'VILLAGES_AFFECTED': extract_villages_affected,
        'ANIMALS_AFFECTED': extract_animals_affected,
    }
    relief_section = 'Relief Camps / Centres Opened'
    relief_headers = ['District', 'Total', 'Relief Camp', 'Relief Distribution Centres']

    for csv_file in csv_files:
        full_path = os.path.join(csv_directory, csv_file)
        _log_and_print(logging.INFO, f"Processing file: {full_path}")

        date = extract_date(csv_file)
        if not date:
            _log_and_print(logging.WARNING, f"Could not extract date from filename: {csv_file}")
            continue

        data = process_csv(full_path)
        if data is None:
            _log_and_print(logging.WARNING, f"Skipping file due to processing error: {csv_file}")
            continue

        for name, extract_func in extractions.items():
            # Broad catch is deliberate: one bad section must not stop the
            # remaining sections or files.
            try:
                df = extract_func(data)
                if df.empty:
                    _log_and_print(logging.WARNING, f"No data extracted for {name} from {csv_file}")
                    continue
                output_file = os.path.join(output_directory, f"{name}_{date}.csv")
                df.to_csv(output_file, index=False)
                _log_and_print(logging.INFO, f"Created {output_file}")
            except Exception as e:
                _log_and_print(logging.ERROR, f"Error extracting {name} from {csv_file}: {e}")

        try:
            df_relief_camps = extract_with_flexible_columns(data, relief_section, relief_headers)
            # Skip empty frames like the other extractions do, so a section
            # holding only its header row does not produce an empty CSV.
            if df_relief_camps.empty:
                _log_and_print(logging.WARNING, f"No relief camps data found in {csv_file}")
            else:
                output_file = os.path.join(output_directory, f"RELIEF_CAMPS_{date}.csv")
                df_relief_camps.to_csv(output_file, index=False)
                _log_and_print(logging.INFO, f"Created {output_file}")
        except Exception as e:
            _log_and_print(logging.ERROR, f"Error processing relief camps data from {csv_file}: {e}")

if __name__ == "__main__":
    # Input and output locations are fixed absolute paths on the author's machine.
    csv_directory = "/home/prajna/civicdatalab/IDS-DRR-Assam/Archive/Scrapers/FRIMS_Daily_Reports_Scraper/DRIMS_Reports_2024"
    output_directory = "/home/prajna/civicdatalab/IDS-DRR-Assam/Archive/Scrapers/FRIMS_Daily_Reports_Scraper/Data_2024/Scraped Data"

    # Only run the extraction when the input directory is actually present.
    if os.path.isdir(csv_directory):
        main(csv_directory, output_directory)
    else:
        logging.error(f"The specified input directory does not exist: {csv_directory}")
        print(f"Error: The specified input directory does not exist: {csv_directory}")

    # The completion notice is emitted on both paths.
    logging.info("Script execution completed.")
    print("Script execution completed.")

Found 84 CSV files to process.
Processing file: /home/prajna/civicdatalab/IDS-DRR-Assam/Archive/Scrapers/FRIMS_Daily_Reports_Scraper/DRIMS_Reports_2024/DRIMS_05.07.2024.csv
Created /home/prajna/civicdatalab/IDS-DRR-Assam/Archive/Scrapers/FRIMS_Daily_Reports_Scraper/Data_2024/Scraped Data/BRIDGES_DAMAGED_2024-07-05.csv
No data extracted for EMBANKMENT_AFFECTED from DRIMS_05.07.2024.csv
No data extracted for EMBANKMENT_BREACHED from DRIMS_05.07.2024.csv
Created /home/prajna/civicdatalab/IDS-DRR-Assam/Archive/Scrapers/FRIMS_Daily_Reports_Scraper/Data_2024/Scraped Data/ROADS_DAMAGED_2024-07-05.csv
Created /home/prajna/civicdatalab/IDS-DRR-Assam/Archive/Scrapers/FRIMS_Daily_Reports_Scraper/Data_2024/Scraped Data/HUMAN_LIFE_LOST_2024-07-05.csv
Created /home/prajna/civicdatalab/IDS-DRR-Assam/Archive/Scrapers/FRIMS_Daily_Reports_Scraper/Data_2024/Scraped Data/POPULATION_AFFECTED_2024-07-05.csv
Created /home/prajna/civicdatalab/IDS-DRR-Assam/Archive/Scrapers/FRIMS_Daily_Reports_Scraper/Data_202