In [30]:
import pandas as pd
import os
import glob


In [31]:
district_codes = {
    'DHUBRI': '2011Dhub',
    'SOUTH SALMARA MANCACHAR': '2019Sout',
    'MORIGAON': '2011Mori',
    'TINSUKIA': '2011Tins',
    'CHARAIDEO': '2019Char',
    'GOLAGHAT': '2011Gola',
    'DHEMAJI': '2011Dhem',
    'K.ANGLONG': '2011Karb',
    'KOKRAJHAR': '2011Kokr',
    'BONGAIGAON': '2011Bong',
    'NAGAON': '2011Naga',
    'HAILAKANDI': '2011Hail',
    'CHIRANG': '2011Chir',
    'WEST KARBI ANGLONG': '2019West',
    'LAKHIMPUR': '2011Lakh',
    'GOALPARA': '2011Goal',
    'CACHAR': '2011Cach',
    'MAJULI': '2019Maju',
    'NALBARI': '2011Nalb',
    'DIMA HASAO': '2011Dima',
    'UDALGURI': '2011Udal',
    'JORHAT': '2011Jorh',
    'HOJAI': '2019Hoja',
    'BISWANATH': '2019Bisw',
    'SONITPUR': '2011Soni',
    'DARRANG': '2011Darr',
    'DIBRUGARH': '2011Dibr',
    'KARIMGANJ': '2011Kari',
    'KAMRUP METRO': '2011KamrM',
    'KAMRUP': '2011Kamr',
    'SIVSAGAR': '2011Siva',
    'BARPETA': '2011Barp',
    'BAJALI': '2022Baja',
    'TAMULPUR': '2011Tamu',
    'BAKSA': '2011Baks'
}

In [38]:
import pandas as pd
import numpy as np
import os
import glob
import re
import logging
from datetime import datetime

# Set up logging
logging.basicConfig(filename='flood_data_consolidation.log', level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Console handler
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
logging.getLogger().addHandler(console_handler)

district_codes = {
    'DHUBRI': '2011Dhub', 'SOUTH SALMARA MANCACHAR': '2019Sout', 'MORIGAON': '2011Mori',
    'TINSUKIA': '2011Tins', 'CHARAIDEO': '2019Char', 'GOLAGHAT': '2011Gola',
    'DHEMAJI': '2011Dhem', 'K.ANGLONG': '2011Karb', 'KOKRAJHAR': '2011Kokr',
    'BONGAIGAON': '2011Bong', 'NAGAON': '2011Naga', 'HAILAKANDI': '2011Hail',
    'CHIRANG': '2011Chir', 'WEST KARBI ANGLONG': '2019West', 'LAKHIMPUR': '2011Lakh',
    'GOALPARA': '2011Goal', 'CACHAR': '2011Cach', 'MAJULI': '2019Maju',
    'NALBARI': '2011Nalb', 'DIMA HASAO': '2011Dima', 'UDALGURI': '2011Udal',
    'JORHAT': '2011Jorh', 'HOJAI': '2019Hoja', 'BISWANATH': '2019Bisw',
    'SONITPUR': '2011Soni', 'DARRANG': '2011Darr', 'DIBRUGARH': '2011Dibr',
    'KARIMGANJ': '2011Kari', 'KAMRUP METRO': '2011KamrM', 'KAMRUP': '2011Kamr',
    'SIVASAGAR': '2011Siva', 'BARPETA': '2011Barp', 'BAJALI': '2022Baja',
    'TAMULPUR': '2011Tamu', 'BAKSA': '2011Baks', 'KARBI ANGLONG WEST': '2019West'
}

# Load the revenue circle to district mapping
try:
    revenue_circle_mapping = pd.read_csv('/home/prajna/civicdatalab/IDS-DRR-Assam/Archive/Scrapers/FRIMS_Daily_Reports_Scraper/Data/Assam_Maps/ASSAM_REVENUE_CIRCLES_FRIMS_NAMES.csv')
    logging.info("Revenue circle mapping loaded successfully")
    revenue_circle_dict = dict(zip(revenue_circle_mapping['revenue_ci'], revenue_circle_mapping['district_35']))
except Exception as e:
    logging.error(f"Error loading revenue circle mapping: {str(e)}")
    revenue_circle_dict = {}

def clean_text(text):
    if pd.isna(text) or text == '':
        return ''
    text = str(text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def safe_upper(x):
    return x.upper() if isinstance(x, str) else x

def process_dataframe(df):
    try:
        for col in df.columns:
            if df[col].dtype == 'object':
                df[col] = df[col].apply(clean_text)
        return df
    except Exception as e:
        logging.error(f"Error processing dataframe: {str(e)}")
        return None

def consolidate_data(input_directory, output_directory, damage_type):
    try:
        files = glob.glob(os.path.join(input_directory, f"{damage_type}_*.csv"))
        logging.info(f"Found {len(files)} files for {damage_type}")

        if not files:
            logging.warning(f"No files found for damage type: {damage_type}")
            return

        all_data = []
        for file in files:
            try:
                date = datetime.strptime(os.path.basename(file).split('_')[2].split('.')[0], '%Y-%m-%d')
                df = pd.read_csv(file)
                logging.info(f"Read file: {file}, shape: {df.shape}")
                df = process_dataframe(df)
                if df is not None:
                    df['Date'] = date
                    all_data.append(df)
            except Exception as e:
                logging.error(f"Error processing file {file}: {str(e)}")

        if not all_data:
            logging.warning(f"No valid data found for damage type: {damage_type}")
            return

        consolidated_df = pd.concat(all_data, ignore_index=True)
        logging.info(f"Consolidated data shape: {consolidated_df.shape}")

        consolidated_df['state'] = 'ASSAM'
        consolidated_df['ID'] = consolidated_df.groupby(['Date', 'District']).cumcount() + 1
        consolidated_df['District'] = consolidated_df['District'].apply(safe_upper)
        consolidated_df['assam_dist'] = consolidated_df['District'].map(district_codes)

        if 'Number' not in consolidated_df.columns:
            consolidated_df['Number'] = 1

        if 'Revenue Circle' in consolidated_df.columns:
            consolidated_df['District'] = consolidated_df['Revenue Circle'].map(revenue_circle_dict).fillna(consolidated_df['District'])

        detail_columns = [col for col in consolidated_df.columns if col not in ['Date', 'District', 'Number', 'state', 'ID', 'assam_dist', 'Revenue Circle']]

        consolidated_df['Details'] = consolidated_df[detail_columns].apply(
            lambda row: ' | '.join([f"{col}: {val}" for col, val in row.items() if pd.notna(val) and val != '']), 
            axis=1
        )

        # District Wise consolidation
        district_df = consolidated_df.groupby(['Date', 'District']).agg({
            'Number': 'sum',
            'Details': lambda x: ' | '.join(x),
            'state': 'first',
            'assam_dist': 'first'
        }).reset_index()

        district_df['ID'] = district_df.groupby('Date').cumcount() + 1

        district_output_columns = ['Date', 'District', 'Number', 'Details', 'state', 'ID', 'assam_dist']
        district_output_df = district_df[district_output_columns]

        district_output_file = os.path.join(output_directory, f"DISTRICT_WISE_DRIMS_{damage_type}_MASTER_2024.csv")
        district_output_df.to_csv(district_output_file, index=False)
        logging.info(f"Created {district_output_file}")

        # Revenue Circle Wise consolidation (if applicable)
        if 'Revenue Circle' in consolidated_df.columns:
            revenue_df = consolidated_df.groupby(['Date', 'District', 'Revenue Circle']).agg({
                'Number': 'sum',
                'Details': lambda x: ' | '.join(x),
                'state': 'first',
                'assam_dist': 'first'
            }).reset_index()

            revenue_df['ID'] = revenue_df.groupby(['Date', 'District']).cumcount() + 1

            revenue_output_columns = ['Date', 'District', 'Revenue Circle', 'Number', 'Details', 'state', 'ID', 'assam_dist']
            revenue_output_df = revenue_df[revenue_output_columns]

            revenue_output_file = os.path.join(output_directory, f"REVENUE_CIRCLE_WISE_DRIMS_{damage_type}_MASTER_2024.csv")
            revenue_output_df.to_csv(revenue_output_file, index=False)
            logging.info(f"Created {revenue_output_file}")

    except Exception as e:
        logging.error(f"Error consolidating data for {damage_type}: {str(e)}")

def main():
    input_directory = "/home/prajna/civicdatalab/IDS-DRR-Assam/Archive/Scrapers/FRIMS_Daily_Reports_Scraper/Data_2024/Scraped Data"
    output_directory = "/home/prajna/civicdatalab/IDS-DRR-Assam/Archive/Scrapers/FRIMS_Daily_Reports_Scraper/Data_2024/Consolidated Data"

    try:
        os.makedirs(output_directory, exist_ok=True)
        logging.info(f"Output directory created/verified: {output_directory}")
    except Exception as e:
        logging.error(f"Error creating output directory: {str(e)}")
        return

    damage_types = [
        'BRIDGES_DAMAGED',
        'EMBANKMENT_AFFECTED',
        'EMBANKMENT_BREACHED',
        'ROADS_DAMAGED',
        'HUMAN_LIFE_LOST',
        'POPULATION_AFFECTED',
        'VILLAGES_AFFECTED',
        'ANIMALS_AFFECTED'
    ]

    for damage_type in damage_types:
        logging.info(f"Processing damage type: {damage_type}")
        consolidate_data(input_directory, output_directory, damage_type)

    logging.info("Data consolidation completed.")

if __name__ == "__main__":
    main()

Revenue circle mapping loaded successfully
Output directory created/verified: /home/prajna/civicdatalab/IDS-DRR-Assam/Archive/Scrapers/FRIMS_Daily_Reports_Scraper/Data_2024/Consolidated Data


Processing damage type: BRIDGES_DAMAGED
Found 83 files for BRIDGES_DAMAGED
Error processing file /home/prajna/civicdatalab/IDS-DRR-Assam/Archive/Scrapers/FRIMS_Daily_Reports_Scraper/Data_2024/Scraped Data/BRIDGES_DAMAGED_2024-06-23.csv: time data 'DAMAGED' does not match format '%Y-%m-%d'
Error processing file /home/prajna/civicdatalab/IDS-DRR-Assam/Archive/Scrapers/FRIMS_Daily_Reports_Scraper/Data_2024/Scraped Data/BRIDGES_DAMAGED_2024-06-27.csv: time data 'DAMAGED' does not match format '%Y-%m-%d'
Error processing file /home/prajna/civicdatalab/IDS-DRR-Assam/Archive/Scrapers/FRIMS_Daily_Reports_Scraper/Data_2024/Scraped Data/BRIDGES_DAMAGED_2024-06-14.csv: time data 'DAMAGED' does not match format '%Y-%m-%d'
Error processing file /home/prajna/civicdatalab/IDS-DRR-Assam/Archive/Scrapers/FRIMS_Daily_Reports_Scraper/Data_2024/Scraped Data/BRIDGES_DAMAGED_2024-08-06.csv: time data 'DAMAGED' does not match format '%Y-%m-%d'
Error processing file /home/prajna/civicdatalab/IDS-DRR-Assam/Arc

In [36]:
if __name__ == "__main__":
    main()