In [8]:
import os
import glob
import json
import pandas as pd
from flatten_json import flatten
import re
import geopandas as gpd
import geopandas as gpd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [9]:
# Folder that is searched for '*.json' input files by the processing loop further down.
# NOTE(review): hard-coded absolute Windows path — adjust before running on another machine.
folder_path = r'D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\Deployment\flood-data-ecosystem-Assam\Sources\DRIMS\data\DRIMS_api_output'

# Function to extract the 'yyyy-mm' from the filename
def extract_timeperiod(filename):
    """Return the time-period prefix of a file name.

    The prefix is simply the first seven characters of the path's final
    component, e.g. ``'2021_05'`` for ``'.../2021_05.json'``. Shorter names
    are returned whole.
    """
    _, leaf = os.path.split(filename)
    return leaf[:7]

In [None]:
# Function to flatten the JSON into a dataframe
def _parse_population_and_crop(records):
    """Build one row per revenue circle from the ``affectedPopulation`` records.

    Every record's ``details`` string is scanned for parenthesised groups of the
    form ``(<revenue circle> | <population text> | <crop area text>)``. Groups
    that cannot be parsed are reported and skipped.
    """
    rows = []
    # Going through a DataFrame keeps accepting every layout pd.DataFrame accepts.
    for record in pd.DataFrame(records).to_dict('records'):
        district = record['district']
        for group in re.findall(r'\(\s*(.*?)\s*\)', record['details']):
            try:
                revenue_circle, population_affected, crop_area = group.split(' | ')
                rows.append({
                    "district": district,
                    "revenue_circle": revenue_circle.strip(),
                    # Only the first run of digits / first decimal number is used.
                    "Population Affected": int(re.search(r'\d+', population_affected).group()),
                    "Crop Area": float(re.search(r'\d+(\.\d+)?', crop_area).group()),
                })
            except (ValueError, AttributeError) as e:
                # ValueError: wrong number of '|' fields; AttributeError: no number found.
                print("Error processing match:", group, e)
    return pd.DataFrame(rows)


def _parse_lives_lost(entries, lives_lost_col):
    """Expand ``(<revenue circle> | <count>), ...`` detail strings into rows.

    ``entries`` is an iterable of mappings with ``district`` and ``details``;
    the integer count is stored under the column name ``lives_lost_col``.
    """
    rows = []
    for entry in entries:
        district = entry['district']
        for segment in entry['details'].split('), '):
            revenue_circle, lives_lost = segment.strip('()').split(' | ')
            rows.append({
                'district': district,
                'revenue_circle': revenue_circle.strip(),
                lives_lost_col: int(lives_lost),
            })
    return pd.DataFrame(rows)


def _parse_infrastructure(inf):
    """Pivot the infrastructure-damage section into one row per revenue circle.

    Each ``block`` string is split on ``'|'`` into a revenue-circle name and an
    integer value; parentheses and spaces around both parts are stripped.
    """
    indicator_types = ["embBreached", "embAffected", "bridgeAffected", "roadAffected"]
    rows = []
    for indicator in indicator_types:
        if indicator not in inf:
            continue
        for entry in inf[indicator]:
            district = entry['district']
            for detail in entry['details']:
                parts = detail['block'].split('|')
                rows.append({
                    'district': district,
                    'revenue_circle': parts[0].strip('() '),
                    'indicator': indicator,
                    'value': int(parts[1].strip('() ')),
                })

    # One column per indicator. pivot_table aggregates with the mean by default,
    # so a circle listed twice for the same indicator is averaged, not summed.
    damage = pd.DataFrame(rows).pivot_table(
        index=['district', 'revenue_circle'],
        columns='indicator',
        values='value',
        fill_value=0,
    ).reset_index()
    damage.columns.name = None  # drop the leftover 'indicator' axis label
    return damage


def extract_json(data, timeperiod, file_path):
    """Flatten one DRIMS JSON report into a per-revenue-circle DataFrame.

    Parameters
    ----------
    data : file-like
        Open text handle positioned at the start of the JSON document.
    timeperiod : str
        Value written to the ``timeperiod`` column of every returned row.
    file_path : str
        Only used to identify the file in warning messages.

    Returns
    -------
    pandas.DataFrame or None
        Population/crop, lives-lost and infrastructure figures inner-joined on
        ``district`` and ``revenue_circle``. ``None`` (after printing a warning)
        when the document is not valid JSON or an expected key is missing.
    """
    try:
        data_file = json.load(data)
    except json.JSONDecodeError as e:
        # An empty or corrupt download should not abort the whole batch.
        print(f"Warning: Could not parse JSON in file {file_path} ({e}). Skipping this file.")
        return None

    try:
        pop_crop = _parse_population_and_crop(data_file["affectedPopulation"])

        # Human lives lost: confirmed and missing are parsed separately, then aligned.
        hll = pd.DataFrame(data_file["hllDetails"])
        df_confirmed = _parse_lives_lost(hll['confirmed'], 'lives_lost_confirmed')
        df_missing = _parse_lives_lost(hll['missing'], 'lives_lost_missing')
        hll_final = pd.merge(df_confirmed, df_missing, on=['district', 'revenue_circle'], how='outer')

        inf_dmg = _parse_infrastructure(data_file["infDamageDetails"])

        # Inner joins: only circles present in all three sections are kept.
        combined_df = pd.merge(pop_crop, hll_final, on=['revenue_circle', 'district'])
        final_df = pd.merge(combined_df, inf_dmg, on=['revenue_circle', 'district'])
        final_df['timeperiod'] = timeperiod
        return final_df

    except KeyError as e:
        print(f"Warning: Missing key {e} in file {file_path}. Skipping this file.")
        return None

# Example usage (illustration only — kept as a comment because `data`, `timeperiod`,
# `file_path` and `all_dataframes` are not defined yet at this point of the notebook,
# so executing it on a fresh kernel would raise NameError; the processing loop in a
# later cell performs these steps for every input file):
#
#     result = extract_json(data, timeperiod, file_path)
#     if result is not None:
#         all_dataframes.append(result)


In [13]:
# Function to flatten the JSON into a dataframe
def extract_json(data, timeperiod, file_path=None):
    """Flatten one DRIMS JSON report into a per-revenue-circle DataFrame.

    This definition comes last in the notebook, so it is the one in effect
    after a top-to-bottom run. It therefore accepts the optional ``file_path``
    that the processing loop passes, and skips unusable files instead of
    raising.

    Parameters
    ----------
    data : file-like
        Open text handle positioned at the start of the JSON document.
    timeperiod : str
        Value written to the ``timeperiod`` column of every returned row.
    file_path : str, optional
        Only used to identify the file in warning messages.

    Returns
    -------
    pandas.DataFrame or None
        Population/crop, lives-lost and infrastructure figures inner-joined on
        ``district`` and ``revenue_circle``. ``None`` (after printing a warning)
        when the document is not valid JSON or an expected key is missing.
    """
    source = file_path if file_path is not None else "<unnamed input>"

    def extract_details(df_column, lives_lost_col):
        """Expand ``(<revenue circle> | <count>), ...`` strings into a DataFrame."""
        extracted = []
        for row in df_column:
            district = row['district']
            for piece in row['details'].split('), '):
                revenue_circle, lives_lost = piece.strip('()').split(' | ')
                extracted.append({
                    'district': district,
                    'revenue_circle': revenue_circle.strip(),
                    lives_lost_col: int(lives_lost),
                })
        return pd.DataFrame(extracted)

    try:
        data_file = json.load(data)

        # --- Affected population and crop area -------------------------------
        affected_pop = pd.DataFrame(data_file["affectedPopulation"])
        data_list = []
        for _, row in affected_pop.iterrows():
            district = row['district']
            # Each parenthesised group reads "<circle> | <population> | <crop area>".
            for m in re.findall(r'\(\s*(.*?)\s*\)', row['details']):
                try:
                    revenue_circle, population_affected, crop_area = m.split(' | ')
                    data_list.append({
                        "district": district,
                        "revenue_circle": revenue_circle.strip(),
                        "Population Affected": int(re.search(r'\d+', population_affected).group()),
                        "Crop Area": float(re.search(r'\d+(\.\d+)?', crop_area).group()),
                    })
                except (ValueError, AttributeError) as e:
                    # ValueError: wrong number of '|' fields; AttributeError: no number found.
                    print("Error processing match:", m, e)
        pop_crop = pd.DataFrame(data_list)

        # --- Human lives lost ------------------------------------------------
        hll = pd.DataFrame(data_file["hllDetails"])
        df_confirmed = extract_details(hll['confirmed'], 'lives_lost_confirmed')
        df_missing = extract_details(hll['missing'], 'lives_lost_missing')
        hll_final = pd.merge(df_confirmed, df_missing, on=['district', 'revenue_circle'], how='outer')

        # --- Infrastructure damage ------------------------------------------
        inf = data_file["infDamageDetails"]
        indicator_types = ["embBreached", "embAffected", "bridgeAffected", "roadAffected"]
        extracted_data = []
        for indicator in indicator_types:
            if indicator in inf:
                for entry in inf[indicator]:
                    for detail in entry['details']:
                        # "<circle> | (<value>)": strip parentheses and spaces from both halves.
                        fields = detail['block'].split('|')
                        extracted_data.append({
                            'district': entry['district'],
                            'revenue_circle': fields[0].strip('() '),
                            'indicator': indicator,
                            'value': int(fields[1].strip('() ')),
                        })

        # One column per indicator; pivot_table aggregates duplicates with the mean by default.
        inf_dmg = pd.DataFrame(extracted_data).pivot_table(
            index=['district', 'revenue_circle'],
            columns='indicator',
            values='value',
            fill_value=0,
        ).reset_index()
        inf_dmg.columns.name = None  # drop the leftover 'indicator' axis label

        # --- Combine (inner joins keep only circles present in all sections) --
        combined_df = pd.merge(pop_crop, hll_final, on=['revenue_circle', 'district'])
        final_df = pd.merge(combined_df, inf_dmg, on=['revenue_circle', 'district'])
        final_df['timeperiod'] = timeperiod
        return final_df

    except json.JSONDecodeError as e:
        print(f"Warning: Could not parse JSON in file {source} ({e}). Skipping this file.")
        return None
    except KeyError as e:
        print(f"Warning: Missing key {e} in file {source}. Skipping this file.")
        return None


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [15]:
all_dataframes = []

# Process each JSON file in the folder (sorted so the row order is reproducible)
for file_path in sorted(glob.glob(os.path.join(folder_path, '*.json'))):
    print(file_path)

    # Extract timeperiod from the filename
    timeperiod = extract_timeperiod(file_path)

    # extract_json parses the handle itself, so the file is opened once and
    # closed as soon as parsing is done.
    with open(file_path, 'r') as data:
        df = extract_json(data, timeperiod, file_path)

    # Keep only files that produced a dataframe (a skipped file yields None)
    if df is not None:
        all_dataframes.append(df)

combined_df = pd.concat(all_dataframes, ignore_index=True)

# Save the combined dataframe to a CSV file next to the JSON inputs (optional)
combined_df.to_csv(os.path.join(folder_path, 'combined_loss_damage_data.csv'), index=False)

D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\Deployment\flood-data-ecosystem-Assam\Sources\DRIMS\data\DRIMS_api_output\2021_05.json
D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\Deployment\flood-data-ecosystem-Assam\Sources\DRIMS\data\DRIMS_api_output\2021_06.json
D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\Deployment\flood-data-ecosystem-Assam\Sources\DRIMS\data\DRIMS_api_output\2021_07.json
D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\Deployment\flood-data-ecosystem-Assam\Sources\DRIMS\data\DRIMS_api_output\2021_08.json
D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\Deployment\flood-data-ecosystem-Assam\Sources\DRIMS\data\DRIMS_api_output\2021_09.json
D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\Deployment\flood-data-ecosystem-Assam\Sources\DRIMS\data\DRIMS_api_output\2021_10.json
D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\Deployment\flood-data-ecosystem-Assam\Sources\DRIMS\data\DRIMS_api_output\2022_04.json
D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\Deployment\flood-data-ecosystem-Assam\Sources\DRIMS\data\DRIMS_api_output\2022_

In [6]:
# Preview the combined loss-and-damage frame.
# NOTE(review): this cell's execution count (In [6]) is lower than that of the cells
# above it, so the output shown below may come from an earlier run — re-run to confirm.
combined_df

Unnamed: 0,district,revenue_circle,Population Affected,Crop Area,lives_lost_confirmed,lives_lost_missing,bridgeAffected,embAffected,embBreached,roadAffected,timeperiod
0,Bajali,Bajali,161000,1216.00,3,0,31.0,3.0,3.0,63.0,2022_06
1,Bajali,Sarupeta,195000,857.00,0,0,2.0,2.0,2.0,88.0,2022_06
2,Baksa,Baska,2209,35.07,0,0,0.0,0.0,0.0,23.0,2022_06
3,Baksa,Jalah,12645,220.00,0,0,0.0,0.0,0.0,0.0,2022_06
4,Baksa,Barama,1507,68.00,0,0,0.0,1.0,1.0,2.0,2022_06
...,...,...,...,...,...,...,...,...,...,...,...
997,Lakhimpur,Bihpuria,0,0.00,0,0,0.0,0.0,0.0,0.0,2024_09
998,Lakhimpur,Narayanpur,4,0.00,0,0,0.0,0.0,1.0,0.0,2024_09
999,Sivasagar,Demow,0,76.50,0,0,0.0,0.0,0.0,0.0,2024_09
1000,Sivasagar,Sivsagar,0,89.00,0,0,0.0,0.0,0.0,0.0,2024_09


Variable export

In [16]:
# Read the revenue-circle GeoJSON layer with geopandas.
# NOTE(review): hard-coded absolute Windows path — adjust before running on another machine.
rc_gdf = gpd.read_file(r'D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\Deployment\flood-data-ecosystem-Assam\Maps\Geojson\assam_rc_2024-11.geojson')

In [17]:
# Inspect the loaded revenue-circle layer.
rc_gdf

Unnamed: 0,revenue_ci,revenue_cr,HQ,are_new,dtname,object_id,dtcode11,geometry
0,Gossaigaon (Pt),Gossaigaon (Pt),,1069,KOKRAJHAR,18-300-00101,18-300,"MULTIPOLYGON (((90.14118 26.74010, 90.15342 26..."
1,Bhowraguri,Bhawraguri,,159,KOKRAJHAR,18-300-00102,18-300,"MULTIPOLYGON (((90.09811 26.46455, 90.10012 26..."
2,Dotoma,Dotoma,,304,KOKRAJHAR,18-300-00103,18-300,"MULTIPOLYGON (((90.19819 26.63409, 90.21635 26..."
3,Kokrajhar (Pt),Kokrajhar (Pt),y,990,KOKRAJHAR,18-300-00104,18-300,"MULTIPOLYGON (((90.33590 26.86280, 90.33610 26..."
4,Bagribari (Pt),Bagribari (Pt),,281,KOKRAJHAR,18-300-00105,18-300,"MULTIPOLYGON (((89.99553 26.35026, 89.99717 26..."
...,...,...,...,...,...,...,...,...
175,Sapekhati,Sapekhati,,394,CHARAIDEO,18-755-00278,18-755,"MULTIPOLYGON (((95.10936 27.12364, 95.10927 27..."
176,Sonari,Sonari,y,385,CHARAIDEO,18-755-00279,18-755,"MULTIPOLYGON (((94.95742 27.06354, 94.95822 27..."
177,Ujani Majuli,Ujani Majuli,,322,MAJULI,18-760-00280,18-760,"MULTIPOLYGON (((94.33804 26.89186, 94.33850 26..."
178,Majuli,Majuli,,648,MAJULI,18-760-00281,18-760,"MULTIPOLYGON (((94.56453 27.18172, 94.56755 27..."


In [18]:
# Map the field names produced by extract_json onto the column names used by the later cells.
column_renames = {
    'Population Affected': 'Population_affected_Total',
    'Crop Area': 'Crop_Area',
    'lives_lost_confirmed': 'Human_Live_Lost',
    'lives_lost_missing': 'Human_Live_Missing',
    'bridgeAffected': 'Bridge',
    'roadAffected': 'Roads',
    'embAffected': 'Embankments affected',
    'embBreached': 'Embankment breached',
}
combined_df = combined_df.rename(columns=column_renames)

In [19]:
# Check the frame after the column rename.
combined_df

Unnamed: 0,district,revenue_circle,Population_affected_Total,Crop_Area,Human_Live_Lost,Human_Live_Missing,Bridge,Embankments affected,Embankment breached,Roads,timeperiod
0,Barpeta,Kalgachia,0,1.73,0,0,0.0,0.0,0.0,0.0,2021_06
1,Barpeta,Barpeta,0,0.00,1,0,0.0,0.0,0.0,0.0,2021_06
2,Barpeta,Chenga,0,0.00,1,0,0.0,0.0,0.0,0.0,2021_06
3,Biswanath,Halem,11,35.50,0,0,0.0,2.0,2.0,2.0,2021_06
4,Biswanath,Gohpur,0,8.50,0,0,0.0,0.0,1.0,0.0,2021_06
...,...,...,...,...,...,...,...,...,...,...,...
1420,Lakhimpur,Bihpuria,0,0.00,0,0,0.0,0.0,0.0,0.0,2024_09
1421,Lakhimpur,Narayanpur,4,0.00,0,0,0.0,0.0,1.0,0.0,2024_09
1422,Sivasagar,Demow,0,76.50,0,0,0.0,0.0,0.0,0.0,2024_09
1423,Sivasagar,Sivsagar,0,89.00,0,0,0.0,0.0,0.0,0.0,2024_09


In [20]:
def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=2):
    """
    Annotate the left table with fuzzy matches found in the right table's key column.

    No join is performed: ``df_2`` only supplies the candidate strings, and the
    result is a copy of ``df_1`` with an extra ``matches`` column.

    :param df_1: the left table to annotate (left unmodified; a copy is returned)
    :param df_2: the right table whose key values are the match candidates
    :param key1: key column of the left table
    :param key2: key column of the right table
    :param threshold: minimum score (0-100, as returned by ``process.extract``)
        a candidate needs in order to be kept
    :param limit: the maximum number of candidates considered per row, best first
    :return: copy of ``df_1`` with a ``matches`` column holding the kept candidates
        joined by ``', '`` (an empty string when nothing reaches the threshold)
    """
    # Score each distinct candidate once: duplicates only slow the search down and,
    # for limit > 1, would fill the result with repeats of the same name.
    # dict.fromkeys keeps first-seen order, so ties resolve as they did before.
    choices = list(dict.fromkeys(df_2[key2].dropna().tolist()))

    def best_matches(value):
        scored = process.extract(value, choices, limit=limit)
        return ', '.join(name for name, score in scored if score >= threshold)

    # Work on a copy so the caller's frame does not silently gain a column.
    matched = df_1.copy()
    matched['matches'] = matched[key1].apply(best_matches)
    return matched

In [21]:
# Look up, for every `revenue_ci` name in the map layer, the closest `revenue_circle`
# name in the combined data (best candidate only, kept when its score is at least 80).
fuzzymatch = fuzzy_merge(
    df_1=rc_gdf,
    df_2=combined_df,
    key1='revenue_ci',
    key2='revenue_circle',
    threshold=80,
    limit=1,
)
fuzzymatch

Unnamed: 0,revenue_ci,revenue_cr,HQ,are_new,dtname,object_id,dtcode11,geometry,matches
0,Gossaigaon (Pt),Gossaigaon (Pt),,1069,KOKRAJHAR,18-300-00101,18-300,"MULTIPOLYGON (((90.14118 26.74010, 90.15342 26...",Gossaigaon
1,Bhowraguri,Bhawraguri,,159,KOKRAJHAR,18-300-00102,18-300,"MULTIPOLYGON (((90.09811 26.46455, 90.10012 26...",Bhawraguri
2,Dotoma,Dotoma,,304,KOKRAJHAR,18-300-00103,18-300,"MULTIPOLYGON (((90.19819 26.63409, 90.21635 26...",Dotma
3,Kokrajhar (Pt),Kokrajhar (Pt),y,990,KOKRAJHAR,18-300-00104,18-300,"MULTIPOLYGON (((90.33590 26.86280, 90.33610 26...",Kokrajhar
4,Bagribari (Pt),Bagribari (Pt),,281,KOKRAJHAR,18-300-00105,18-300,"MULTIPOLYGON (((89.99553 26.35026, 89.99717 26...",Bagribari
...,...,...,...,...,...,...,...,...,...
175,Sapekhati,Sapekhati,,394,CHARAIDEO,18-755-00278,18-755,"MULTIPOLYGON (((95.10936 27.12364, 95.10927 27...",Sapekhati
176,Sonari,Sonari,y,385,CHARAIDEO,18-755-00279,18-755,"MULTIPOLYGON (((94.95742 27.06354, 94.95822 27...",Sonari
177,Ujani Majuli,Ujani Majuli,,322,MAJULI,18-760-00280,18-760,"MULTIPOLYGON (((94.33804 26.89186, 94.33850 26...",Ujani Majuli
178,Majuli,Majuli,,648,MAJULI,18-760-00281,18-760,"MULTIPOLYGON (((94.56453 27.18172, 94.56755 27...",Majuli


In [22]:
# Upper-cased copy of `district`; a later cell uses it as a join key against `dtname`.
combined_df['district_2'] = combined_df['district'].str.upper()
combined_df

Unnamed: 0,district,revenue_circle,Population_affected_Total,Crop_Area,Human_Live_Lost,Human_Live_Missing,Bridge,Embankments affected,Embankment breached,Roads,timeperiod,district_2
0,Barpeta,Kalgachia,0,1.73,0,0,0.0,0.0,0.0,0.0,2021_06,BARPETA
1,Barpeta,Barpeta,0,0.00,1,0,0.0,0.0,0.0,0.0,2021_06,BARPETA
2,Barpeta,Chenga,0,0.00,1,0,0.0,0.0,0.0,0.0,2021_06,BARPETA
3,Biswanath,Halem,11,35.50,0,0,0.0,2.0,2.0,2.0,2021_06,BISWANATH
4,Biswanath,Gohpur,0,8.50,0,0,0.0,0.0,1.0,0.0,2021_06,BISWANATH
...,...,...,...,...,...,...,...,...,...,...,...,...
1420,Lakhimpur,Bihpuria,0,0.00,0,0,0.0,0.0,0.0,0.0,2024_09,LAKHIMPUR
1421,Lakhimpur,Narayanpur,4,0.00,0,0,0.0,0.0,1.0,0.0,2024_09,LAKHIMPUR
1422,Sivasagar,Demow,0,76.50,0,0,0.0,0.0,0.0,0.0,2024_09,SIVASAGAR
1423,Sivasagar,Sivsagar,0,89.00,0,0,0.0,0.0,0.0,0.0,2024_09,SIVASAGAR


In [23]:
# Join the map layer to the loss-and-damage rows on (matched circle name, upper-case district).
rc_complete_matched = fuzzymatch.merge(
    combined_df,
    left_on=['matches', 'dtname'],
    right_on=['revenue_circle', 'district_2'],
    how='outer',
)
rc_complete_matched = rc_complete_matched[[
    'revenue_ci', 'object_id', 'dtname',
    'Population_affected_Total', 'Crop_Area',
    'timeperiod', 'Bridge', 'Embankment breached', 'Embankments affected',
    'Roads', 'Human_Live_Lost',
]]

# Keep only rows that found a partner on both sides of the outer join:
# a missing `timeperiod` means no data row, a missing `object_id` means no map feature.
df_cleaned = rc_complete_matched.dropna(subset=['timeperiod', 'object_id'])
df_cleaned = df_cleaned[[
    'revenue_ci', 'object_id', 'dtname', 'timeperiod',
    'Bridge', 'Embankment breached', 'Embankments affected', 'Roads',
    'Human_Live_Lost', 'Population_affected_Total', 'Crop_Area',
]]
# The former rename of 'district_2' to 'DISTRICT' was dropped: that column is not
# part of the selection above, so the rename never had any effect.
df_cleaned

Unnamed: 0,revenue_ci,object_id,dtname,timeperiod,Bridge,Embankment breached,Embankments affected,Roads,Human_Live_Lost,Population_affected_Total,Crop_Area
13,Agamoni,18-301-00110,DHUBRI,2022_06,0.0,0.0,0.0,9.0,0.0,1190.0,893.5
14,Agamoni,18-301-00110,DHUBRI,2022_07,0.0,0.0,0.0,2.0,0.0,0.0,0.0
15,Agamoni,18-301-00110,DHUBRI,2022_08,0.0,0.0,0.0,0.0,0.0,0.0,35.5
16,Agamoni,18-301-00110,DHUBRI,2023_06,0.0,0.0,0.0,0.0,0.0,576.0,128.5
17,Agamoni,18-301-00110,DHUBRI,2023_07,0.0,0.0,0.0,3.0,0.0,3336.0,60.0
...,...,...,...,...,...,...,...,...,...,...,...
1463,Ujani Majuli,18-760-00280,MAJULI,2023_07,0.0,0.0,3.0,3.0,0.0,0.0,137.0
1464,Ujani Majuli,18-760-00280,MAJULI,2023_08,0.0,0.0,0.0,3.0,0.0,12852.0,443.0
1465,Ujani Majuli,18-760-00280,MAJULI,2023_09,0.0,0.0,0.0,0.0,0.0,0.0,269.0
1466,Ujani Majuli,18-760-00280,MAJULI,2024_06,0.0,0.0,0.0,0.0,0.0,12363.0,0.0


In [24]:
indicator_columns = ['Bridge', 'Embankment breached', 'Embankments affected', 'Roads',
                     'Human_Live_Lost',
                     'Population_affected_Total', 'Crop_Area']

variable_path = r'D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\Deployment\flood-data-ecosystem-Assam\Sources\DRIMS\data\variables/'

# Write one CSV per (indicator, time period) holding `object_id` and the indicator value.
for indicator in indicator_columns:
    # Create the folder the files are actually written to. It used to be checked and
    # created relative to the current working directory instead of `variable_path`.
    indicator_dir = os.path.join(variable_path, indicator)
    os.makedirs(indicator_dir, exist_ok=True)

    for timeperiod in df_cleaned['timeperiod'].unique():
        # Select rows and columns in a single .loc call rather than chained indexing.
        filtered_df = df_cleaned.loc[df_cleaned['timeperiod'] == timeperiod, ['object_id', indicator]]

        filename = f"{indicator}_{timeperiod}.csv"
        file_path = os.path.join(indicator_dir, filename)
        filtered_df.to_csv(file_path, index=False)
