In [1]:
import urllib.request
import camelot
import pandas as pd
import regex as re
import pygsheets
from datetime import date, timedelta, datetime
import numpy as np
import glob

from joblib import Parallel, delayed
import multiprocessing as mp
from multiprocessing.pool import ThreadPool

import geopandas as gpd

In [2]:
assam_districts = gpd.read_file('Data/Assam_Maps/assam_district_35.csv')


# Table of Contents:
* [Functions to be used](#functions)
* [Download PDFs](#download)
* [Scraper for infrastructure damage tables](#infradamages)

## Functions <a class="anchor" id="functions"></a>

In [3]:
#Sequence Matcher helps us get the metric that measures how two strings are matching
from difflib import SequenceMatcher

#We will write a function that gives us matching score between two strings a and b. Higher the score,better the match
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [4]:
# One FRIMS PDF has multiple tables that have to be scraped.
## The following functions are used to isolate the tables based on their categories. 
def get_table_start_index(FRIMS_DF, slug_list):
    '''
    :param FRIMS_DF: The FRIMS Data Frame of a particular date.
    :param slug_list: A list of keywords used to identify a particular table in the PDF.
    
    :return: Returns the index of the first row of the intended table.
    '''
    TABLE_START_INDEX = FRIMS_DF[FRIMS_DF.iloc[:,0].isin(slug_list)].index.values[0]
    return TABLE_START_INDEX

def get_table_end_index(FRIMS_DF, TABLE_START_INDEX):
    '''
    :param FRIMS_DF: The FRIMS Data Frame of a particular date.
    :param TABLE_START_INDEX: Once the index of a table's first row is found, it is passed into this function.
    
    :return: Returns the index of the last row of the intended table.
    '''
    for index,row in FRIMS_DF[TABLE_START_INDEX+1:].fillna('').iterrows():
        if row[0]=='':
            continue
        else:
            TABLE_END_INDEX = index
            return TABLE_END_INDEX
            break
    return TABLE_START_INDEX+100

In [5]:
def extract_infra_damages_data(FRIMS_DF, TABLE_START_INDEX, TABLE_END_INDEX):
    '''
    :param FRIMS_DF: The FRIMS Data Frame of a particular date.
    :param TABLE_START_INDEX: Once the index of a table's first row is found, it is passed into this function.
    :param TABLE_END_INDEX: Once the index of a table's last row is found, it is passed into this function.
    
    :return: Returns the filtered table between the indices passed, after cleaning it.
    '''
    FRIMS_INFRA_DAMAGES_DF = FRIMS_DF.loc[TABLE_START_INDEX:TABLE_END_INDEX-1,:].reset_index(drop=True)
    FRIMS_INFRA_DAMAGES_DF = FRIMS_INFRA_DAMAGES_DF.replace(r'\n','',regex=True)
    
    FRIMS_INFRA_DAMAGES_DF.columns=FRIMS_INFRA_DAMAGES_DF.iloc[0].str.replace(r'\n','',regex=True)
    FRIMS_INFRA_DAMAGES_DF = FRIMS_INFRA_DAMAGES_DF.loc[1:,:]
    
    return FRIMS_INFRA_DAMAGES_DF

## Download PDFs <a class="anchor" id="download"></a>

Download all PDFs from [FRIMS](http://www.asdma.gov.in/reports.html) portal

In [6]:
for month in range(6,8):
    if month in [6]:
        max_date=31
        min_date=1
    elif month in [7]:
        max_date=19
        min_date=1


    for day in range(min_date,max_date):
        date = str(day)+'-'+str(month)+'-'+'2023'
        if type(date)==str:
            date = datetime.strptime(date, '%d-%m-%Y').date()
        else:
            date = date + timedelta(days=-1)
        
        if date.month<10:
            date_month = '0'+str(date.month)
        else:
            date_month = str(date.month)
        
        if date.day<10:
            date_day = '0'+str(date.day)
        else:
            date_day = str(date.day)
        
        date_string = date_day+'.'+date_month+'.'+str(date.year)
        print(date_string)
        
        daily_report_url = 'https://www.asdma.gov.in/pdf/flood_report/2023/Daily_Flood_Report_'+date_string+'.pdf'
        print(daily_report_url)
        urllib.request.urlretrieve(daily_report_url, r"FRIMS_Reports_2023/FRIMS_"+date_string+".pdf")

01.06.2023
https://www.asdma.gov.in/pdf/flood_report/2023/Daily_Flood_Report_01.06.2023.pdf
02.06.2023
https://www.asdma.gov.in/pdf/flood_report/2023/Daily_Flood_Report_02.06.2023.pdf
03.06.2023
https://www.asdma.gov.in/pdf/flood_report/2023/Daily_Flood_Report_03.06.2023.pdf
04.06.2023
https://www.asdma.gov.in/pdf/flood_report/2023/Daily_Flood_Report_04.06.2023.pdf
05.06.2023
https://www.asdma.gov.in/pdf/flood_report/2023/Daily_Flood_Report_05.06.2023.pdf
06.06.2023
https://www.asdma.gov.in/pdf/flood_report/2023/Daily_Flood_Report_06.06.2023.pdf
07.06.2023
https://www.asdma.gov.in/pdf/flood_report/2023/Daily_Flood_Report_07.06.2023.pdf
08.06.2023
https://www.asdma.gov.in/pdf/flood_report/2023/Daily_Flood_Report_08.06.2023.pdf
09.06.2023
https://www.asdma.gov.in/pdf/flood_report/2023/Daily_Flood_Report_09.06.2023.pdf
10.06.2023
https://www.asdma.gov.in/pdf/flood_report/2023/Daily_Flood_Report_10.06.2023.pdf
11.06.2023
https://www.asdma.gov.in/pdf/flood_report/2023/Daily_Flood_Report_11.

In [7]:
frims_pdfs = glob.glob('FRIMS_Reports_2023/*.pdf')
for pdf in frims_pdfs:
    print(pdf)
    date_string = pdf.split('FRIMS_')[-1].split('.pdf')[0]
    tables = camelot.read_pdf(pdf,pages='all')
    df = pd.DataFrame()
    for i in range(0,len(tables)):
        df = pd.concat([df,tables[i].df],axis=0, ignore_index=True)
    
    df.to_csv("FRIMS_Reports_2023/FRIMS_"+date_string+".csv", index=False)

FRIMS_Reports_2023/FRIMS_27.06.2023.pdf
FRIMS_Reports_2023/FRIMS_04.07.2023.pdf
FRIMS_Reports_2023/FRIMS_13.07.2023.pdf
FRIMS_Reports_2023/FRIMS_07.06.2023.pdf
FRIMS_Reports_2023/FRIMS_24.06.2023.pdf
FRIMS_Reports_2023/FRIMS_01.07.2023.pdf
FRIMS_Reports_2023/FRIMS_06.06.2023.pdf
FRIMS_Reports_2023/FRIMS_26.06.2023.pdf
FRIMS_Reports_2023/FRIMS_16.06.2023.pdf
FRIMS_Reports_2023/FRIMS_16.07.2023.pdf
FRIMS_Reports_2023/FRIMS_02.07.2023.pdf
FRIMS_Reports_2023/FRIMS_10.06.2023.pdf
FRIMS_Reports_2023/FRIMS_18.06.2023.pdf
FRIMS_Reports_2023/FRIMS_05.06.2023.pdf
FRIMS_Reports_2023/FRIMS_08.06.2023.pdf
FRIMS_Reports_2023/FRIMS_04.06.2023.pdf
FRIMS_Reports_2023/FRIMS_11.07.2023.pdf
FRIMS_Reports_2023/FRIMS_23.06.2023.pdf
FRIMS_Reports_2023/FRIMS_21.06.2023.pdf
FRIMS_Reports_2023/FRIMS_03.06.2023.pdf
FRIMS_Reports_2023/FRIMS_30.06.2023.pdf
FRIMS_Reports_2023/FRIMS_06.07.2023.pdf
FRIMS_Reports_2023/FRIMS_18.07.2023.pdf
FRIMS_Reports_2023/FRIMS_14.07.2023.pdf
FRIMS_Reports_2023/FRIMS_17.07.2023.pdf


# INFRA DAMAGES <a class="anchor" id="infradamages"></a>

In [8]:
dates = []
for file in glob.glob('FRIMS_Reports_2023/FRIMS_*.pdf'):
    date = file.split('FRIMS_')[-1].split('.pdf')[0]
    dates.append(date)

issue_dates = []

In [9]:
slug_lists = [['infrastructure damaged - road','infrastructure damaged - roads'],
              ['infrastructure damaged - embankments affected','infrastructure damaged - embankment affected'],
              ['infrastructure damaged - bridge','infrastructure damaged - bridges'],
              ['infrastructure damaged - embankments breached','infrastructure damaged - embankment breached'],
              ]

folder_slug_dict = dict()
folder_slug_dict[0] ='FRIMS_ROADS_DAMAGED'
folder_slug_dict[1] ='FRIMS_EMBANKMENTS_AFFECTED'
folder_slug_dict[2] ='FRIMS_BRIDGES_DAMAGED'
folder_slug_dict[3] ='FRIMS_EMBANKMENTS_BREACHED'

In [10]:
road_issue_dates = []
embankment_affected_issue_dates = []
bridge_issue_dates = []
embankment_breached_issue_dates = []

issues_dates = [road_issue_dates,
                embankment_affected_issue_dates,
               bridge_issue_dates,
                embankment_breached_issue_dates,
               ]

In [11]:
for date in dates:    
    print(date)
    FRIMS_csv_file = r"FRIMS_Reports_2023/FRIMS_"+date+".csv"
    
    
    FRIMS_DF = pd.read_csv(FRIMS_csv_file)
    
    FRIMS_DF.iloc[:,0] = FRIMS_DF.iloc[:,0].str.replace(r'\n','',regex=True)
    FRIMS_DF.iloc[:,0] = FRIMS_DF.iloc[:,0].str.lower()
    
    for list_number, slug_list in enumerate(slug_lists):
        folder_slug = folder_slug_dict[list_number]
        print(folder_slug)
        
        try:
            TABLE_START_INDEX = get_table_start_index(FRIMS_DF, slug_list)
        except:
            issues_dates[list_number].append(date)
            print('Issue with infra damages table - Row header across multiple pages')
            print("----")
            continue
            
        if folder_slug=='FRIMS_URBANFLOOD':
            TABLE_END_INDEX = TABLE_START_INDEX+100
        else:
            TABLE_END_INDEX = get_table_end_index(FRIMS_DF, TABLE_START_INDEX)

        if TABLE_END_INDEX-1 <= TABLE_START_INDEX:
            print("No data for: ",date)
            #done_dates.append(date)
            print("----")
            continue
        
        try:   
            FRIMS_INFRA_DAMAGES_DF = extract_infra_damages_data(FRIMS_DF, TABLE_START_INDEX, TABLE_END_INDEX-1)
        except:
            print("No dataa for: ",date)
            #issues_dates[list_number].append(date)
            print("----")
            continue
        
        try:
            col_name = FRIMS_INFRA_DAMAGES_DF.columns[1]
            FRIMS_INFRA_DAMAGES_DF[col_name] = FRIMS_INFRA_DAMAGES_DF[col_name].replace('',None).fillna(method='ffill')
            g = FRIMS_INFRA_DAMAGES_DF.groupby(col_name)['Details'].transform(lambda x: ' '.join(x))
        except:
            print('Issues with cleaning and combining')
            issues_dates[list_number].append(date)
            print("----")
            continue
            
        FRIMS_INFRA_DAMAGES_DF['Details'] = g
        FRIMS_INFRA_DAMAGES_DF_CLEANED = FRIMS_INFRA_DAMAGES_DF.drop_duplicates()
        
        
        date = date.replace('.','-')
        FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date
        FRIMS_INFRA_DAMAGES_DF_CLEANED = FRIMS_INFRA_DAMAGES_DF_CLEANED[['Date', col_name, 'Number', 'Details']]
        FRIMS_INFRA_DAMAGES_DF_CLEANED.columns = ['Date', 'District', 'Number', 'Details']
        FRIMS_INFRA_DAMAGES_DF_CLEANED = FRIMS_INFRA_DAMAGES_DF_CLEANED[FRIMS_INFRA_DAMAGES_DF_CLEANED['Number'].notna()]
        FRIMS_INFRA_DAMAGES_DF_CLEANED.reset_index(drop=True).to_csv(r'Data_2023/Scraped Data/'+folder_slug+r'/'+folder_slug+'_'+str(date)+'.csv', index=False)
        print('----')

27.06.2023
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
No data for:  27-06-2023
----
FRIMS_BRIDGES_DAMAGED
----
FRIMS_EMBANKMENTS_BREACHED
----
04.07.2023
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
No data for:  04-07-2023
----
FRIMS_BRIDGES_DAMAGED
No data for:  04-07-2023
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  04-07-2023
----
13.07.2023
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
No data for:  13-07-2023
----
FRIMS_BRIDGES_DAMAGED
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  13-07-2023
----
07.06.2023
FRIMS_ROADS_DAMAGED
No data for:  07.06.2023
----
FRIMS_EMBANKMENTS_AFFECTED
No data for:  07.06.2023
----
FRIMS_BRIDGES_DAMAGED
No data for:  07.06.2023
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  07.06.2023
----
24.06.2023
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
----
FRIMS_BRIDGES_DAMAGED
----
FRIMS_EMBANKMENTS_BREACHED
Issue with infra damages table - Row header across multiple pages
----
01.07.2023
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKM

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date


----
FRIMS_EMBANKMENTS_BREACHED
----
21.06.2023
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
----
FRIMS_BRIDGES_DAMAGED
----
FRIMS_EMBANKMENTS_BREACHED
----
03.06.2023
FRIMS_ROADS_DAMAGED
No data for:  03.06.2023
----
FRIMS_EMBANKMENTS_AFFECTED
No data for:  03.06.2023
----
FRIMS_BRIDGES_DAMAGED
No data for:  03.06.2023
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  03.06.2023
----
30.06.2023
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
----
FRIMS_BRIDGES_DAMAGED
Issues with cleaning and combining
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  30-06-2023
----
06.07.2023
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
----
FRIMS_BRIDGES_DAMAGED
No data for:  06-07-2023
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  06-07-2023
----
18.07.2023
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
No data for:  18-07-2023
----
FRIMS_BRIDGES_DAMAGED
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  18-07-2023
----
14.07.2023
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
----


In [12]:
issues_df = pd.DataFrame(issues_dates).T
issues_df.columns = folder_slug_dict.values()
issues_df

#Add this manually

Unnamed: 0,FRIMS_ROADS_DAMAGED,FRIMS_EMBANKMENTS_AFFECTED,FRIMS_BRIDGES_DAMAGED,FRIMS_EMBANKMENTS_BREACHED
0,02.07.2023,12-07-2023,26-06-2023,24-06-2023
1,28.06.2023,,30-06-2023,29-06-2023
2,10.07.2023,,05-07-2023,


In [13]:
for folder_slug in folder_slug_dict.values():
    scraped_files_daily = glob.glob(r'Data_2023/Scraped Data/{}/*.csv'.format(folder_slug))
    
    dfs = []
    for file in scraped_files_daily:
        df = pd.read_csv(file)
        dfs.append(df)
    
    FRIMS_DAMAGES = pd.concat(dfs)
    FRIMS_DAMAGES['Date'] = pd.to_datetime(FRIMS_DAMAGES['Date'],format='%d-%m-%Y')
    FRIMS_DAMAGES = FRIMS_DAMAGES.sort_values(by='Date')
    
    FRIMS_DAMAGES['District'] = FRIMS_DAMAGES.District.str.replace('Dima-Hasao','Dima Hasao',regex=True)
    FRIMS_DAMAGES['District'] = FRIMS_DAMAGES.District.str.upper()
    
    FRIMS_DAMAGES = pd.merge(FRIMS_DAMAGES,assam_districts, how='left').drop('geometry', axis=1)

    FRIMS_DAMAGES.drop_duplicates().dropna().to_csv('Data_2023/Cleaned Data/DISTRICTS_{}_MASTER_2023.csv'.format(folder_slug), index=False)