In [1]:
import urllib.request
import camelot
import pandas as pd
import regex as re
import pygsheets
from datetime import date, timedelta, datetime
import numpy as np
import glob

from joblib import Parallel, delayed
import multiprocessing as mp
from multiprocessing.pool import ThreadPool

# Table of Contents:
* [Functions to be used](#functions)
* [Download PDFs](#download)
* [Sample PDF Scraper](#sample)
* [Scraper for relief distributed, houses and human lives lost](#misc)
* [Scraper for Animals affected and washed away tables](#animals)
* [Scraper for Other Damages tables - other infra, urban flood, landslide, wildlife, erosion](#otherdamages)
* [Scraper for infrastructure damage tables](#infradamages)
* [Scraper for population, crop and relelif camp tables](#population)

## Functions <a class="anchor" id="functions"></a>

In [2]:
#Sequence Matcher helps us get the metric that measures how two strings are matching
from difflib import SequenceMatcher

#We will write a function that gives us matching score between two strings a and b. Higher the score,better the match
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [3]:
# One FRIMS PDF has multiple tables that have to be scraped. The following functions are used to isolate the tables based on their categories. 
def get_table_start_index(FRIMS_DF, slug_list):
    '''
    :param FRIMS_DF: The FRIMS Data Frame of a particular date.
    :param slug_list: A list of keywords used to identify a particular table in the PDF.
    
    :return: Returns the index of the first row of the intended table.
    '''
    TABLE_START_INDEX = FRIMS_DF[FRIMS_DF[0].isin(slug_list)].index.values[0]
    return TABLE_START_INDEX

def get_table_end_index(FRIMS_DF, TABLE_START_INDEX):
    '''
    :param FRIMS_DF: The FRIMS Data Frame of a particular date.
    :param TABLE_START_INDEX: Once the index of a table's first row is found, it is passed into this function.
    
    :return: Returns the index of the last row of the intended table.
    '''
    for index,row in FRIMS_DF[TABLE_START_INDEX+1:].iterrows():
        if row[0]=='':
            continue
        else:
            TABLE_END_INDEX = index
            return TABLE_END_INDEX
            break
    return TABLE_START_INDEX+100

In [4]:
def extract_infra_damages_data(FRIMS_DF, TABLE_START_INDEX, TABLE_END_INDEX):
    '''
    :param FRIMS_DF: The FRIMS Data Frame of a particular date.
    :param TABLE_START_INDEX: Once the index of a table's first row is found, it is passed into this function.
    :param TABLE_END_INDEX: Once the index of a table's last row is found, it is passed into this function.
    
    :return: Returns the filtered table between the indices passed, after cleaning it.
    '''
    FRIMS_INFRA_DAMAGES_DF = FRIMS_DF.loc[TABLE_START_INDEX:TABLE_END_INDEX-1,:].reset_index(drop=True)
    FRIMS_INFRA_DAMAGES_DF = FRIMS_INFRA_DAMAGES_DF.replace(r'\n','',regex=True)
    
    FRIMS_INFRA_DAMAGES_DF.columns=FRIMS_INFRA_DAMAGES_DF.iloc[0].str.replace(r'\n','',regex=True)
    FRIMS_INFRA_DAMAGES_DF = FRIMS_INFRA_DAMAGES_DF.loc[1:,:]
    
    return FRIMS_INFRA_DAMAGES_DF

## Download PDFs <a class="anchor" id="download"></a>

Download all PDFs from [FRIMS](http://www.asdma.gov.in/reports.html) portal

In [7]:
for month in range(8,9):
    if month in [8,10]:
        max_date=32
        min_date=1
    elif month in [9]:
        max_date=31
        min_date=1


    for day in range(min_date,max_date):
        date = str(day)+'-'+str(month)+'-'+'2022'
        if type(date)==str:
            date = datetime.strptime(date, '%d-%m-%Y').date()
        else:
            date = date + timedelta(days=-1)
        
        if date.month<10:
            date_month = '0'+str(date.month)
        else:
            date_month = str(date.month)
        
        if date.day<10:
            date_day = '0'+str(date.day)
        else:
            date_day = str(date.day)
        
        date_string = date_day+'.'+date_month+'.'+str(date.year)
        print(date_string)
        
        daily_report_url = 'http://www.asdma.gov.in/pdf/flood_report/2022/Daily_Flood_Report_'+date_string+'.pdf'
        print(daily_report_url)
        urllib.request.urlretrieve(daily_report_url, r"D:/Projects/assam-tender-scraper/FRIMS_Daily_Reports_Scraper/FRIMS_Reports/FRIMS_"+date_string+".pdf")

01.08.2022
http://www.asdma.gov.in/pdf/flood_report/2022/Daily_Flood_Report_01.08.2022.pdf
02.08.2022
http://www.asdma.gov.in/pdf/flood_report/2022/Daily_Flood_Report_02.08.2022.pdf
03.08.2022
http://www.asdma.gov.in/pdf/flood_report/2022/Daily_Flood_Report_03.08.2022.pdf
04.08.2022
http://www.asdma.gov.in/pdf/flood_report/2022/Daily_Flood_Report_04.08.2022.pdf
05.08.2022
http://www.asdma.gov.in/pdf/flood_report/2022/Daily_Flood_Report_05.08.2022.pdf
06.08.2022
http://www.asdma.gov.in/pdf/flood_report/2022/Daily_Flood_Report_06.08.2022.pdf
07.08.2022
http://www.asdma.gov.in/pdf/flood_report/2022/Daily_Flood_Report_07.08.2022.pdf
08.08.2022
http://www.asdma.gov.in/pdf/flood_report/2022/Daily_Flood_Report_08.08.2022.pdf
09.08.2022
http://www.asdma.gov.in/pdf/flood_report/2022/Daily_Flood_Report_09.08.2022.pdf
10.08.2022
http://www.asdma.gov.in/pdf/flood_report/2022/Daily_Flood_Report_10.08.2022.pdf
11.08.2022
http://www.asdma.gov.in/pdf/flood_report/2022/Daily_Flood_Report_11.08.2022.pdf

## Sample PDF scraper <a class="anchor" id="sample"></a>

In [5]:
date = '11.10.2022'
pdf_file = r"FRIMS_Reports/FRIMS_"+date+".pdf"
tables = camelot.read_pdf(pdf_file,pages='all')
df = pd.DataFrame()
for i in range(0,len(tables)):
    df = pd.concat([df,tables[i].df],axis=0, ignore_index=True)
    
df[0] = df[0].str.replace(r'\n','')
df[0] = df[0].str.lower()

  df[0] = df[0].str.replace(r'\n','')


In [7]:
slug_list = ['relief distributed'] # This slug is to identify the relief distributed table in the PDF
TABLE_START_INDEX = df[df[0].isin(slug_list)].index.values[0]
TABLE_START_INDEX

40

In [8]:
for index,row in df[TABLE_START_INDEX+1:].iterrows():
    if row[0]=='':
        continue
    else:
        TABLE_END_INDEX = index
        break
TABLE_END_INDEX

43

In [10]:
df_filtered = df.loc[TABLE_START_INDEX:TABLE_END_INDEX-1,:]
df_filtered = df_filtered.replace(r'\n','',regex=True)
df_filtered

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
40,relief distributed,District,Rice (in Q),Dal (in Q),Salt (in Q),M. Oil (in L),Cattle Feed - Green Fooder (in Q),Cattle Feed - Wheat Bran (in Q),Cattle Feed - Rice Bran (in Q),,
41,,Dhemaji,111.21,20.18,6.05,605.00,0.00,0.00,0.00,,
42,,Total,111.21,20.18,6.05,605.00,0.00,0.00,0.00,,


In [14]:
df_filtered = df.loc[TABLE_START_INDEX:TABLE_END_INDEX-1,:].reset_index(drop=True)
df_filtered.columns=df_filtered.iloc[0].str.replace(r'\n', '', regex=True)
df_filtered = df_filtered.loc[1:,:]

for column in df_filtered.columns:
    df_filtered[column] = df_filtered[column].replace(r'\n', '', regex=True)

df_filtered

Unnamed: 0,relief distributed,District,Rice (in Q),Dal (in Q),Salt (in Q),M. Oil (in L),Cattle Feed - Green Fooder (in Q),Cattle Feed - Wheat Bran (in Q),Cattle Feed - Rice Bran (in Q),Unnamed: 10,Unnamed: 11
1,,Dhemaji,111.21,20.18,6.05,605.0,0.0,0.0,0.0,,
2,,Total,111.21,20.18,6.05,605.0,0.0,0.0,0.0,,


In [16]:
df_filtered['District'] = df_filtered['District'].replace('',None).fillna(method='bfill')

#This code snippet is useful when there is any column in the table that has multiple rows per district.
#g = df_filtered.groupby('District')['Details'].transform(lambda x: ' '.join(x))
#df_filtered['Details'] = g
df_filtered.drop_duplicates()

Unnamed: 0,relief distributed,District,Rice (in Q),Dal (in Q),Salt (in Q),M. Oil (in L),Cattle Feed - Green Fooder (in Q),Cattle Feed - Wheat Bran (in Q),Cattle Feed - Rice Bran (in Q),Unnamed: 10,Unnamed: 11
1,,Dhemaji,111.21,20.18,6.05,605.0,0.0,0.0,0.0,,
2,,Total,111.21,20.18,6.05,605.0,0.0,0.0,0.0,,


## MISC <a class="anchor" id="misc"></a>

In [82]:
dates = []
for file in glob.glob(r'FRIMS_Reports/FRIMS_*.08*.pdf'):
    date = file.split('FRIMS_')[-1].split('.pdf')[0]
    dates.append(date)

# Store PDFs that throw an error in this list.
issue_dates = []

In [83]:
slug_lists = [['relief distributed'],
              ['relief distributed others'],
              ['rescue operation'],
              ['houses damaged','house damaged'],
              ['human lives lost - confirmed'],
              ['human lives lost confirmed - death type'],
              ['human lives lost - missing'],
              ['human lives lost missing - type']
              ]

folder_slug_dict = dict()
folder_slug_dict[0] ='RELIEF_DISTRIBUTED'
folder_slug_dict[1] ='RELIEF_DISTRIBUTED_OTHERS'
folder_slug_dict[2] ='RESCUE_OPERATION'
folder_slug_dict[3] ='HOUSES_DAMAGED'
folder_slug_dict[4] ='HUMAN_LIVES_LOST_CONFIRMED'
folder_slug_dict[5] ='HUMAN_LIVES_LOST_CONFIRMED_DEATHTYPE'
folder_slug_dict[6] ='HUMAN_LIVES_LOST_MISSING'
folder_slug_dict[7] ='HUMAN_LIVES_LOST_MISSING_TYPE'


In [84]:
relief_distributed_issue_dates = []
relief_distributed_others_issue_dates = []
rescue_ops_issue_dates = []
houses_damaged_issue_dates = []
humanlives_confirmed_issues_dates = []
humanlives_confirmed_deathtype_issues_dates = []
humanlives_missing_issues_dates = []
humanlives_missing_type_issues_dates = []

issues_dates = [relief_distributed_issue_dates,
                relief_distributed_others_issue_dates,
                rescue_ops_issue_dates,
                houses_damaged_issue_dates,
                humanlives_confirmed_issues_dates,
                humanlives_confirmed_deathtype_issues_dates,
                humanlives_missing_issues_dates,
                humanlives_missing_type_issues_dates
               ]

In [85]:
for date in dates:    
    print(date)
    FRIMS_pdf_file = r"FRIMS_Reports/FRIMS_"+date+".pdf"
    tables = camelot.read_pdf(FRIMS_pdf_file,pages='all')
    
    FRIMS_DF = pd.DataFrame()
    
    for i in range(0,len(tables)):
        FRIMS_DF = pd.concat([FRIMS_DF,tables[i].df],axis=0, ignore_index=True)
    
    FRIMS_DF[0] = FRIMS_DF[0].str.replace(r'\n','',regex=True)
    FRIMS_DF[0] = FRIMS_DF[0].str.lower()
    
    for list_number, slug_list in enumerate(slug_lists):
        folder_slug = folder_slug_dict[list_number]
        print(folder_slug)
        
        try:
            TABLE_START_INDEX = get_table_start_index(FRIMS_DF, slug_list)
        except:
            issues_dates[list_number].append(date)
            print('Issue with table - Row header across multiple pages')
            print("----")
            continue
        
        TABLE_END_INDEX = get_table_end_index(FRIMS_DF, TABLE_START_INDEX)

        if TABLE_END_INDEX-1 <= TABLE_START_INDEX:
            print("No data for: ",date)
            #done_dates.append(date)
            print("----")
            continue
        
        
        try:   
            FRIMS_INFRA_DAMAGES_DF = extract_infra_damages_data(FRIMS_DF, TABLE_START_INDEX, TABLE_END_INDEX)
        except:
            print("Unable to extract for: ",date)
            issues_dates[list_number].append(date)
            print("----")
            continue
            
        #FRIMS_INFRA_DAMAGES_DF['District'] = FRIMS_INFRA_DAMAGES_DF['District'].replace('',None).fillna(method='bfill')
        FRIMS_INFRA_DAMAGES_DF_CLEANED = FRIMS_INFRA_DAMAGES_DF.drop_duplicates()
        
        
        #done_dates.append(date)
        date = date.replace('.','-')
        FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date
        FRIMS_INFRA_DAMAGES_DF_CLEANED.reset_index(drop=True).to_csv(r'Data/Scraped Data/'+folder_slug+r'/'+folder_slug+'_'+str(date)+'.csv')
        print('----')

01.08.2022
RELIEF_DISTRIBUTED
----
RELIEF_DISTRIBUTED_OTHERS
----
RESCUE_OPERATION
----
HOUSES_DAMAGED
----
HUMAN_LIVES_LOST_CONFIRMED
No data for:  01-08-2022
----
HUMAN_LIVES_LOST_CONFIRMED_DEATHTYPE
No data for:  01-08-2022
----
HUMAN_LIVES_LOST_MISSING
No data for:  01-08-2022
----
HUMAN_LIVES_LOST_MISSING_TYPE
No data for:  01-08-2022
----
02.08.2022
RELIEF_DISTRIBUTED
----
RELIEF_DISTRIBUTED_OTHERS
----
RESCUE_OPERATION
No data for:  02-08-2022
----
HOUSES_DAMAGED
No data for:  02-08-2022
----
HUMAN_LIVES_LOST_CONFIRMED
No data for:  02-08-2022
----
HUMAN_LIVES_LOST_CONFIRMED_DEATHTYPE
No data for:  02-08-2022
----
HUMAN_LIVES_LOST_MISSING
No data for:  02-08-2022
----
HUMAN_LIVES_LOST_MISSING_TYPE
No data for:  02-08-2022
----
03.08.2022
RELIEF_DISTRIBUTED
----
RELIEF_DISTRIBUTED_OTHERS
----
RESCUE_OPERATION
No data for:  03-08-2022
----
HOUSES_DAMAGED
----
HUMAN_LIVES_LOST_CONFIRMED
No data for:  03-08-2022
----
HUMAN_LIVES_LOST_CONFIRMED_DEATHTYPE
No data for:  03-08-2022
----

RELIEF_DISTRIBUTED
No data for:  21.08.2022
----
RELIEF_DISTRIBUTED_OTHERS
No data for:  21.08.2022
----
RESCUE_OPERATION
No data for:  21.08.2022
----
HOUSES_DAMAGED
No data for:  21.08.2022
----
HUMAN_LIVES_LOST_CONFIRMED
No data for:  21.08.2022
----
HUMAN_LIVES_LOST_CONFIRMED_DEATHTYPE
No data for:  21.08.2022
----
HUMAN_LIVES_LOST_MISSING
No data for:  21.08.2022
----
HUMAN_LIVES_LOST_MISSING_TYPE
No data for:  21.08.2022
----
22.08.2022
RELIEF_DISTRIBUTED
No data for:  22.08.2022
----
RELIEF_DISTRIBUTED_OTHERS
Issue with table - Row header across multiple pages
----
RESCUE_OPERATION
No data for:  22.08.2022
----
HOUSES_DAMAGED
----
HUMAN_LIVES_LOST_CONFIRMED
No data for:  22-08-2022
----
HUMAN_LIVES_LOST_CONFIRMED_DEATHTYPE
No data for:  22-08-2022
----
HUMAN_LIVES_LOST_MISSING
No data for:  22-08-2022
----
HUMAN_LIVES_LOST_MISSING_TYPE
No data for:  22-08-2022
----
23.08.2022
RELIEF_DISTRIBUTED
No data for:  23.08.2022
----
RELIEF_DISTRIBUTED_OTHERS
No data for:  23.08.2022
----

In [87]:
issues_df = pd.DataFrame(issues_dates).T
issues_df.columns = folder_slug_dict.values()
issues_df.to_csv('ISSUES.csv',index=False)

In [95]:
scraped_files_daily = glob.glob(r'Data/Scraped Data/HUMAN_LIVES_LOST_MISSING_TYPE/*.csv')

dfs = []
for file in scraped_files_daily:
    df = pd.read_csv(file)
    df = df.iloc[:-1,:]
    dfs.append(df)
    
RELIEF_DISTRIBUTED = pd.concat(dfs)
RELIEF_DISTRIBUTED['Date'] = pd.to_datetime(RELIEF_DISTRIBUTED['Date'],format='%d-%m-%Y')
RELIEF_DISTRIBUTED = RELIEF_DISTRIBUTED.sort_values(by='Date')
RELIEF_DISTRIBUTED.to_csv('Data/Cleaned Data/FRIMS_HUMAN_LIVES_LOST_MISSING_TYPE_MASTER_2022.csv')

# ANIMALS <a class="anchor" id="animals"></a>

In [39]:
dates = []
for file in glob.glob(r'FRIMS_Reports/FRIMS_*.10*.pdf'):
    date = file.split('FRIMS_')[-1].split('.pdf')[0]
    dates.append(date)
#done_dates = []
issue_dates = []

In [40]:
slug_lists = [['animals affected','animal affected'],
              ['animals washed away','animal washed away'],
              ]

folder_slug_dict = dict()
folder_slug_dict[0] ='FRIMS_ANIMALS_AFFECTED'
folder_slug_dict[1] ='FRIMS_ANIMALS_WASHED'

In [41]:
animals_affected_issue_dates = []
animals_washed_issue_dates = []

issues_dates = [
                animals_affected_issue_dates,
    animals_washed_issue_dates,
               ]

In [42]:
column_names = ['Total','Big','Small','Poultry']

In [43]:
for date in dates:    
    print(date)
    FRIMS_pdf_file = r"FRIMS_Reports/FRIMS_"+date+".pdf"
    tables = camelot.read_pdf(FRIMS_pdf_file,pages='all')
    
    FRIMS_DF = pd.DataFrame()
    
    for i in range(0,len(tables)):
        FRIMS_DF = pd.concat([FRIMS_DF,tables[i].df],axis=0, ignore_index=True)
    
    FRIMS_DF[0] = FRIMS_DF[0].str.replace(r'\n','',regex=True)
    FRIMS_DF[0] = FRIMS_DF[0].str.lower()
    for list_number, slug_list in enumerate(slug_lists):
        folder_slug = folder_slug_dict[list_number]
        print(folder_slug)
        
        try:
            TABLE_START_INDEX = get_table_start_index(FRIMS_DF, slug_list)
        except:
            issues_dates[list_number].append(date)
            print('Issue with animals table - Row header across multiple pages')
            print("----")
            continue
        
        TABLE_END_INDEX = get_table_end_index(FRIMS_DF, TABLE_START_INDEX)

        if TABLE_END_INDEX-1 <= TABLE_START_INDEX:
            print("No data for: ",date)
            print("----")
            continue
        
        try:   
            FRIMS_INFRA_DAMAGES_DF = extract_infra_damages_data(FRIMS_DF, TABLE_START_INDEX, TABLE_END_INDEX, column_names)
        except:
            print("Unable to extract for: ",date)
            issues_dates[list_number].append(date)
            print("----")
            continue
            
        #FRIMS_INFRA_DAMAGES_DF['District'] = FRIMS_INFRA_DAMAGES_DF['District'].replace('',None).fillna(method='bfill')
        FRIMS_INFRA_DAMAGES_DF_CLEANED = FRIMS_INFRA_DAMAGES_DF.drop_duplicates()
        
        
        date = date.replace('.','-')
        FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date
        FRIMS_INFRA_DAMAGES_DF_CLEANED.reset_index(drop=True).to_csv(r'Data/Scraped Data/'+folder_slug+r'/'+folder_slug+'_'+str(date)+'.csv')
        print('----')

01.10.2022
FRIMS_ANIMALS_AFFECTED
No data for:  01.10.2022
----
FRIMS_ANIMALS_WASHED
No data for:  01.10.2022
----
02.10.2022
FRIMS_ANIMALS_AFFECTED
No data for:  02.10.2022
----
FRIMS_ANIMALS_WASHED
No data for:  02.10.2022
----
03.10.2022
FRIMS_ANIMALS_AFFECTED
No data for:  03.10.2022
----
FRIMS_ANIMALS_WASHED
No data for:  03.10.2022
----
04.10.2022
FRIMS_ANIMALS_AFFECTED
No data for:  04.10.2022
----
FRIMS_ANIMALS_WASHED
No data for:  04.10.2022
----
05.10.2022
FRIMS_ANIMALS_AFFECTED
No data for:  05.10.2022
----
FRIMS_ANIMALS_WASHED
No data for:  05.10.2022
----
06.10.2022
FRIMS_ANIMALS_AFFECTED
No data for:  06.10.2022
----
FRIMS_ANIMALS_WASHED
No data for:  06.10.2022
----
07.10.2022
FRIMS_ANIMALS_AFFECTED
No data for:  07.10.2022
----
FRIMS_ANIMALS_WASHED
No data for:  07.10.2022
----
08.10.2022
FRIMS_ANIMALS_AFFECTED
No data for:  08.10.2022
----
FRIMS_ANIMALS_WASHED
No data for:  08.10.2022
----
09.10.2022
FRIMS_ANIMALS_AFFECTED
----
FRIMS_ANIMALS_WASHED
No data for:  09-10-

In [44]:
issues_df = pd.DataFrame(issues_dates).T
issues_df.columns = folder_slug_dict.values()
issues_df.to_csv('ISSUES.csv',index=False)

## PDFs on these dates are to be manually scraped.

In [46]:
scraped_files_daily = glob.glob(r'Data/Scraped Data/FRIMS_ANIMALS_AFFECTED/*.csv')

dfs = []
for file in scraped_files_daily:
    df = pd.read_csv(file)
    df = df.iloc[:-1,:]
    dfs.append(df)
    
FRIMS_ANIMALS_AFFECTED = pd.concat(dfs)
FRIMS_ANIMALS_AFFECTED['Date'] = pd.to_datetime(FRIMS_ANIMALS_AFFECTED['Date'],format='%d-%m-%Y')
FRIMS_ANIMALS_AFFECTED = FRIMS_ANIMALS_AFFECTED.sort_values(by='Date')
FRIMS_ANIMALS_AFFECTED.to_csv('Data/Cleaned Data/FRIMS_ANIMALS_AFFECTED_MASTER_2022.csv')

# OTHER DAMAGES <a class="anchor" id="otherdamages"></a>

In [59]:
dates = []
for file in glob.glob('FRIMS_Reports/FRIMS_*.08*.pdf'):
    date = file.split('FRIMS_')[-1].split('.pdf')[0]
    dates.append(date)
issue_dates = []

In [60]:
slug_lists = [['infrastructure damaged - others','infrastructure damaged - other'],
              ['infrastructure damaged - wildlife'],
              ['erosion'],
              ['landslide','land slide','landslides','landslide'],
              ['urban flood','urbanflood','urbanfloods','urban floods']
              ]

folder_slug_dict = dict()
folder_slug_dict[0] ='FRIMS_OTHER_DAMAGES'
folder_slug_dict[1] ='FRIMS_WILDLIFE'
folder_slug_dict[2] ='FRIMS_EROSION'
folder_slug_dict[3] ='FRIMS_LANDSLIDE'
folder_slug_dict[4] ='FRIMS_URBANFLOOD'

In [61]:
other_damages_issue_dates = []
wildlife_issue_dates = []
erosion_issue_dates = []
landslide_issue_dates = []
urbanflood_issue_dates = []

issues_dates = [other_damages_issue_dates,
                wildlife_issue_dates,
               erosion_issue_dates,
                landslide_issue_dates,
                urbanflood_issue_dates
               ]

In [62]:
for date in dates:    
    print(date)
    FRIMS_pdf_file = r"FRIMS_Reports/FRIMS_"+date+".pdf"
    tables = camelot.read_pdf(FRIMS_pdf_file,pages='all')
    
    FRIMS_DF = pd.DataFrame()
    
    for i in range(0,len(tables)):
        FRIMS_DF = pd.concat([FRIMS_DF,tables[i].df],axis=0, ignore_index=True)
    
    FRIMS_DF[0] = FRIMS_DF[0].str.replace(r'\n','',regex=True)
    FRIMS_DF[0] = FRIMS_DF[0].str.lower()
    for list_number, slug_list in enumerate(slug_lists):
        folder_slug = folder_slug_dict[list_number]
        print(folder_slug)
        
        if folder_slug=='FRIMS_OTHER_DAMAGES':
            column_name = 'Other Details'
        elif folder_slug == 'FRIMS_WILDLIFE':
            column_name = 'Wildlife affected under protected areas description'
        else:
            column_name = 'Details'
        
        try:
            TABLE_START_INDEX = get_table_start_index(FRIMS_DF, slug_list)
        except:
            issues_dates[list_number].append(date)
            print('Issue with infra damages table - Row header across multiple pages')
            print("----")
            continue
            
        if folder_slug=='FRIMS_URBANFLOOD':
            TABLE_END_INDEX = TABLE_START_INDEX+100
        else:
            TABLE_END_INDEX = get_table_end_index(FRIMS_DF, TABLE_START_INDEX)

            if TABLE_END_INDEX-1 <= TABLE_START_INDEX:
                print("No data for: ",date)
                print("----")
                continue
        
        try:   
            FRIMS_INFRA_DAMAGES_DF = extract_infra_damages_data(FRIMS_DF, TABLE_START_INDEX, TABLE_END_INDEX, column_name)
        except:
            print("No dataa for: ",date)
            print("----")
            continue
        
        try:
            FRIMS_INFRA_DAMAGES_DF['District'] = FRIMS_INFRA_DAMAGES_DF['District'].replace('',None).fillna(method='bfill')
            g = FRIMS_INFRA_DAMAGES_DF.groupby('District')[column_name].transform(lambda x: ' '.join(x))
        except:
            print('Issues with cleaning and combining')
            issues_dates[list_number].append(date)
            print("----")
            continue
        FRIMS_INFRA_DAMAGES_DF[column_name] = g
        FRIMS_INFRA_DAMAGES_DF_CLEANED = FRIMS_INFRA_DAMAGES_DF.drop_duplicates()
        
        date = date.replace('.','-')
        FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date
        FRIMS_INFRA_DAMAGES_DF_CLEANED.reset_index(drop=True).to_csv(r'Data/Scraped Data/'+folder_slug+r'/'+folder_slug+'_'+str(date)+'.csv')
        print('----')

01.08.2022
FRIMS_OTHER_DAMAGES
----
FRIMS_WILDLIFE
No data for:  01-08-2022
----
FRIMS_EROSION
----
FRIMS_LANDSLIDE
No data for:  01-08-2022
----
FRIMS_URBANFLOOD
----
02.08.2022
FRIMS_OTHER_DAMAGES
----
FRIMS_WILDLIFE
No data for:  02-08-2022
----
FRIMS_EROSION
----
FRIMS_LANDSLIDE
No data for:  02-08-2022
----
FRIMS_URBANFLOOD
----
03.08.2022
FRIMS_OTHER_DAMAGES
----
FRIMS_WILDLIFE
No data for:  03-08-2022
----
FRIMS_EROSION
----
FRIMS_LANDSLIDE
----
FRIMS_URBANFLOOD
----
04.08.2022
FRIMS_OTHER_DAMAGES
----
FRIMS_WILDLIFE
No data for:  04-08-2022
----
FRIMS_EROSION
----
FRIMS_LANDSLIDE
No data for:  04-08-2022
----
FRIMS_URBANFLOOD
----
05.08.2022
FRIMS_OTHER_DAMAGES
----
FRIMS_WILDLIFE
No data for:  05-08-2022
----
FRIMS_EROSION
----
FRIMS_LANDSLIDE
No data for:  05-08-2022
----
FRIMS_URBANFLOOD
----
06.08.2022
FRIMS_OTHER_DAMAGES
----
FRIMS_WILDLIFE
No data for:  06-08-2022
----
FRIMS_EROSION
----
FRIMS_LANDSLIDE
No data for:  06-08-2022
----
FRIMS_URBANFLOOD
----
07.08.2022
FRIMS_

In [63]:
issues_df = pd.DataFrame(issues_dates).T
issues_df.columns = folder_slug_dict.values()
issues_df.to_csv('ISSUES.csv',index=False)

In [77]:
scraped_files_daily = glob.glob(r'Data/Scraped Data/FRIMS_URBANFLOOD/*.csv')

dfs = []
for file in scraped_files_daily:
    df = pd.read_csv(file)
    df = df.iloc[:,:]
    dfs.append(df)
    
FRIMS_EROSION = pd.concat(dfs)
FRIMS_EROSION['Date'] = pd.to_datetime(FRIMS_EROSION['Date'],format='%d-%m-%Y')
FRIMS_EROSION = FRIMS_EROSION.sort_values(by='Date')
FRIMS_EROSION.to_csv('Data/Cleaned Data/FRIMS_URBANFLOOD_MASTER_2022.csv')

# INFRA DAMAGES <a class="anchor" id="infradamages"></a>

In [82]:
dates = []
for file in glob.glob('FRIMS_Reports/FRIMS_*.08*.pdf'):
    date = file.split('FRIMS_')[-1].split('.pdf')[0]
    dates.append(date)
for file in glob.glob('FRIMS_Reports/FRIMS_*.09*.pdf'):
    date = file.split('FRIMS_')[-1].split('.pdf')[0]
    dates.append(date)
for file in glob.glob('FRIMS_Reports/FRIMS_*.10*.pdf'):
    date = file.split('FRIMS_')[-1].split('.pdf')[0]
    dates.append(date)
issue_dates = []

In [83]:
slug_lists = [['infrastructure damaged - road','infrastructure damaged - roads'],
              ['infrastructure damaged - embankments affected','infrastructure damaged - embankment affected'],
              ['infrastructure damaged - bridge','infrastructure damaged - bridges'],
              ['infrastructure damaged - embankments breached','infrastructure damaged - embankment breached'],
              ]

folder_slug_dict = dict()
folder_slug_dict[0] ='FRIMS_ROADS_DAMAGED'
folder_slug_dict[1] ='FRIMS_EMBANKMENTS_AFFECTED'
folder_slug_dict[2] ='FRIMS_BRIDGES_DAMAGED'
folder_slug_dict[3] ='FRIMS_EMBANKMENTS_BREACHED'

In [84]:
road_issue_dates = []
embankment_affected_issue_dates = []
bridge_issue_dates = []
embankment_breached_issue_dates = []

issues_dates = [road_issue_dates,
                embankment_affected_issue_dates,
               bridge_issue_dates,
                embankment_breached_issue_dates,
               ]

In [85]:
for date in dates:    
    print(date)
    FRIMS_pdf_file = r"FRIMS_Reports/FRIMS_"+date+".pdf"
    tables = camelot.read_pdf(FRIMS_pdf_file,pages='all')
    
    FRIMS_DF = pd.DataFrame()
    
    for i in range(0,len(tables)):
        FRIMS_DF = pd.concat([FRIMS_DF,tables[i].df],axis=0, ignore_index=True)
    
    FRIMS_DF[0] = FRIMS_DF[0].str.replace(r'\n','',regex=True)
    FRIMS_DF[0] = FRIMS_DF[0].str.lower()
    
    for list_number, slug_list in enumerate(slug_lists):
        folder_slug = folder_slug_dict[list_number]
        print(folder_slug)
        
        if folder_slug=='FRIMS_OTHER_DAMAGES':
            column_name = 'Other Details'
        elif folder_slug == 'FRIMS_WILDLIFE':
            column_name = 'Wildlife affected under protected areas description'
        else:
            column_name = 'Details'
        
        try:
            TABLE_START_INDEX = get_table_start_index(FRIMS_DF, slug_list)
        except:
            issues_dates[list_number].append(date)
            print('Issue with infra damages table - Row header across multiple pages')
            print("----")
            continue
            
        if folder_slug=='FRIMS_URBANFLOOD':
            TABLE_END_INDEX = TABLE_START_INDEX+100
        else:
            TABLE_END_INDEX = get_table_end_index(FRIMS_DF, TABLE_START_INDEX)

            if TABLE_END_INDEX-1 <= TABLE_START_INDEX:
                print("No data for: ",date)
                #done_dates.append(date)
                print("----")
                continue
        
        try:   
            FRIMS_INFRA_DAMAGES_DF = extract_infra_damages_data(FRIMS_DF, TABLE_START_INDEX, TABLE_END_INDEX, column_name)
        except:
            print("No dataa for: ",date)
            #issues_dates[list_number].append(date)
            print("----")
            continue
        
        try:
            FRIMS_INFRA_DAMAGES_DF['District'] = FRIMS_INFRA_DAMAGES_DF['District'].replace('',None).fillna(method='bfill')
            g = FRIMS_INFRA_DAMAGES_DF.groupby('District')[column_name].transform(lambda x: ' '.join(x))
        except:
            print('Issues with cleaning and combining')
            issues_dates[list_number].append(date)
            print("----")
            continue
            
        FRIMS_INFRA_DAMAGES_DF[column_name] = g
        FRIMS_INFRA_DAMAGES_DF_CLEANED = FRIMS_INFRA_DAMAGES_DF.drop_duplicates()
        
        
        date = date.replace('.','-')
        FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date
        FRIMS_INFRA_DAMAGES_DF_CLEANED.reset_index(drop=True).to_csv(r'Data/Scraped Data/'+folder_slug+r'/'+folder_slug+'_'+str(date)+'.csv')
        print('----')

01.08.2022
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
----
FRIMS_BRIDGES_DAMAGED
----
FRIMS_EMBANKMENTS_BREACHED
----
02.08.2022
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
Issue with infra damages table - Row header across multiple pages
----
FRIMS_BRIDGES_DAMAGED
No data for:  02-08-2022
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  02-08-2022
----
03.08.2022
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
No data for:  03-08-2022
----
FRIMS_BRIDGES_DAMAGED
No data for:  03-08-2022
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  03-08-2022
----
04.08.2022
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
----
FRIMS_BRIDGES_DAMAGED
No data for:  04-08-2022
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  04-08-2022
----
05.08.2022
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
----
FRIMS_BRIDGES_DAMAGED
No data for:  05-08-2022
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  05-08-2022
----
06.08.2022
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
----
FRIMS_

FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
No data for:  08-09-2022
----
FRIMS_BRIDGES_DAMAGED
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  08-09-2022
----
09.09.2022
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
No data for:  09-09-2022
----
FRIMS_BRIDGES_DAMAGED
No data for:  09-09-2022
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  09-09-2022
----
10.09.2022
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
No data for:  10-09-2022
----
FRIMS_BRIDGES_DAMAGED
No data for:  10-09-2022
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  10-09-2022
----
11.09.2022
FRIMS_ROADS_DAMAGED
No data for:  11.09.2022
----
FRIMS_EMBANKMENTS_AFFECTED
No data for:  11.09.2022
----
FRIMS_BRIDGES_DAMAGED
No data for:  11.09.2022
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  11.09.2022
----
12.09.2022
FRIMS_ROADS_DAMAGED
----
FRIMS_EMBANKMENTS_AFFECTED
No data for:  12-09-2022
----
FRIMS_BRIDGES_DAMAGED
No data for:  12-09-2022
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  12-09-2022
----
1

FRIMS_ROADS_DAMAGED
No data for:  17.10.2022
----
FRIMS_EMBANKMENTS_AFFECTED
Issues with cleaning and combining
----
FRIMS_BRIDGES_DAMAGED
No data for:  17.10.2022
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  17.10.2022
----
18.10.2022
FRIMS_ROADS_DAMAGED
No data for:  18.10.2022
----
FRIMS_EMBANKMENTS_AFFECTED
No data for:  18.10.2022
----
FRIMS_BRIDGES_DAMAGED
No data for:  18.10.2022
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  18.10.2022
----
19.10.2022
FRIMS_ROADS_DAMAGED
No data for:  19.10.2022
----
FRIMS_EMBANKMENTS_AFFECTED
No data for:  19.10.2022
----
FRIMS_BRIDGES_DAMAGED
No data for:  19.10.2022
----
FRIMS_EMBANKMENTS_BREACHED
Issue with infra damages table - Row header across multiple pages
----
20.10.2022
FRIMS_ROADS_DAMAGED
No data for:  20.10.2022
----
FRIMS_EMBANKMENTS_AFFECTED
No data for:  20.10.2022
----
FRIMS_BRIDGES_DAMAGED
No data for:  20.10.2022
----
FRIMS_EMBANKMENTS_BREACHED
No data for:  20.10.2022
----


In [87]:
issues_df = pd.DataFrame(issues_dates).T
issues_df.columns = folder_slug_dict.values()
issues_df.to_csv('ISSUES.csv',index=False)

In [91]:
scraped_files_daily = glob.glob(r'Data/Scraped Data/FRIMS_EMBANKMENTS_BREACHED/*.csv')

dfs = []
for file in scraped_files_daily:
    df = pd.read_csv(file)
    df = df.iloc[:-1,:]
    dfs.append(df)
    
FRIMS_BRIDGES_DAMAGED = pd.concat(dfs)
FRIMS_BRIDGES_DAMAGED['Date'] = pd.to_datetime(FRIMS_BRIDGES_DAMAGED['Date'],format='%d-%m-%Y')
FRIMS_BRIDGES_DAMAGED = FRIMS_BRIDGES_DAMAGED.sort_values(by='Date')
FRIMS_BRIDGES_DAMAGED=FRIMS_BRIDGES_DAMAGED[['District','Number','Details','Date']]
FRIMS_BRIDGES_DAMAGED.drop_duplicates().dropna().to_csv('Data/Cleaned Data/FRIMS_DAMAGE_EMBANKMENT_BREACHED_MASTER_2022.csv')

# Population affected, Crop Area Affected, Relief camps <a class="anchor" id="population"></a>

In [255]:
dates = []
for file in glob.glob('FRIMS_Reports/FRIMS_*.pdf'):
    date = file.split('FRIMS_')[-1].split('.pdf')[0]
    dates.append(date)

done_dates = []

In [256]:
slug_lists = [['population and crop area affected'],
              ['relief camps /centres opened','relief camps / centres opened'],
              ['inmates in relief camps'],
              ]

folder_slug_dict = dict()
folder_slug_dict[0] ='POPULATION_AND_CROP_AREA_AFFECTED'
folder_slug_dict[1] ='RELIEF_CAMPS_OPENED'
folder_slug_dict[2] ='RELIEF_CAMP_INMATES'

In [257]:
pop_issue_dates = []
camps_issue_dates = []
inmates_issue_dates = []

issues_dates = [pop_issue_dates,
                camps_issue_dates,
               inmates_issue_dates,
               ]

In [270]:
for date in list(set(dates)-set(done_dates)):
    done_dates.append(date)
    print(date)
    FRIMS_pdf_file = r"FRIMS_Reports/FRIMS_"+date+".pdf"
    tables = camelot.read_pdf(FRIMS_pdf_file,pages='all')
    
    FRIMS_DF = pd.DataFrame()
    for i in range(0,len(tables)):
        FRIMS_DF = pd.concat([FRIMS_DF,tables[i].df],axis=0, ignore_index=True)
    
    FRIMS_DF[0] = FRIMS_DF[0].str.replace(r'\n','',regex=True)
    FRIMS_DF[0] = FRIMS_DF[0].str.lower()
    
    
    for list_number, slug_list in enumerate(slug_lists):
        folder_slug = folder_slug_dict[list_number]
        print(folder_slug)
        
        if folder_slug=='POPULATION_AND_CROP_AREA_AFFECTED':
            column_name = ['Population Details','Crop Area Details']
        elif folder_slug == 'RELIEF_CAMPS_OPENED':
            column_name = ['Relief Camp','Relief Distribution Centres']
        elif folder_slug == 'RELIEF_CAMP_INMATES':
            column_name = ['Revenue Circlewise']
        
        try:
            TABLE_START_INDEX = get_table_start_index(FRIMS_DF, slug_list)
        except:
            issues_dates[list_number].append(date)
            print('Issue with table - Row header across multiple pages')
            print("----")
            continue
        
        TABLE_END_INDEX = get_table_end_index(FRIMS_DF, TABLE_START_INDEX)

        if TABLE_END_INDEX-1 <= TABLE_START_INDEX:
            print("No data for: ",date)
            print("----")
            continue
        
        try:   
            FRIMS_INFRA_DAMAGES_DF = extract_infra_damages_data(FRIMS_DF, TABLE_START_INDEX, TABLE_END_INDEX, column_names).reset_index(drop=True)
        except:
            print("Unable to extract for: ",date)
            issues_dates[list_number].append(date)
            print("----")
            continue
        
        FRIMS_INFRA_DAMAGES_DF['District'] = FRIMS_INFRA_DAMAGES_DF['District'].replace('',None).fillna(method='bfill')
        
        try:
            FRIMS_INFRA_DAMAGES_DF[column_name]
        except:
            print('Issues with cleaning and combining')
            issues_dates[list_number].append(date)
            print("----")
            continue
            
        for idx, row in FRIMS_INFRA_DAMAGES_DF.loc[:-1,:].iterrows():
            for column in column_name:
                if len(row[column])<10:
                    FRIMS_INFRA_DAMAGES_DF.loc[idx,column] = FRIMS_INFRA_DAMAGES_DF.iloc[idx,1+list(FRIMS_INFRA_DAMAGES_DF.columns).index(column)]
            
                
        for column in column_name:
            FRIMS_INFRA_DAMAGES_DF[column] = FRIMS_INFRA_DAMAGES_DF[column].astype(str)
            g = FRIMS_INFRA_DAMAGES_DF.groupby('District')[column].transform(lambda x: ' '.join(x))
            FRIMS_INFRA_DAMAGES_DF[column] = g
            
        
        FRIMS_INFRA_DAMAGES_DF_CLEANED = FRIMS_INFRA_DAMAGES_DF.drop_duplicates(subset='District',keep='first')
        
        FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')
        FRIMS_INFRA_DAMAGES_DF_CLEANED.reset_index(drop=True).to_csv(r'Data/Scraped Data/'+folder_slug+r'/'+folder_slug+'_'+str(date.replace('.','-'))+'.csv')
        
        print('----')

07.09.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  07.09.2022
----
RELIEF_CAMP_INMATES
No data for:  07.09.2022
----
06.07.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
27.08.2022
POPULATION_AND_CROP_AREA_AFFECTED
No data for:  27.08.2022
----
RELIEF_CAMPS_OPENED
No data for:  27.08.2022
----
RELIEF_CAMP_INMATES
No data for:  27.08.2022
----
15.05.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
17.08.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  17.08.2022
----
RELIEF_CAMP_INMATES
No data for:  17.08.2022
----
18.07.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
11.09.2022
POPULATION_AND_CROP_AREA_AFFECTED
No data for:  11.09.2022
----
RELIEF_CAMPS_OPENED
No data for:  11.09.2022
----
RELIEF_CAMP_INMATES
No data for:  11.09.2022
----
16.10.2022
POPULATION_AND_CROP_AREA_AFFECTED
----


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
29.09.2022
POPULATION_AND_CROP_AREA_AFFECTED
No data for:  29.09.2022
----
RELIEF_CAMPS_OPENED
No data for:  29.09.2022
----
RELIEF_CAMP_INMATES
No data for:  29.09.2022
----
20.10.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  20.10.2022
----
RELIEF_CAMP_INMATES
No data for:  20.10.2022
----
06.09.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  06.09.2022
----
RELIEF_CAMP_INMATES
No data for:  06.09.2022
----
09.06.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
No data for:  09.06.2022
----
11.10.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
Issues with cleaning and combining
----
RELIEF_CAMP_INMATES
No data for:  11.10.2022
----
30.08.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  30.08.2022
----
RELIEF_CAMP_INMATES
No data for:  30.08.2022
----
1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
04.10.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
No data for:  04.10.2022
----
RELIEF_CAMPS_OPENED
No data for:  04.10.2022
----
RELIEF_CAMP_INMATES
No data for:  04.10.2022
----
22.08.2022
POPULATION_AND_CROP_AREA_AFFECTED
Issues with cleaning and combining
----
RELIEF_CAMPS_OPENED
No data for:  22.08.2022
----
RELIEF_CAMP_INMATES
No data for:  22.08.2022
----
31.07.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
24.08.2022
POPULATION_AND_CROP_AREA_AFFECTED
No data for:  24.08.2022
----
RELIEF_CAMPS_OPENED
No data for:  24.08.2022
----
RELIEF_CAMP_INMATES
No data for:  24.08.2022
----
31.08.2022
POPULATION_AND_CROP_AREA_AFFECTED
No data for:  31.08.2022
----
RELIEF_CAMPS_OPENED
No data for:  31.08.2022
----
RELIEF_CAMP_INMATES
No data for:  31.08.2022
----
23.08.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  23.08.2022
----
RELIEF_CAMP_INMATES
No data for:  23.08.2022
----
21.07.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
R

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
No data for:  30.09.2022
----
RELIEF_CAMPS_OPENED
No data for:  30.09.2022
----
RELIEF_CAMP_INMATES
No data for:  30.09.2022
----
15.06.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
19.05.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
06.10.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
No data for:  06.10.2022
----
RELIEF_CAMPS_OPENED
No data for:  06.10.2022
----
RELIEF_CAMP_INMATES
No data for:  06.10.2022
----
27.06.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
18.09.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
No data for:  18.09.2022
----
RELIEF_CAMPS_OPENED
No data for:  18.09.2022
----
RELIEF_CAMP_INMATES
No data for:  18.09.2022
----
28.08.2022
POPULATION_AND_CROP_AREA_AFFECTED
No data for:  28.08.2022
----
RELIEF_CAMPS_OPENED
No data for:  28.08.2022
----
RELIEF_CAMP_INMATES
No data for:  28.08.2022
----
08.10.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  08.10.2022
----
RELIEF_CAMP_INMATES
No data for:  08.10.2022
----
15.09.2022
POPULATION_AND_CROP_AREA_AFFECTED
No data for:  15.09.2022
----
RELIEF_CAMPS_OPENED
No data for:  15.09.2022
----
RELIEF_CAMP_INMATES
No data for:  15.09.2022
----
13.09.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  13.09.2022
----
RELIEF_CAMP_INMATES
No data for:  13.09.2022
----
28.07.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
08.09.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
Issues with clea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  16.09.2022
----
RELIEF_CAMP_INMATES
No data for:  16.09.2022
----
20.08.2022
POPULATION_AND_CROP_AREA_AFFECTED
No data for:  20.08.2022
----
RELIEF_CAMPS_OPENED
No data for:  20.08.2022
----
RELIEF_CAMP_INMATES
No data for:  20.08.2022
----
15.08.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  15.08.2022
----
RELIEF_CAMP_INMATES
No data for:  15.08.2022
----
26.06.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
Issues with cleaning and combining
----
RELIEF_CAMP_INMATES
----
13.08.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  13.08.2022
----
RELIEF_CAMP_INMATES
No data for:  13.08.2022
----
14.07.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
Issue with table - Row header across multiple pages
----
14.06.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
Issue with table - Row header across multiple pages
----
RELIEF_CAMP_INMATES
----
19.09.2022
POPULATION_AND_CROP_AREA_AFFECTED
No data for:  19.09.2022
----
RELIEF_CAMPS_OPENED
No data for:  19.09.2022
----
RELIEF_CAMP_INMATES
No data for:  19.09.2022
----
10.06.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  10.06.2022
----
RELIEF_CAMP_INMATES
No data for:  10.06.2022
----
16.08.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  16.08.2022
----
RELIEF_CAMP_INMATES
No data for:  16.08.2022
----
04.06.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
20.05.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
04.08.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
30.05.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
Issue with table - Row header across multiple pages
----
RELIEF_CAMP_INMATES
----
21.08.2022
POPULATION_AND_CROP_AREA_AFFECTED
No data for:  21.08.2022
----
RELIEF_CAMPS_OPENED
No data for:  21.08.2022
----
RELIEF_CAMP_INMATES
No data for:  21.08.2022
----
18.08.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  18.08.2022
----
RELIEF_CAMP_INMATES
No data for:  18.08.2022
----
28.05.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
05.07.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
22.09.2022
POPULATION_AND_CROP_AREA_AFFECTED
No data for:  22.09.2022
----
RELIEF_CAMPS_OPENED
No data for:  22.09.2022
----
RELIEF_CAMP_INMATES
No data for:  22.09.2022
----
26.08.2022
POPULATION_AND_CROP_AREA_AFFECTED
No data for:  26.08.2022
----
RELIEF_CAMPS_OPENED
No data for:  26.08.2022
----
RELIEF_CAMP_INMATES
No data for:  26.08.2022
----
27.05.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
29.08.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
No data for:  29.08.2022
----
RELIEF_CAMPS_OPENED
No data for:  29.08.2022
----
RELIEF_CAMP_INMATES
No data for:  29.08.2022
----
12.06.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  12.06.2022
----
RELIEF_CAMP_INMATES
No data for:  12.06.2022
----
30.06.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
20.07.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
07.10.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  07.10.2022
----
RELIEF_CAMP_INMATES
No data for:  07.10.2022
----
13.06.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
No data for:  13.06.2022
----
15.07.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
11.07.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
14.10.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
No data for:  14.10.2022
----
24.06.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
22.07.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
24.09.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  24.09.2022
----
RELIEF_CAMP_INMATES
No data for:  24.09.2022
----
28.06.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
05.08.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
10.07.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
15.10.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
Issues with cleaning and combining
----
RELIEF_CAMP_INMATES
No data for:  15.10.2022
----
18.06.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
12.09.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
No data for:  12.09.2022
----
RELIEF_CAMPS_OPENED
No data for:  12.09.2022
----
RELIEF_CAMP_INMATES
No data for:  12.09.2022
----
17.06.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
13.10.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
No data for:  13.10.2022
----
25.05.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
14.08.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  14.08.2022
----
RELIEF_CAMP_INMATES
No data for:  14.08.2022
----
20.06.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
08.08.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  08.08.2022
----
RELIEF_CAMP_INMATES
No data for:  08.08.2022
----
13.07.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
Issues with cleaning and combining
----
RELIEF_CAMP_INMATES
----
08.07.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
16.05.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
16.07.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
29.07.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
19.10.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
No data for:  19.10.2022
----
RELIEF_CAMP_INMATES
No data for:  19.10.2022
----
04.07.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
22.05.2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FRIMS_INFRA_DAMAGES_DF_CLEANED['Date'] = date.replace('.','-')


POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----
16.06.2022
POPULATION_AND_CROP_AREA_AFFECTED
----
RELIEF_CAMPS_OPENED
----
RELIEF_CAMP_INMATES
----


In [271]:
issues_df = pd.DataFrame(issues_dates).T
issues_df.columns = folder_slug_dict.values()
issues_df.to_csv('ISSUES.csv',index=False)

In [283]:
scraped_files_daily = glob.glob(r'Data/Scraped Data/RELIEF_CAMP_INMATES/*.csv')

dfs = []
for file in scraped_files_daily:
    df = pd.read_csv(file)
    df = df.iloc[:-1,:]
    dfs.append(df)
    
DF = pd.concat(dfs)
DF['Date'] = pd.to_datetime(DF['Date'],format='%d-%m-%Y')
DF = DF.sort_values(by='Date')
DF.drop_duplicates().to_csv('Data/Cleaned Data/RELIEF_CAMP_INMATES_MASTER_2022.csv')