In [1]:
import os
import csv
from os import path
from bs4 import BeautifulSoup

In [2]:
# Directory that holds the input list files loaded with readfile() further down.
params_dir = 'parameter'
# Directory into which the per-parameter CSV files are written.
output_dir = 'panchayat_NREGA_data'

In [3]:
def readfile(filename):
    """Return every line of ``filename`` with surrounding whitespace removed.

    The file is opened in text mode with the platform's default encoding.
    Blank lines are kept and come back as empty strings.
    """
    with open(filename) as handle:
        return [raw_line.strip() for raw_line in handle]

def checkfile(filename):
    """Tell whether ``filename`` refers to an existing path (file or directory)."""
    found = path.exists(filename)
    return found

In [4]:
# Names of the input list files; each is joined onto params_dir before reading.
# The loaded lists are indexed with the same position i in the processing loop.

# One entry per line, used as the report URL.
hyperlink_file = "hyperlink.txt"
# One report code per line; used as the sub-directory under "html" and as the CSV name prefix.
filenames_file = "savefiles.txt"
# Loaded into basehtml_lines; not referenced by the code visible in this notebook section.
basenames_file = "base.txt"
# One line per report, holding comma-separated parameter names.
parameter_file = "parameters.txt"

In [5]:
# Year labels "2014-2015" through "2020-2021"; they double as directory names
# under each report folder and as the CSV column headers.
years = ["{}-{}".format(first, first + 1) for first in range(2014, 2021)]

# Load the input lists from the parameter directory.
hyperlink_lines = readfile(os.path.join(params_dir, hyperlink_file))
parameter_lines = readfile(os.path.join(params_dir, parameter_file))
savedhtml_lines = readfile(os.path.join(params_dir, filenames_file))
basehtml_lines = readfile(os.path.join(params_dir, basenames_file))

except_list = list()

In [None]:
# Helper methods
def printDict(dc):
    """Print each ``(key, value)`` pair of ``dc`` as a tuple, one pair per line."""
    for key, value in dc.items():
        print((key, value))
        
def getnthchild(ini, n):
    """Step ``int(n)`` times from ``ini`` to the next ``td`` sibling and return the result.

    ``n`` may be anything ``int()`` accepts. With a count of zero (or less)
    ``ini`` itself is returned. Whatever ``find_next_sibling`` yields on the
    last step is returned as-is.
    """
    node = ini
    remaining = int(n)
    while remaining > 0:
        node = node.find_next_sibling("td")
        remaining -= 1
    return node

def readfile(filename):
    """Read ``filename`` and return its lines stripped of surrounding whitespace.

    NOTE: this repeats the ``readfile`` defined in an earlier cell; running
    this cell simply rebinds the name to an equivalent function.
    """
    with open(filename) as source:
        return list(map(str.strip, source))

def checkfile(filename):
    """Return ``True`` when ``filename`` exists on disk, ``False`` otherwise.

    NOTE: this repeats the ``checkfile`` defined in an earlier cell; running
    this cell simply rebinds the name to an equivalent function.
    """
    exists = path.exists(filename)
    return exists

def checkName(s):
    """Return ``True`` when ``s`` contains only alphabetic and whitespace characters.

    An empty string passes, since there is no offending character in it.
    """
    for ch in s:
        if not (ch.isalpha() or ch.isspace()):
            return False
    return True

def dict_to_list(d, year):
    """Flatten a per-panchayat mapping into CSV-ready rows.

    Args:
        d: Mapping whose keys are indexable with positions 0-3 (state,
            district, block, panchayat) and whose values map a year label
            to a number or numeric string.
        year: List of year labels; it is appended to the four name columns
            to form the header and fixes the column order.

    Returns:
        A list of rows. The first row is the header; each following row is
        the four key fields followed by one float per year, with ``0.0``
        where the year is absent for that key.
    """
    rows = [["state", "district", "block", "panchayat"] + year]
    for names, per_year in d.items():
        row = [names[0], names[1], names[2], names[3]]
        row.extend(float(per_year[label]) if label in per_year else 0.0 for label in year)
        rows.append(row)
    return rows

Parsing functions: a separate method extracts the values for each report parameter (R 1.1.1, R 14.1, R 5.1.1 and R 7.2.1).

In [6]:
# R 1.1.1
def getPanchayatTup111(fn, start_row):
    """Parse a saved R 1.1.1 block page into one record per table row.

    Args:
        fn: Path of the saved HTML file.
        start_row: Index into the page's list of ``<tr>`` elements at which
            the rows to read begin.

    Returns:
        A list of lists ``[state, district, block, panchayat, value, ...]``.
        The four names are lower-cased; the values are the stripped text of
        every cell from the third ``<td>`` onwards, kept as strings. An
        empty list is returned when the page heading cannot be read.
    """
    # Parse inside a with-block so the file handle is closed right away;
    # this function is called once per block file, so leaked handles add up.
    with open(fn) as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')
    try:
        ele = soup.find("span", {"id": "ContentPlaceHolder1_lbl_head"}).text
        # The heading is colon-separated; each piece still carries the label
        # of the next field, which is stripped off here.
        pieces = ele.split(":")
        state_name = pieces[1].replace("District", "").strip().lower()
        district_name = pieces[2].replace("Block", "").strip().lower()
        block_name = pieces[3].replace("Panchayat", "").strip().lower()
    except Exception as e:
        print("Exception", e)
        return []
    # The last two <tr> elements are left out (presumably total/footer rows
    # of this report layout - TODO confirm against a saved page).
    panchayats = soup.find_all("tr")[start_row:-2]
    res = []
    for panchayat in panchayats:
        try:
            # The panchayat name sits in the second cell of the row.
            panchayat_element = panchayat.find('td').find_next_sibling("td")
            panchayat_name = panchayat_element.text.strip().lower()
            ls = [state_name, district_name, block_name, panchayat_name]
            ls.extend(td_element.text.strip() for td_element in panchayat.find_all('td')[2:])
            res.append(ls)
        except Exception as e:
            # Best effort: report the malformed row and carry on with the rest.
            print("Exception", e)
    return res

# R 14.1
def getPanchayatTup141(fn, start_row):
    """Parse a saved R 14.1 block page into one record per table row.

    Args:
        fn: Path of the saved HTML file.
        start_row: Index into the page's list of ``<tr>`` elements at which
            the rows to read begin.

    Returns:
        A list of lists ``[state, district, block, panchayat, value, ...]``.
        The four names are lower-cased; the values are the stripped text of
        every cell from the third ``<td>`` onwards, kept as strings. An
        empty list is returned when the page heading cannot be read.
    """
    # Parse inside a with-block so the file handle is closed right away;
    # this function is called once per block file, so leaked handles add up.
    with open(fn) as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')
    try:
        ele = soup.find("span", {"id": "ContentPlaceHolder1_lbl_head"}).text
        # The heading is colon-separated; each piece still carries the label
        # of the next field, which is stripped off here.
        pieces = ele.split(":")
        state_name = pieces[1].replace("District", "").strip().lower()
        district_name = pieces[2].replace("Block", "").strip().lower()
        block_name = pieces[3].replace("Panchayat", "").strip().lower()
    except Exception as e:
        print("Exception", e)
        return []
    # The final <tr> is left out (presumably a total/footer row of this
    # report layout - TODO confirm against a saved page).
    panchayats = soup.find_all("tr")[start_row:-1]
    res = []
    for panchayat in panchayats:
        try:
            # The panchayat name sits in the second cell of the row.
            panchayat_element = panchayat.find('td').find_next_sibling("td")
            panchayat_name = panchayat_element.text.strip().lower()
            ls = [state_name, district_name, block_name, panchayat_name]
            ls.extend(td_element.text.strip() for td_element in panchayat.find_all('td')[2:])
            res.append(ls)
        except Exception as e:
            # Best effort: report the malformed row and carry on with the rest.
            print("Exception", e)
    return res


# R 5.1.1
def getPanchayatTup511(fn, start_row):
    """Parse a saved R 5.1.1 block page into one numeric record per table row.

    Args:
        fn: Path of the saved HTML file.
        start_row: Index into the page's list of ``<tr>`` elements at which
            the rows to read begin.

    Returns:
        A list of lists ``[state, district, block, panchayat, value, ...]``.
        The four names are lower-cased; every cell from the third ``<td>``
        onwards is converted with ``float``. A row containing a cell that
        is not a valid float is reported and skipped. An empty list is
        returned when the page heading cannot be read.
    """
    # Parse inside a with-block so the file handle is closed right away;
    # this function is called once per block file, so leaked handles add up.
    with open(fn) as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')
    try:
        ele = soup.find("span", {"id": "ContentPlaceHolder1_lbl_head"}).text
        # The heading is colon-separated; each piece still carries the label
        # of the next field, which is stripped off here.
        pieces = ele.split(":")
        state_name = pieces[1].replace("District", "").strip().lower()
        district_name = pieces[2].replace("Block", "").strip().lower()
        block_name = pieces[3].replace("Panchayat", "").strip().lower()
    except Exception as e:
        print("Exception in getting names", e)
        return []
    # The final <tr> is left out (presumably a total/footer row of this
    # report layout - TODO confirm against a saved page).
    panchayats = soup.find_all("tr")[start_row:-1]
    res = []
    for panchayat in panchayats:
        try:
            # The panchayat name sits in the second cell of the row.
            panchayat_element = panchayat.find('td').find_next_sibling("td")
            panchayat_name = panchayat_element.text.strip().lower()
            ls = [state_name, district_name, block_name, panchayat_name]
            ls.extend(float(td_element.text.strip()) for td_element in panchayat.find_all('td')[2:])
            res.append(ls)
        except Exception as e:
            # Best effort: report the malformed row and carry on with the rest.
            print("Exception above 1: ", e)
    return res


# R 7.2.1
def getPanchayatTup721(fn, start_row):
    """Parse a saved R 7.2.1 block page into one numeric record per table row.

    Args:
        fn: Path of the saved HTML file.
        start_row: Index into the page's list of ``<tr>`` elements at which
            the rows to read begin.

    Returns:
        A list of lists ``[state, district, block, panchayat, value, ...]``.
        The four names are lower-cased; every cell from the third ``<td>``
        onwards is converted with ``float``. A row containing a cell that
        is not a valid float is reported and skipped. An empty list is
        returned when the page heading cannot be read.
    """
    # Parse inside a with-block so the file handle is closed right away;
    # this function is called once per block file, so leaked handles add up.
    with open(fn) as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')
    try:
        ele = soup.find("span", {"id": "ContentPlaceHolder1_lbl_head"}).text
        # The heading is colon-separated; each piece still carries the label
        # of the next field, which is stripped off here.
        pieces = ele.split(":")
        state_name = pieces[1].replace("District", "").strip().lower()
        district_name = pieces[2].replace("Block", "").strip().lower()
        block_name = pieces[3].replace("Panchayat", "").strip().lower()
    except Exception as e:
        print("Exception", e)
        return []
    # The final <tr> is left out (presumably a total/footer row of this
    # report layout - TODO confirm against a saved page).
    panchayats = soup.find_all("tr")[start_row:-1]
    res = []
    for panchayat in panchayats:
        try:
            # The panchayat name sits in the second cell of the row.
            panchayat_element = panchayat.find('td').find_next_sibling("td")
            panchayat_name = panchayat_element.text.strip().lower()
            ls = [state_name, district_name, block_name, panchayat_name]
            ls.extend(float(td_element.text.strip()) for td_element in panchayat.find_all('td')[2:])
            res.append(ls)
        except Exception as e:
            # Best effort: report the malformed row and carry on with the rest.
            print("Exception", e)
    return res

In [8]:
# Walks html/<report>/<year>/<state>/<district>/<block file>, parses every
# block file and writes one CSV per (report, parameter) with one column per year.
dict_list = []

# Parser function and the start_row handed to it, per report code. Any code
# not listed here is parsed with the R 7.2.1 parser.
parsers = {
    'R111': (getPanchayatTup111, 10),
    'R141': (getPanchayatTup141, 10),
    'R511': (getPanchayatTup511, 9),
}
default_parser = (getPanchayatTup721, 5)

# Create the CSV destination up front so a missing folder cannot make the
# write step fail after all the parsing has already been done.
os.makedirs(output_dir, exist_ok=True)

for i in range(len(hyperlink_lines)):

    # Getting the url (kept for reference; the parsing below works on saved files only)
    url = hyperlink_lines[i]

    # Getting the parameter name
    param = savedhtml_lines[i]
    parameters = parameter_lines[i].split(",")
    num_parameters = len(parameters)

    # Initializing the dictionary list: one dict per parameter, each mapping
    # (state, district, block, panchayat) -> {year: value}
    dict_list = [dict() for _ in range(len(parameters))]

    # Getting the parameter directory
    param_dir = path.join("html", param)
    print(param)

    # If the directory for param doesn't exist there is nothing to parse;
    # skip this report instead of failing later in os.listdir.
    if not path.exists(param_dir):
        print("Param directory ", param_dir, " does not exist.")
        continue

    # Iterating through the years
    for year in years:
        # Checking if the directory for year exists; skip the year if not
        year_dir = path.join(param_dir, year)
        if not path.exists(year_dir):
            print("Year directory ", year_dir, " does not exist.")
            continue

        year_url = url + year

        # Getting the states from the dir
        states = os.listdir(year_dir)

        # Iterate through the states
        for state in states:
            try:
                print("State: ", state)
                state_dir = os.path.join(year_dir, state)

                # Getting the districts from the state directory
                districts = os.listdir(state_dir)

                # Iterating over each of the districts
                for district in districts:
                    try:
                        district_dir = os.path.join(state_dir, district)

                        # Getting the blocks from the district directory
                        blocks = os.listdir(district_dir)

                        # Iterating over each of the blocks
                        for block in blocks:
                            # Bound before the try so the handler below can always print it
                            panchayatList = []
                            try:
                                # Some were having empty names with just .html, skipped those as those were corrupted files
                                if block == ".html":
                                    continue

                                # Getting the block file
                                block_file = os.path.join(district_dir, block)

                                # Based on the parameter, parsing the file through the respective method,
                                # which returns a list containing panchayats with metadata
                                parse_block, start_row = parsers.get(param, default_parser)
                                panchayatList = parse_block(block_file, start_row)

                                # Iterating over the panchayats in the list
                                for panchayat in panchayatList:
                                    # The first 4 entries are (state, district, block, panchayat); they are used as the key only.
                                    key = (panchayat[0], panchayat[1], panchayat[2], panchayat[3])
                                    for k in range(0, len(panchayat) - 4):
                                        # storing the values
                                        dict_list[k].setdefault(key, dict())[year] = panchayat[k + 4]

                            except Exception as e:
                                print("Exception below 1: ", e, " Block name: ", os.path.join(district_dir, block), " Panchayat list: ", panchayatList)
                    except Exception as e:
                        print("Exception below 2: ", e)
            except Exception as e:
                print("Exception below 3: ", e)

    # Creating the csv for the parameters
    for k in range(len(dict_list)):
        csv_name = path.join(output_dir, savedhtml_lines[i] + "_" + parameters[k] + ".csv")
        trr = dict_to_list(dict_list[k], years)
        with open(csv_name, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerows(trr)


R111
State:  BIHAR
State:  RAJASTHAN
State:  KARNATAKA
State:  GOA
State:  MEGHALAYA
State:  ASSAM
State:  DAMAN & DIU
State:  MIZORAM
State:  TELANGANA
State:  JHARKHAND
State:  JAMMU AND KASHMIR
State:  ODISHA
State:  WEST BENGAL
State:  TRIPURA
State:  HARYANA
State:  PUDUCHERRY
State:  GUJARAT
State:  MADHYA PRADESH
State:  KERALA
State:  HIMACHAL PRADESH
State:  ANDAMAN AND NICOBAR
State:  MANIPUR
State:  UTTAR PRADESH
State:  PUNJAB
State:  NAGALAND
State:  ANDHRA PRADESH
State:  SIKKIM
State:  MAHARASHTRA
State:  LAKSHADWEEP
State:  LADAKH
State:  ARUNACHAL PRADESH
State:  DN HAVELI AND DD
State:  UTTARAKHAND
State:  TAMIL NADU
State:  CHHATTISGARH
Exception 'NoneType' object has no attribute 'text'
Exception 'NoneType' object has no attribute 'text'
Exception 'NoneType' object has no attribute 'text'
State:  BIHAR
State:  RAJASTHAN
State:  KARNATAKA
State:  GOA
State:  MEGHALAYA
State:  ASSAM
State:  DAMAN & DIU
State:  MIZORAM
State:  TELANGANA
State:  JHARKHAND
State:  JAMMU 