In [1]:
import pygsheets
import re
import pandas as pd
from fuzzywuzzy import fuzz
import nltk
import numpy as np
import geopandas as gpd
from difflib import SequenceMatcher
import dateutil.parser



In [2]:
google_client = pygsheets.authorize(service_file='idea-frm-500603810b1a.json')

In [3]:
#A function that gives matching score between two strings a and b. Higher the score,better the match
def similar(row_of_df):
    '''
    :param row_of_df: A row from a pandas dataframe that contains strings that have to be compared
    
    :return: Matching score between two strings. Higher the score,better the match
    '''
    return SequenceMatcher(None, row_of_df[0], row_of_df[1]).ratio()

In [4]:
ASSAM_VILLAGES = gpd.read_file(r'D:/Projects/assam-tender-scraper/Assam_shapefiles/assam_village_complete_with_revenueCircle_district_35_oct2022.geojson',driver='GeoJSON')
ASSAM_VILLAGES_POINT = gpd.read_file('D:/Projects/assam-tender-scraper/Assam_shapefiles/census_village_point_assam.geojson', driver='GeoJSON')
ASSAM_CITIES = gpd.read_file('D:/Projects/assam-tender-scraper/Assam_shapefiles/assam_city.geojson',driver='GeoJSON')

In [5]:
ASSAM_VILLAGES.shape

(21026, 9)

In [6]:
ASSAM_VILLAGES.groupby(['district','sdtname_2','VILNAM_SOI'])[['OBJECTID']].count().reset_index().sort_values(by='OBJECTID',ascending=False)[404:]

Unnamed: 0,district,sdtname_2,VILNAM_SOI,OBJECTID
17937,SONITPUR,Tezpur,BARUADALANI PATHAR,2
14010,LAKHIMPUR,Narayanpur,MATIA NO 2,1
13990,LAKHIMPUR,Narayanpur,KAMARBARI,1
13732,LAKHIMPUR,Kadam,SARU DIRJU,1
14006,LAKHIMPUR,Narayanpur,MARASUTI NO 1,1
...,...,...,...,...
6908,DIBRUGARH,Moran,NAOKATAGAON,1
6907,DIBRUGARH,Moran,NAMDANG RF,1
6906,DIBRUGARH,Moran,NALANIDOLONI NO 2,1
6905,DIBRUGARH,Moran,NAKHAL SONOWALGAON,1


## Raw Dataset

In [7]:
input_df = pd.read_csv('InputData/consolidated_v2.csv')
input_df = input_df.drop_duplicates()
input_df.shape

(44615, 29)

In [8]:
input_df.isnull().sum()

Tender ID                                0
tender_externalreference                 0
tender_title                             0
Work Description                         0
Tender Category                          0
Tender Type                              0
Form of Contract                         0
Product Category                         0
Fiscal Year                              0
Is Multi Currency Allowed For BOQ        0
Two Stage Tender (Y/N)                   0
Independent External Monitor         44131
Published Date                           0
PreBid Meeting Date                  31729
Tender Validity in Days                  0
NDA Tender (Y/N)                         0
Preferential Bidding allowed             0
Payment Mode                             0
Bid Opening Date                         0
Organisation Chain                       0
Department                               0
location                                 0
Pincode                                  0
No of Bids 

# Table of Contents


1. [De-Duplication](#deduplication)
2. [Identify flood related tenders](#floodrelated)
3. [Identify Season of tenders](#season)
4. [Identify Scheme related information](#scheme)
5. [Identify Response Type of the tender](#response)
6. [GEOCODING: Find District of the tender](#district)
7. [GEOCODING: Find Revenue circle, block and village of tender](#revcircle)

## De-Duplication <a class="anchor" id="deduplication"></a>

In [11]:
tender_ids = input_df["Tender ID"]
duplicates_df = input_df[tender_ids.isin(tender_ids[tender_ids.duplicated()])].sort_values("Tender ID")
input_df = input_df.drop(duplicates_df[duplicates_df['No of Bids Received'].isnull()].index)
input_df.reset_index(drop=True, inplace=True)

#Not an ideal way to drop remaining duplicate tenders, need discussion with OCI team
deduped_df = input_df.drop_duplicates(subset=['Tender ID'],keep='last')

In [12]:
deduped_df.to_csv("OutputData/deduped_master_tender_list.csv", encoding='utf-8')

## Identify flood related tenders using Keywords  <a class="anchor" id="floodrelated"></a>

In [13]:
def populate_keyword_dict(keyword_list): 
    keywords_dict = {}
    for keyword in keyword_list:
        keywords_dict[keyword] = 0
    return keywords_dict

In [14]:
def flood_filter(row):
    '''
    :param row: row of the dataframe that contains tender title, work description
    
    :return: Tuple of (is_flood_tender, positive_kw_dict, negative_kw_dict) for every row
    '''
    positive_keywords_dict = populate_keyword_dict(POSITIVE_KEYWORDS)
    negative_keywords_dict = populate_keyword_dict(NEGATIVE_KEYWORDS)
    tender_slug = str(row['tender_externalreference']) + ' ' + str(row['tender_title']) + ' ' + str(row['Work Description'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    
    is_flood_tender = False
    for keyword in POSITIVE_KEYWORDS:
        keyword_count = len(re.findall(r"\b%s\b" % keyword.lower(), tender_slug.lower()))
        positive_keywords_dict[keyword] = keyword_count
        if keyword_count > 0:
            is_flood_tender = True
            
    for keyword in NEGATIVE_KEYWORDS:
        keyword_count = len(re.findall(r"\b%s\b" % keyword.lower(), tender_slug.lower()))
        negative_keywords_dict[keyword] = keyword_count
        if keyword_count > 0:
            is_flood_tender = False
           
    return str(is_flood_tender), str(positive_keywords_dict), str(negative_keywords_dict)

In [15]:
#Flood Keywords
global POSITIVE_KEYWORDS
POSITIVE_KEYWORDS = ['Flood', 'Embankment', 'embkt', 'Relief', 'Erosion', 'SDRF', 'Inundation', 'Hydrology',
                   'Silt', 'Siltation', 'Bund', 'Trench', 'Breach', 'Culvert', 'Sluice', 'Dyke',
                    #'Bridge', 'Road', "River","Drain",
                   'Storm water drain','Emergency','Immediate', 'IM', 'AE','A E', 'AAPDA MITRA']
global NEGATIVE_KEYWORDS
NEGATIVE_KEYWORDS = ['Floodlight', 'Flood Light','GAS', 'FIFA', 'pipe','pipes', 'covid']

In [16]:
flood_filter_tuples = deduped_df.apply(flood_filter,axis=1)
deduped_df.loc[:,'is_flood_tender'] = [var[0] for var in list(flood_filter_tuples)]
deduped_df.loc[:,'positive_keywords_dict'] = [var[1] for var in list(flood_filter_tuples)]
deduped_df.loc[:,'negative_keywords_dict'] = [var[2] for var in list(flood_filter_tuples)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [17]:
# Removing tenders from certain departments that are not related to flood management.
idea_frm_tenders_df = deduped_df[(deduped_df.is_flood_tender=='True')&
                                 (~deduped_df.Department.isin(["Directorate of Agriculture and Assam Seed Corporation","Department of Handloom Textile and Sericulture"]))]

In [18]:
idea_frm_tenders_df.shape

(4396, 32)

## Classification of Tenders based on Monsoon <a class="anchor" id="season"></a>

In [19]:
for index, row in idea_frm_tenders_df.iterrows():
    monsoon = "" 
    published_date = dateutil.parser.parse(row['Published Date'])
    if 1 <= published_date.month <= 5:
        monsoon = "Pre-Monsoon"
        if published_date.month == 5 and published_date.day > 14:
            monsoon = "Monsoon"
    elif 6 <= published_date.month <= 10:
        monsoon = "Monsoon"
        if published_date.month == 10 and published_date.day > 14:
            monsoon = "Post-Monsoon"
    else:
        monsoon = "Post-Monsoon"
    idea_frm_tenders_df.loc[index, "Season"] = monsoon

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


## Schemes <a class="anchor" id="scheme"></a>

In [20]:
def extract_schemes(df,scheme_kw):
    '''
    :param df: Dataframe from which information has to be extracted.
    :param scheme_kw: Set of Schemes in lower case.
    
    :return schemes_identified: List of schemes identified for each row.
    '''
    schemes_identified = []
    
    for idx, row in df.iterrows():
        tender_slug = row['tender_title']+' '+row['tender_externalreference']+' '+row['Work Description']
        tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug).lower()

        tender_slug = set(re.split(r'[-.,()_\s/]\s*',tender_slug))
        try:
            schemes_identified.append(list(tender_slug & scheme_kw)[0].upper())
        except:
            schemes_identified.append('')
            
    return schemes_identified


In [21]:
idea_frm_tenders_df.loc[:,'Scheme'] = extract_schemes(idea_frm_tenders_df,{'ridf','sdrf','sopd','cidf','ltif'})

## Classification of Tenders based on Response type <a class="anchor" id="response"></a>

In [22]:
IMMEDIATE_MEASURES_KEYWORDS = ['sdrf','im','i/m','gr','g/r','relief','package','pkt','immediate']
PREPAREDNESS_MEASURES_KEYWORDS = ['protection','new', 'reconstruction', 'constn' ,'recoupment',
                                  'restoration', 'embankment', 'embkt',
                      'dyke','culvert','storm water', 'drainage','drain',
                                  'drains','box','rcc','silt','desiltation','prosiltation','anti erosion',
                      'erosion','a/e','ae','a e','bank protection','bank breach',
                                  'breach','sludging','desludging','sluice','bund','bundh',
                      'dam','canal','road','roads','bridge','bridges','data','drone','rescue',
                                  'consultation','advisory','consult','study']

In [23]:
for index, row in idea_frm_tenders_df.iterrows():
    immedidate_measures_dict = populate_keyword_dict(IMMEDIATE_MEASURES_KEYWORDS)
    preparedness_measures_dict = populate_keyword_dict(PREPAREDNESS_MEASURES_KEYWORDS)
    response_type = "Others"
    tender_slug = str(row['tender_externalreference']) + ' ' + str(row['tender_title']) + ' ' + str(row['Work Description'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    
    for keyword in immedidate_measures_dict:
        keyword_count = len(re.findall(r"\b%s\b" % keyword.lower(), tender_slug.lower()))
        immedidate_measures_dict[keyword] = keyword_count
        if not keyword_count:
            immedidate_measures_dict[keyword] =  False
        else:
            response_type = "Immediate Measures"
    
    for keyword in preparedness_measures_dict:
        keyword_count = len(re.findall(r"\b%s\b" % keyword.lower(), tender_slug.lower()))
        preparedness_measures_dict[keyword] = keyword_count
        if not keyword_count:
            preparedness_measures_dict[keyword] =  False
        elif response_type == "Others":
            response_type = "Preparedness Measures"
    idea_frm_tenders_df.loc[index, "Response Type"] = response_type
    
    if response_type == "Immediate Measures":
        sub_head_dict = {k: v for k, v in immedidate_measures_dict.items() if v is not False}
        idea_frm_tenders_df.loc[index, "Flood Response - Subhead"] = str(sub_head_dict)
    elif response_type == "Preparedness Measures":
        sub_head_dict = {k: v for k, v in preparedness_measures_dict.items() if v is not False}
        idea_frm_tenders_df.loc[index, "Flood Response - Subhead"] = str(sub_head_dict)  

In [24]:
idea_frm_tenders_df.to_csv("OutputData/IDEA-FRM_filtered_tenders_with_metadata.csv", encoding='utf-8')

## GEO-CODING TENDERS - Find Districts <a class="anchor" id="district"></a>

### Clean geospatial data

In [None]:
idea_frm_tenders_df = pd.read_csv('OutputData/IDEA-FRM_filtered_tenders_with_metadata.csv')
assam_villages = ASSAM_VILLAGES["VILNAM_SOI"]
village_duplicates_df = ASSAM_VILLAGES[assam_villages.isin(assam_villages[assam_villages.duplicated()])].sort_values("VILNAM_SOI")

In [42]:
VILLAGE_CORRECTION_DICT = {
    "SOKARBILA(BOLGARBARI)(DARIAPAR" : "SOKARBILA(BOLGARBARI)(DARIAPAR)",
    "MANGALDAI EXTENDED TOWN (BHEBA" : "MANGALDAI EXTENDED TOWN (BHEBA)",
    "UPPER DIHING R.F. (SOUTH BLOCK" : "UPPER DIHING R.F. (SOUTH BLOCK)",
    "KACHARI MAITHCHAGAON NO.1(BAR" : "KACHARI MAITHCHAGAON NO.1(BAR)",
}

In [43]:
ASSAM_VILLAGES.revenue_ci = ASSAM_VILLAGES.revenue_ci.str.replace('\(Pt\)','')
ASSAM_VILLAGES.revenue_ci = ASSAM_VILLAGES.revenue_ci.str.replace('\(Pt-I\)','')
ASSAM_VILLAGES.revenue_ci = ASSAM_VILLAGES.revenue_ci.str.replace('\(Pt-II\)','')
ASSAM_VILLAGES.revenue_ci = ASSAM_VILLAGES.revenue_ci.str.replace('\n',' ')
ASSAM_VILLAGES.revenue_ci = ASSAM_VILLAGES.revenue_ci.str.strip()

  ASSAM_VILLAGES.revenue_ci = ASSAM_VILLAGES.revenue_ci.str.replace('\(Pt\)','')
  ASSAM_VILLAGES.revenue_ci = ASSAM_VILLAGES.revenue_ci.str.replace('\(Pt-I\)','')
  ASSAM_VILLAGES.revenue_ci = ASSAM_VILLAGES.revenue_ci.str.replace('\(Pt-II\)','')


In [None]:
locations = []
for idx, row in idea_frm_tenders_df.iterrows():
    LOCATION = row['location'].lower()
    LOCATION = LOCATION.replace('village','')
    LOCATION = LOCATION.replace('district','')
    LOCATION = LOCATION.replace('dist','')
    LOCATION = re.sub('[^a-zA-Z\n\.]', ' ', LOCATION)
    scores = []
    for revenue_circle in ASSAM_VILLAGES.revenue_ci.dropna().unique():
        score = SequenceMatcher(None, LOCATION, revenue_circle.lower().strip()).ratio()
        scores.append(score)
    if max(scores)>0.8:
        locations.append(ASSAM_VILLAGES.revenue_ci.dropna().unique()[scores.index(max(scores))])
    else:
        locations.append(row['location'])

In [None]:
idea_frm_tenders_df.location = locations
idea_frm_tenders_df.to_csv("OutputData/IDEA-FRM_filtered_tenders_with_metadata_LOCATIONSTD.csv", encoding='utf-8')

In [44]:
ASSAM_VILLAGES.sdtname_2 = ASSAM_VILLAGES.sdtname_2.str.replace('\(Pt\)','')
ASSAM_VILLAGES.sdtname_2 = ASSAM_VILLAGES.sdtname_2.str.replace('\(Pt-I\)','')
ASSAM_VILLAGES.sdtname_2 = ASSAM_VILLAGES.sdtname_2.str.replace('\(Pt-II\)','')

  ASSAM_VILLAGES.sdtname_2 = ASSAM_VILLAGES.sdtname_2.str.replace('\(Pt\)','')
  ASSAM_VILLAGES.sdtname_2 = ASSAM_VILLAGES.sdtname_2.str.replace('\(Pt-I\)','')
  ASSAM_VILLAGES.sdtname_2 = ASSAM_VILLAGES.sdtname_2.str.replace('\(Pt-II\)','')


In [45]:
rev_circles = ASSAM_VILLAGES[["revenue_ci",'district_2']].drop_duplicates().dropna()
#These revenue circles are across multiple districts
problematic_rev_circles = rev_circles[rev_circles.duplicated(['revenue_ci'],keep=False)].sort_values('revenue_ci')
problematic_rev_circles.revenue_ci.unique()

array(['Baganpara', 'Bagribari', 'Bajali', 'Barnagar', 'Chapar',
       'Dalgaon', 'Dhakuakhana', 'Dhekiajuli', 'Dhubri', 'Ghograpar',
       'Golokganj', 'Gossaigaon', 'Jalah', 'Khoirabari', 'Kokrajhar',
       'Lakhipur', 'Mangaldoi', 'North Guwahati', 'Pathorighat', 'Rangia',
       'Sarupeta', 'Sidli', 'Subansiri'], dtype=object)

In [46]:
ASSAM_VILLAGES.district_2.unique()
#KOKRAJHAR, DHUBRI, BAJALI -- these districts may match faultly with the problematic revenue circles.
#KAMRUP and KAMRUP METRO can get mismatched

array(['KOKRAJHAR', 'DHUBRI', 'GOALPARA', 'BARPETA', 'BAJALI', 'MORIGAON',
       'NAGAON', 'SONITPUR', 'LAKHIMPUR', 'DHEMAJI', 'TINSUKIA',
       'DIBRUGARH', 'SIVSAGAR', 'JORHAT', 'GOLAGHAT', 'K.ANGLONG',
       'DIMA HASAO', 'CACHAR', 'KARIMGANJ', 'HAILAKANDI', 'BONGAIGAON',
       'CHIRANG', 'KAMRUP', 'KAMRUP METRO', 'NALBARI', 'BAKSA',
       'TAMULPUR', 'DARRANG', 'UDALGURI', 'CHARAIDEO', 'BISWANATH',
       'HOJAI', 'WEST KARBI ANGLONG', 'SOUTH SALMARA MANCACHAR', 'MAJULI',
       None], dtype=object)

In [47]:
sdts= ASSAM_VILLAGES[["sdtname_2",'district_2']].drop_duplicates().dropna()

#These revenue circles are across multiple districts
problematic_sdts = sdts[sdts.duplicated(['sdtname_2'],keep=False)].sort_values('sdtname_2')
problematic_sdts.sdtname_2.unique()

array(['Baganpara ', 'Bagribari ', 'Bajali ', 'Banekuchi', 'Barnagar ',
       'Bengtol', 'Bijni ', 'Chandrapur', 'Chapar ', 'Dalgaon ',
       'Dhakuakhana ', 'Dhekiajuli ', 'Dhubri ', 'Golokganj ',
       'Goreswar ', 'Gossaigaon ', 'Jalah ', 'Jorhat West', 'Kaliabor',
       'Khoirabari ', 'Kokrajhar ', 'Lakhipur', 'Mangaldoi ', 'Palasbari',
       'Pathorighat ', 'Sarupeta ', 'Sidli ', 'Subansiri '], dtype=object)

### FIND DISTRICTS OF TENDERS:
1. MAKE A DICTIONARY OF ONLY NON-REPEATED REVENUE CIRCLES, SUB-DISTRICTS, BLOCKS and VILLAGES MAPPED TO THEIR DISTRICTS
2. FORCE FIT DUPLICATE REVENUE CIRCLES IN DISTRICTS BASED ON CONTEXTUAL KNOWLEDGE
3. MAKE LIST OF DISTRICTS, REVENUE CIRCLES, SUB-DISTRICTS, BLOCKS and VILLAGES WITH NON-REPEATING NAMES
4. CREATE A DICTIONARY OF DISTRICT IDENTIFIERS FROM externalReference COLUMN

#### METHOD-1 to find district
1. FIND THE DISTRICT NAME PRESENT IN TENDER SLUG. CUSTOMISE HOW THE TENDER SLUG IS NEEDED. 
2. FIND THE REVENUE CIRCLE PRESENT IN TENDER SLUG. USE THE DICTIONARIES CREATED TO FIND THE DISTRICT. 
3. FIND THE DISTRICT USING DISTRICT IDENTIFIERS DICTIONARY.
4. FIND THE SUB-DISTRICT PRESENT IN TENDER SLUG. USE THE DICTIONARIES CREATED TO FIND THE DISTRICT.
5. FIND THE BLOCK PRESENT IN TENDER SLUG. USE THE DICTIONARIES CREATED TO FIND THE DISTRICT.
6. FIND THE VILLAGE PRESENT IN TENDER SLUG. USE THE DICTIONARIES CREATED TO FIND THE DISTRICT.

#### METHOD-2 to find district (Weightage Method)
1. GET TENDER DISTRICT BASED ON externalReference COLUMN
2. GET TENDER DISTRICT BASED ON TITLE AND WORK DESCRIPTION
3. GET TENDER DISTRICT BASED ON LOCATION COLUMN
4. BTC FLAG
5. WEIGHTAGE LOGIC

In [None]:
# UNUSED FUNCTION - WILL BE USEFUL IF PARTIAL MATCH IS ALLOWED.
def re_partial_find(word_to_match,string,match_score_threshold=0.85):
    word_to_match = word_to_match.lower()
    word_to_match = word_to_match.strip()
    string = string.lower()
    string = re.sub('[^a-zA-Z0-9 \n\.]', ' ', string)
    string_list = string.split(' ')
    
    for substring in string_list:
        match_score = SequenceMatcher(None, word_to_match, substring).ratio()
        if match_score > match_score_threshold:
            return word_to_match
    return False

In [None]:
#MAKE A DICTIONARY OF ONLY NON-REPEATED REVENUE CIRCLES, SUB-DISTRICTS, BLOCKS and VILLAGES MAPPED TO THEIR DISTRICTS
assam_revenue_circles_dict = ASSAM_VILLAGES[['revenue_ci','district_2']].dropna().drop_duplicates().drop_duplicates(['revenue_ci'],keep=False).set_index('revenue_ci').to_dict(orient='index')
assam_subdist_dict = ASSAM_VILLAGES[['sdtname_2','district_2']].dropna().drop_duplicates().drop_duplicates(['sdtname_2'],keep=False).set_index('sdtname_2').to_dict(orient='index')

assam_blocks_dict = ASSAM_VILLAGES[['block_name','district_2']].dropna().drop_duplicates().drop_duplicates(['block_name'],keep=False).set_index('block_name').to_dict(orient='index')
assam_villages_dict = ASSAM_VILLAGES[['VILNAM_SOI','district_2']].drop_duplicates(['VILNAM_SOI'],keep=False).set_index('VILNAM_SOI').to_dict(orient='index')


In [None]:
#Force fit duplicate revenue circles in districts
assam_revenue_circles_dict['Baganpara']={'district_2': 'BAKSA'}
assam_revenue_circles_dict['Bagribri']={'district_2': 'DHUBRI'}
assam_revenue_circles_dict['Bajali']={'district_2': 'BAJALI'}
assam_revenue_circles_dict['Barnagar']={'district_2': 'BAKSA'}
assam_revenue_circles_dict['Chapar']={'district_2': 'DHUBRI'}
assam_revenue_circles_dict['Dalgaon']={'district_2': 'DARRANG'}
assam_revenue_circles_dict['Dhakuakhana']={'district_2': 'LAKHIMPUR'}
assam_revenue_circles_dict['Dhekiajuli']={'district_2': 'SONITPUR'}
assam_revenue_circles_dict['Dhubri']={'district_2': 'DHUBRI'}
assam_revenue_circles_dict['Ghograpar']={'district_2': 'NALBARI'}
assam_revenue_circles_dict['Golokganj']={'district_2': 'DHUBRI'}
assam_revenue_circles_dict['Gossaigaon']={'district_2': 'KOKRAJHAR'}
assam_revenue_circles_dict['Jalah']={'district_2': 'BAKSA'}
assam_revenue_circles_dict['Khoirabari']={'district_2': 'UDALGURI'}
assam_revenue_circles_dict['Kokrajhar']={'district_2': 'KOKRAJHAR'}
assam_revenue_circles_dict['Lakhipur']={'district_2': 'GOALPARA'}
assam_revenue_circles_dict['Mangaldoi']={'district_2': 'DARRANG'}
assam_revenue_circles_dict['Pathorighat']={'district_2': 'DARRANG'}
assam_revenue_circles_dict['Sarupeta']={'district_2': 'BAJALI'}
assam_revenue_circles_dict['Sidli']={'district_2': 'CHIRANG'}
assam_revenue_circles_dict['Subansiri']={'district_2': 'LAKHIMPUR'}
assam_revenue_circles_dict['Rangia']={'district_2': 'KAMRUP'}

In [None]:
#MAKE LIST OF DISTRICTS, REVENUE CIRCLES, SUB-DISTRICTS, BLOCKS and VILLAGES WITH NON-REPEATING NAMES
problematic_rev_circlesUPPERCASE = [] #Empty after forcefitted. #[rc.upper().strip() for rc in problematic_rev_circles.revenue_ci.unique()]
problematic_sdtsUPPERCASE = [sdt.upper().strip() for sdt in problematic_sdts.sdtname_2.unique()]
assam_villages = list(set(assam_villages_dict.keys())-set(problematic_rev_circlesUPPERCASE)-set(problematic_sdtsUPPERCASE))
assam_blocks = list(set(assam_blocks_dict.keys())-set(problematic_rev_circlesUPPERCASE)-set(problematic_sdtsUPPERCASE))
assam_districts = list(set(ASSAM_VILLAGES.district_2.dropna())-set(['KAMRUP','KAMRUP METRO']))

assam_revenue_circles = list(set(assam_revenue_circles_dict.keys()))#-set(problematic_rev_circles.revenue_ci.unique())-set(problematic_sdts.sdtname_2.unique()))
assam_sub_districts = list(set(assam_subdist_dict.keys())-set(problematic_rev_circles.revenue_ci.unique())-set(problematic_sdts.sdtname_2.unique()))

In [None]:
# CREATE A DICTIONARY OF DISTRICT IDENTIFIERS FROM externalReference COLUMN
three_letter_distirct_identifiers_dict = {"bak":"BAKSA", "baksa":"BAKSA",
                                          "bar":"BARPETA", "re-bar": "BARPETA", "barpeta":"BARPETA",
                                          "bongaigoan":"BONGAIGAON",
                                          "tez":"SONITPUR","re-tez":"SONITPUR","tezpur":"SONITPUR","tej":"SONITPUR","re-tej":"SONITPUR",
                                          "silchar":"CACHAR", "re-silchar":"CACHAR","resilchar":"CACHAR","re-sil(mech)":"CACHAR","silchar (mech)":"CACHAR","sil":"CACHAR","sil (mech)":"CACHAR","sil(mech)":"CACHAR",
                                          "dhubri":"DHUBRI", "dhu": "DHUBRI",
                                          "siv":"SIVSAGAR","sivsagar":"SIVSAGAR","re-siv":"SIVSAGAR","sivasagar":"SIVSAGAR",
                                          "chirang":"CHIRANG",
                                          "mang":"DARRANG","re-mang":"DARRANG","mangaldai":"DARRANG","mangaldoi":"DARRANG",
                                          "dhe":"DHEMAJI","dhemaji":"DHEMAJI","dmj":"DHEMAJI","redhemaji":"DHEMAJI",
                                          "hailakandi":"HAILAKANDI","hkd":"HAILAKANDI","re-hailakandi":"HAILAKANDI",
                                          "dib-west":"DIBRUGARH","dib":"DIBRUGARH","dibrugarh":"DIBRUGARH","redib":"DIBRUGARH",
                                          "dima-hasao":"DIMA HASAO","haf":"DIMA HASAO","haflong":"DIMA HASAO",
                                          "goalpara":"GOALPARA","GLP":"GOALPARA",
                                          "diphu":"K.ANGLONG","rediphu":"K.ANGLONG",
                                          "jor":"JORHAT","jorhat":"JORHAT",
                                          "nag":"NAGAON","re-nag":"NAGAON","nagaon":"NAGAON","hatimura":"NAGAON",
                                          "nal":"NALBARI","nalbari":"NALBARI",
                                          "morigaon":"MORIGAON","mor":"MORIGAON","re-mor":"MORIGAON",
                                          "maj":"MAJULI","re-maj":"MAJULI","maju":"MAJULI","majuli":"MAJULI",
                                          "n.lakhimpur":"LAKHIMPUR","dhakuakhana":"LAKHIMPUR","nlp":"LAKHIMPUR","nl":"LAKHIMPUR","dhk":"LAKHIMPUR",
                                          "kar":"KARIMGANJ","rekar":"KARIMGANJ","re-kar":"KARIMGANJ","karimganj":"KARIMGANJ","badarpur":"KARIMGANJ",
                                          "gmda":"KAMRUP METRO","ghy east":"KAMRUP METRO","ghy.east":"KAMRUP METRO","ghy. east":"KAMRUP METRO","ghyeast":"KAMRUP METRO","g.east":"KAMRUP METRO","ghy east":"KAMRUP METRO","ghy west":"KAMRUP METRO","ge":"KAMRUP METRO","ghy.west":"KAMRUP METRO","ghy. west":"KAMRUP METRO","ghywest":"KAMRUP METRO",
                                          "kok":"KOKRAJHAR",
                                          "rangia":"KAMRUP",
                                         }

In [None]:
# METHOD-1

In [None]:
# FIND THE DISTRICT NAME PRESENT IN TENDER SLUG. CUSTOMISE HOW THE TENDER SLUG IS NEEDED. 
idea_frm_tenders_df['tender_district'] = None
idea_frm_tenders_df['explain_geocode'] = None
for idx, row in idea_frm_tenders_df.iterrows():
    tender_slug = str(row['location'])+ ' ' + str(row['tender_externalreference']) + ' ' + str(row['tender_title']) + ' ' + str(row['Work Description'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    for district in assam_districts:
        if re.findall(r'\b%s\b'%district.lower().strip(), tender_slug.lower()):
            idea_frm_tenders_df.loc[idx,'tender_district'] = district
            idea_frm_tenders_df.loc[idx,'explain_geocode'] = 'District: '+ str(re.findall(r'\b%s\b'%district.lower().strip(), tender_slug.lower()))
            break

In [None]:
idea_frm_tenders_df.tender_district.dropna().shape
# 2463 tenders are already mapped to the districts. Let's check remaining tenders with absolyte revenue circles and sub-districts

In [None]:
#FIND THE REVENUE CIRCLE PRESENT IN TENDER SLUG. USE THE DICTIONARIES CREATED TO FIND THE DISTRICT.

## PRIORITISE LOCATION COLUMN FIRST
for idx, row in idea_frm_tenders_df.iterrows():
    if row['tender_district'] != None:
        continue
    
    tender_slug = str(row['location'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    
    for revenue_circle in assam_revenue_circles:
        if re.findall(r'\b%s\b'%revenue_circle.lower().strip(), tender_slug.lower()):
            idea_frm_tenders_df.loc[idx,'tender_district'] = assam_revenue_circles_dict[revenue_circle]['district_2']
            idea_frm_tenders_df.loc[idx,'explain_geocode'] = 'Rev-Circle: '+ str(re.findall(r'\b%s\b'%revenue_circle.lower().strip(), tender_slug.lower()))
            break
            
for idx, row in idea_frm_tenders_df.iterrows():
    if row['tender_district'] != None:
        continue
    
    tender_slug = str(row['tender_externalreference']) + ' ' + str(row['tender_title']) + ' ' + str(row['Work Description'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    
    for revenue_circle in assam_revenue_circles:
        if re.findall(r'\b%s\b'%revenue_circle.lower().strip(), tender_slug.lower()):
            idea_frm_tenders_df.loc[idx,'tender_district'] = assam_revenue_circles_dict[revenue_circle]['district_2']
            idea_frm_tenders_df.loc[idx,'explain_geocode'] = 'Rev-Circle: '+ str(re.findall(r'\b%s\b'%revenue_circle.lower().strip(), tender_slug.lower()))
            break

In [None]:
idea_frm_tenders_df.tender_district.dropna().shape
# 3744 tenders are already mapped to the districts. Let's check remaining tenders through absolute blocks

In [None]:
# USE THE DISTRICT IDENTIFIERS DICTIONARY TO FIND DISTRICT.
for idx, row in idea_frm_tenders_df.iterrows():
    if row['tender_district'] != None:
        continue
    
    district_identifier = str(row['tender_externalreference']).split(r'/')[0].lower()
    
    if district_identifier in three_letter_distirct_identifiers_dict:
        idea_frm_tenders_df.loc[idx,'tender_district'] = three_letter_distirct_identifiers_dict[district_identifier]
        idea_frm_tenders_df.loc[idx,'explain_geocode'] = 'District Identifier: '+ district_identifier

In [None]:
idea_frm_tenders_df.tender_district.dropna().shape
# 3834 tenders are already mapped to the districts. Let's check remaining tenders through sub-districts

In [None]:
#FIND THE SUB-DISTRICT PRESENT IN TENDER SLUG. USE THE DICTIONARIES CREATED TO FIND THE DISTRICT.

## PRIORITISE LOCATION COLUMN FIRST

for idx, row in idea_frm_tenders_df.iterrows():
    if row['tender_district'] != None:
        continue
    
    tender_slug = str(row['location'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    
    for sub_district in assam_sub_districts:
        if re.findall(r'\b%s\b'%sub_district.lower(), tender_slug.lower()):
            idea_frm_tenders_df.loc[idx,'tender_district'] = assam_subdist_dict[sub_district]['district_2']
            idea_frm_tenders_df.loc[idx,'explain_geocode'] = 'Sub-District: '+ str(re.findall(r'\b%s\b'%sub_district.lower().strip(), tender_slug.lower()))
            break

for idx, row in idea_frm_tenders_df.iterrows():
    if row['tender_district'] != None:
        continue
    
    tender_slug = str(row['tender_externalreference']) + ' ' + str(row['tender_title']) + ' ' + str(row['Work Description'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    
    for sub_district in assam_sub_districts:
        if re.findall(r'\b%s\b'%sub_district.lower(), tender_slug.lower()):
            idea_frm_tenders_df.loc[idx,'tender_district'] = assam_subdist_dict[sub_district]['district_2']
            idea_frm_tenders_df.loc[idx,'explain_geocode'] = 'Sub-District: '+ str(re.findall(r'\b%s\b'%sub_district.lower().strip(), tender_slug.lower()))
            break

In [None]:
idea_frm_tenders_df.tender_district.dropna().shape
# 3839 tenders are already mapped to the districts. Let's check remaining tenders through absolute blocks

In [None]:
# FIND THE BLOCK PRESENT IN TENDER SLUG. USE THE DICTIONARIES CREATED TO FIND THE DISTRICT.

for idx, row in idea_frm_tenders_df.iterrows():
    if row['tender_district'] != None:
        continue
    tender_slug = str(row['location'])+ ' ' + str(row['tender_externalreference']) + ' ' + str(row['tender_title']) + ' ' + str(row['Work Description'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    
    for block in assam_blocks:
        if re.findall(r'\b%s\b'%block.lower().strip(), tender_slug.lower()):
            idea_frm_tenders_df.loc[idx,'tender_district'] = assam_blocks_dict[block]['district_2']
            idea_frm_tenders_df.loc[idx,'explain_geocode'] = 'Block: '+ str(re.findall(r'\b%s\b'%block.lower().strip(), tender_slug.lower()))
            break

In [None]:
idea_frm_tenders_df.tender_district.dropna().shape
# 3903 tenders are already mapped to the districts. Let's check remaining tenders through absolute villages

In [None]:
#FIND THE VILLAGE PRESENT IN TENDER SLUG. USE THE DICTIONARIES CREATED TO FIND THE DISTRICT.

for idx, row in idea_frm_tenders_df.iterrows():
    if row['tender_district'] != None:
        continue
    print(idx)
    tender_slug = str(row['location'])+ ' ' + str(row['tender_externalreference']) + ' ' + str(row['tender_title']) + ' ' + str(row['Work Description'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    
    for village in assam_villages:
        if not re.search('[a-zA-Z]', village):
            continue 
        village = re.sub(r"[\[\]]?", "", village)
        if village in VILLAGE_CORRECTION_DICT:
            village = VILLAGE_CORRECTION_DICT[village]
        if re.findall(r'\b%s\b'%village.lower().strip(), tender_slug.lower()):
            idea_frm_tenders_df.loc[idx,'tender_district'] = assam_villages_dict[village]['district_2']
            idea_frm_tenders_df.loc[idx,'explain_geocode'] = 'Village: '+ str(re.findall(r'\b%s\b'%village.lower().strip(), tender_slug.lower()))
            break

In [None]:
idea_frm_tenders_df.tender_district.dropna().shape
#  tenders are already mapped to the districts.

In [None]:
# METHOD-2 WEIGHTAGE METHOD

In [None]:
# GET TENDER DISTRICT BASED ON externalReference COLUMN

idea_frm_tenders_df['tender_district_externalReference'] = None
for idx, row in idea_frm_tenders_df.iterrows():
    
    district_identifier = str(row['tender_externalreference']).split(r'/')[0].lower()
    if 'rgr' in district_identifier:
        district_identifier = district_identifier.split('rgr')[0].strip()[:-1]
    
    if district_identifier in three_letter_distirct_identifiers_dict:
        idea_frm_tenders_df.loc[idx,'tender_district_externalReference'] = three_letter_distirct_identifiers_dict[district_identifier]
        
for idx, row in idea_frm_tenders_df.iterrows():
    if row['tender_externalreference'] != None:
        continue
    tender_slug = str(row['tender_externalreference'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    for district in assam_districts:
        if re.findall(r'\b%s\b'%district.lower().strip(), tender_slug.lower()):
            idea_frm_tenders_df.loc[idx,'tender_district_externalReference'] = district
            break
            
## REVENUE
for idx, row in idea_frm_tenders_df.iterrows():
    if row['tender_externalreference'] != None:
        continue
    
    tender_slug = str(row['tender_externalreference'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    
    for revenue_circle in assam_revenue_circles:
        if re.findall(r'\b%s\b'%revenue_circle.lower().strip(), tender_slug.lower()):
            idea_frm_tenders_df.loc[idx,'tender_district_externalReference'] = assam_revenue_circles_dict[revenue_circle]['district_2']
            break

            
## SUB DISTRICT
for idx, row in idea_frm_tenders_df.iterrows():
    if row['tender_externalreference'] != None:
        continue
    
    tender_slug = str(row['tender_externalreference'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    
    for sub_district in assam_sub_districts:
        if re.findall(r'\b%s\b'%sub_district.lower(), tender_slug.lower()):
            idea_frm_tenders_df.loc[idx,'tender_district_externalReference'] = assam_subdist_dict[sub_district]['district_2']
            break

In [None]:
# GET TENDER DISTRICT BASED ON TITLE AND WORK DESCRIPTION

idea_frm_tenders_df['tender_district_title_description'] = None
for idx, row in idea_frm_tenders_df.iterrows():
    tender_slug = str(row['tender_title']) + ' ' + str(row['Work Description'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    for district in assam_districts:
        if re.findall(r'\b%s\b'%district.lower().strip(), tender_slug.lower()):
            idea_frm_tenders_df.loc[idx,'tender_district_title_description'] = district
            break
            
## REVENUE
for idx, row in idea_frm_tenders_df.iterrows():
    if row['tender_district_title_description'] != None:
        continue
    
    tender_slug = str(row['tender_title']) + ' ' + str(row['Work Description'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    
    for revenue_circle in assam_revenue_circles:
        if re.findall(r'\b%s\b'%revenue_circle.lower().strip(), tender_slug.lower()):
            idea_frm_tenders_df.loc[idx,'tender_district_title_description'] = assam_revenue_circles_dict[revenue_circle]['district_2']
            break

            
## SUB DISTRICT
for idx, row in idea_frm_tenders_df.iterrows():
    if row['tender_district_title_description'] != None:
        continue
    
    tender_slug = str(row['tender_title']) + ' ' + str(row['Work Description'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    
    for sub_district in assam_sub_districts:
        if re.findall(r'\b%s\b'%sub_district.lower(), tender_slug.lower()):
            idea_frm_tenders_df.loc[idx,'tender_district_title_description'] = assam_subdist_dict[sub_district]['district_2']
            break

In [None]:
# GET TENDER DISTRICT BASED ON LOCATION COLUMN
idea_frm_tenders_df['tender_district_location'] = None
for idx, row in idea_frm_tenders_df.iterrows():
    tender_slug = str(row['location']) 
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    for district in assam_districts:
        if re.findall(r'\b%s\b'%district.lower().strip(), tender_slug.lower()):
            idea_frm_tenders_df.loc[idx,'tender_district_location'] = district
            break
            
## REVENUE
for idx, row in idea_frm_tenders_df.iterrows():
    if row['tender_district_location'] != None:
        continue
    
    tender_slug = str(row['location'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    
    for revenue_circle in assam_revenue_circles:
        if re.findall(r'\b%s\b'%revenue_circle.lower().strip(), tender_slug.lower()):
            idea_frm_tenders_df.loc[idx,'tender_district_location'] = assam_revenue_circles_dict[revenue_circle]['district_2']
            break

            
## SUB DISTRICT
for idx, row in idea_frm_tenders_df.iterrows():
    if row['tender_district_location'] != None:
        continue
    
    tender_slug = str(row['location'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    
    for sub_district in assam_sub_districts:
        if re.findall(r'\b%s\b'%sub_district.lower(), tender_slug.lower()):
            idea_frm_tenders_df.loc[idx,'tender_district_location'] = assam_subdist_dict[sub_district]['district_2']
            break

In [None]:
# BTC FLAG
idea_frm_tenders_df['BTC_flag'] = None
for idx, row in idea_frm_tenders_df.iterrows():
    BTC_flag = False
    
    #tender_slug = str(row['Tender ID']) + ' ' + str(row['tender_title']) + ' ' + str(row['Work Description'] + ' ' + str(row['location']))
    #tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    
    #skip Bodoland tenders
    department_slug = str(row["Organisation Chain"] + ' ' + row["Department"])
    department_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', department_slug)
    if re.findall(r"bodoland", department_slug.lower()):
        BTC_flag= True
    
    bodoland_dept_slugs = ["BoTC", "BTC"]
    for slug in bodoland_dept_slugs:
        if slug in row["Tender ID"]:
            BTC_flag= True

    idea_frm_tenders_df.loc[idx,'BTC_flag'] = BTC_flag
    

In [None]:
# WEIGHTAGE LOGIC
idea_frm_tenders_df = pd.read_csv('IDEA_FRM_DISTRICT_GEOTAG_ABSOLUTE_v3.csv')
idea_frm_tenders_df['tender_district_externalReference'].fillna('NA',inplace=True) 
idea_frm_tenders_df['tender_district_title_description'].fillna('NA',inplace=True) 
idea_frm_tenders_df['tender_district_location'].fillna('NA',inplace=True) 

idea_frm_tenders_df['DISTRICT_FINALISED'] = ''
for idx, row in idea_frm_tenders_df.iterrows():
    district1 = row['tender_district_externalReference']
    district2 = row['tender_district_title_description']
    district3 = row['tender_district_location']
    districts = [district1,district2,district3]
    districts = set([x for x in districts if x!='NA'])
    if len(districts)==1:
        DISTRICT_SELECTED = list(districts)[0]
    elif len(districts)==0:
        DISTRICT_SELECTED = 'NA'
    else:
        DISTRICT_SELECTED = 'CONFLICT'
    
    idea_frm_tenders_df.loc[idx,'DISTRICT_FINALISED'] = DISTRICT_SELECTED

In [None]:
idea_frm_tenders_df.to_csv('OutputData/IDEA_FRM_DISTRICT_GEOTAG_ABSOLUTE_v4.csv',index=False)

## Geo-Coding: Revenue Circles, Blocks and Villages <a class="anchor" id="revcircle"></a>

In [32]:
#TAKING INPUT DATA GROM GOOGLE SHEET DIRECTLY
idea_frm_google_sheet =google_client.open('IDEA-FRM_filtered_tenders_with_metadata_geocoded_DISTRICTS')
geocoded_districts = idea_frm_google_sheet.worksheet('title','IDEA-FRM_filtered_tenders_with_metadata_geocoded_DISTRICTS')

In [33]:
geocoded_districts_df = geocoded_districts.get_as_df()

In [38]:
#SAMPLE CODE FOR ONE DISTRICT
FOCUS_DISTRICT = "KAMRUP"

In [40]:
idea_frm_tenders_df_FOCUSDISTRICT = geocoded_districts_df[geocoded_districts_df["DISTRICT_FINALISED_V2"] == FOCUS_DISTRICT]
idea_frm_tenders_df_FOCUSDISTRICT.sample(3)

Unnamed: 0.1,Unnamed: 0,Tender ID,tender_externalreference,tender_title,Work Description,Tender Category,Tender Type,Form of Contract,Product Category,Fiscal Year,...,explain_geocode (used village and block identifier as well),tender_district_externalReference,tender_district_title_description,tender_district_location,DISTRICT_FINALISED,tender_district(Jeeno's request)\n\n[tender_title + externalRef + WorkDesc],explain_geocode(Jeeno's request),DISTRICT_FINALISED_V2,district_liz,revenuecircle_liz
1626,10479,2018_DoWR_8502_1,GW/SDRF/2017-18/II/6,IM at Mutkuchi,Immediate measures to restore L/B embankment o...,Works,Open Tender,Works,Civil Works,2018-2019,...,Rev-Circle: ['rangia'],,,KAMRUP,KAMRUP,,,KAMRUP,,
3130,25038,2020_PWD_18240_1,No. GRC/NIT/510/2018-19/ 1831 Dated.27.07.2020,KAM/GRC/SOPD(G)/ 2020-21/11.Construction of Ro...,Construction of Road from Garamsung Vithera Ch...,Works,Open Tender,Item Rate,Civil Works ��� Roads,2020-2021,...,Rev-Circle: ['hajo'],,KAMRUP,KAMRUP,KAMRUP,KAMRUP,"Rev-Circle: ['hajo', 'hajo', 'hajo']",KAMRUP,,
3557,31817,2022_PWD_25691_2,JT/SDRF/2021-22/39,Repairs and Restoration of Flood damaged Road ...,Repairs and Restoration of Flood damaged Road ...,Works,Open Tender,Works,Civil Works,2022-2023,...,"Rev-Circle: ['goroimari', 'goroimari']",,KAMRUP,,KAMRUP,KAMRUP,"Rev-Circle: ['goroimari', 'goroimari']",KAMRUP,,


In [48]:
FOCUSDIST_village_dict = {}
FOCUSDIST_block_dict = {}
FOCUSDIST_subdistrict_dict = {}
FOCUSDIST_revcircle_dict = {}
FOCUSDIST_district_dict = {}

for index,row in ASSAM_VILLAGES[ASSAM_VILLAGES.district_2==FOCUS_DISTRICT].iterrows():
    if row["VILNAM_SOI"]:
        row["VILNAM_SOI"] = re.sub(r"[\[\]]?", "", row["VILNAM_SOI"])
        if row["VILNAM_SOI"] in VILLAGE_CORRECTION_DICT:
            row["VILNAM_SOI"] = VILLAGE_CORRECTION_DICT[row["VILNAM_SOI"]]
        
        FOCUSDIST_village_dict[row["VILNAM_SOI"]] = {"village_id" : row["OBJECTID"],
                                                 "block_name" : row["block_name"],
                                                 "subdistrict" : row["sdtname_2"],
                                                 "revenuecircle": row["revenue_ci"],
                                                 "district_2" : row["district_2"]}
    
    FOCUSDIST_block_dict[row["block_name"]] = {"subdistrict" : row["sdtname_2"],
                                           "revenuecircle": row["revenue_ci"],
                                           "district_2" : row["district_2"]}
    
    FOCUSDIST_subdistrict_dict[row["sdtname_2"]] = {"district_2" : row["district_2"]} 
    FOCUSDIST_revcircle_dict[row["revenue_ci"]] = {"district_2" : row["district_2"]} 
    FOCUSDIST_district_dict[row["district"]] = True

try:
    del FOCUSDIST_village_dict['RIVER']
    del FOCUSDIST_block_dict['JORHAT']
except:
    pass

In [51]:
FOCUSDIST_villages = list(FOCUSDIST_village_dict.keys())
FOCUSDIST_blocks = list(FOCUSDIST_block_dict.keys())
FOCUSDIST_subdistricts = list(FOCUSDIST_subdistrict_dict.keys())
FOCUSDIST_revcircles = list(FOCUSDIST_revcircle_dict.keys())

In [52]:
#GEO-CODING REVENUE CIRCLES, BLOCKS, VILLAGES
for idx, row in idea_frm_tenders_df_FOCUSDISTRICT.iterrows():
    tender_villages = []
    tender_village_id = ""
    tender_block = ""
    tender_revenueci = ""
    tender_subdistrict = ""
    
    tender_slug = str(row['tender_externalreference']) + ' ' + str(row['tender_title']) + ' ' + str(row['Work Description'])
    tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)
    
    for village in FOCUSDIST_villages:
        if not re.search('[a-zA-Z]', village):
            continue 
        village = re.sub(r"[\[\]]?", "", village)
        if village in VILLAGE_CORRECTION_DICT:
            village = VILLAGE_CORRECTION_DICT[village]
        if re.findall(r'\b%s\b'%village.lower().strip(), tender_slug.lower()):
            tender_villages.append(village)
            tender_village_id = FOCUSDIST_village_dict[village]['village_id']
            tender_block = FOCUSDIST_village_dict[village]['block_name']
            tender_revenueci = FOCUSDIST_village_dict[village]['revenuecircle']
            tender_subdistrict = FOCUSDIST_village_dict[village]['subdistrict']
    
        
    for block in FOCUSDIST_blocks:
        if re.findall(r'\b%s\b'%block.lower().strip(), tender_slug.lower()):
            tender_block = block
            tender_revenueci = FOCUSDIST_block_dict[block]['revenuecircle']
            tender_subdistrict = FOCUSDIST_block_dict[block]['subdistrict']
            break
            
    for revenue_circle in FOCUSDIST_revcircles:
        if re.findall(r'\b%s\b'%revenue_circle.lower().strip(), tender_slug.lower()):
            tender_revenueci = revenue_circle
            break
    
    for subdistrict in FOCUSDIST_subdistricts:
        if re.findall(r'\b%s\b'%subdistrict.lower().strip(), tender_slug.lower()):
            tender_subdistrict = subdistrict
            break
    
      
    idea_frm_tenders_df_FOCUSDISTRICT.loc[idx,'tender_villages'] = str(tender_villages)[1:-1]
    idea_frm_tenders_df_FOCUSDISTRICT.loc[idx,'tender_block'] = tender_block
    idea_frm_tenders_df_FOCUSDISTRICT.loc[idx,'tender_subdistrict'] = tender_subdistrict
    idea_frm_tenders_df_FOCUSDISTRICT.loc[idx,'tender_revenueci'] = tender_revenueci

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [53]:
idea_frm_tenders_df_FOCUSDISTRICT

Unnamed: 0.1,Unnamed: 0,Tender ID,tender_externalreference,tender_title,Work Description,Tender Category,Tender Type,Form of Contract,Product Category,Fiscal Year,...,DISTRICT_FINALISED,tender_district(Jeeno's request)\n\n[tender_title + externalRef + WorkDesc],explain_geocode(Jeeno's request),DISTRICT_FINALISED_V2,district_liz,revenuecircle_liz,tender_villages,tender_block,tender_subdistrict,tender_revenueci
18,344,2016_DoWR_1181_1,GW/SDRF/2016-17/1,IM B/dyke Badiapathar,IM B/dyke Badiapathar,Works,Open Tender,Works,Civil Works,2016-2017,...,KAMRUP,,,KAMRUP,,,,,,
263,1424,2017_DoWR_1840_1,G.W/NABARD/2016-17/2,G.W/NABARD/2016-17/2,Pro Siltation measures to protect sarulah-Barl...,Works,Open Tender,Works,Civil Works,2016-2017,...,KAMRUP,KAMRUP,Village: ['sarulah'],KAMRUP,,,'SARULAH',HAJO,Kamalpur,Hajo
302,1546,2017_DoWR_1961_1,Ghy West/SDRF/2016-17/8,I.M to save Rangia Town,Immediate measures to save Rangia Town from fl...,Works,Open Tender,Works,Civil Works,2016-2017,...,CONFLICT,KAMRUP,"Rev-Circle: ['rangia', 'rangia']",KAMRUP,,,,RANGIA,Rangia,Rangia
569,3788,2017_PWD_3540_1,T/GRC/ARMF(PBMC)/2016-17/493/,T/GRC/ARMF(PBMC)/2016-17/493/,Repairing / Reconstruction of RCC Slab Culvert...,Works,Open Tender,Item Rate,Civil Works ��� Roads,2017-2018,...,KAMRUP,KAMRUP,Rev-Circle: ['rangia'],KAMRUP,,,"'KAMALPUR', 'MUKTAPUR'",RANGIA,Rangia,Rangia
763,5212,2017_DoWR_4477_1,FREMAA/Palasbari/PGP/P1 of 2017,Bank protection works from Dakhala to Guimara ...,"Dumping of Geo-bags, earth work in bank trimmi...",Works,Open Tender,Works,Civil Works,2017-2018,...,KAMRUP,KAMRUP,"Rev-Circle: ['palasbari', 'palasbari']",KAMRUP,,,"'GUIMARA', 'DAKHALA'",RAMPUR,Palasbari,Palasbari
1607,10280,2018_DoWR_8316_1,GW/SDRF/2017-18/II/3,I M at Noona,Immediate measures to restoration of damaged s...,Works,Open Tender,Works,Civil Works,2018-2019,...,KAMRUP,KAMRUP,Village: ['niteni'],KAMRUP,,,'NITENI',RANGIA,Rangia,Koya
1625,10478,2018_DoWR_8501_1,GW/SDRF/2017-18/II/5,IM at Gathijan,Immediate measures to restore Gathijan and its...,Works,Open Tender,Works,Civil Works,2018-2019,...,KAMRUP,,,KAMRUP,,,,,,
1626,10479,2018_DoWR_8502_1,GW/SDRF/2017-18/II/6,IM at Mutkuchi,Immediate measures to restore L/B embankment o...,Works,Open Tender,Works,Civil Works,2018-2019,...,KAMRUP,,,KAMRUP,,,,,,
1646,10576,2018_DoWR_8583_1,GW/SDRF/2017-18/II/8,IM to restoration of Chamaria satra,Immediate measures to restoration of Chamaria ...,Works,Open Tender,Works,Civil Works,2018-2019,...,KAMRUP,KAMRUP,"Rev-Circle: ['chamaria', 'chamaria']",KAMRUP,,,"'JALJALI', 'CHAMARIA SATRA'",CHAMARIA,Chamaria,Chamaria
1748,10972,2018_DoWR_8846_1,DHK/SDRF/2017-18/GOROIMARI/1,IM at Goroimari Kapahua Package 1,Immediate measures to restoration of Goroimari...,Works,Open Tender,Works,Civil Works,2018-2019,...,CONFLICT,KAMRUP,"Rev-Circle: ['goroimari', 'goroimari', 'goroim...",KAMRUP,,,,GOROIMARI,Goroimari,Goroimari


In [55]:
## CODE IT FOR ALL DISTRICTS
MASTER_DFs = []
for FOCUS_DISTRICT in ASSAM_VILLAGES.district_2.unique():
    print(FOCUS_DISTRICT)
    # Create dictionary for FOCUS DISTRICTS
    FOCUSDIST_village_dict = {}
    FOCUSDIST_block_dict = {}
    FOCUSDIST_subdistrict_dict = {}
    FOCUSDIST_revcircle_dict = {}
    FOCUSDIST_district_dict = {}
    
    for index,row in ASSAM_VILLAGES[ASSAM_VILLAGES.district_2==FOCUS_DISTRICT].iterrows():
        if row["VILNAM_SOI"]:
            row["VILNAM_SOI"] = re.sub(r"[\[\]]?", "", row["VILNAM_SOI"])
            if row["VILNAM_SOI"] in VILLAGE_CORRECTION_DICT:
                row["VILNAM_SOI"] = VILLAGE_CORRECTION_DICT[row["VILNAM_SOI"]]

            FOCUSDIST_village_dict[row["VILNAM_SOI"]] = {"village_id" : row["OBJECTID"],
                                                     "block_name" : row["block_name"],
                                                     "subdistrict" : row["sdtname_2"],
                                                     "revenuecircle": row["revenue_ci"],
                                                     "district_2" : row["district_2"]}

        FOCUSDIST_block_dict[row["block_name"]] = {"subdistrict" : row["sdtname_2"],
                                               "revenuecircle": row["revenue_ci"],
                                               "district_2" : row["district_2"]}

        FOCUSDIST_subdistrict_dict[row["sdtname_2"]] = {"district_2" : row["district_2"]} 
        FOCUSDIST_revcircle_dict[row["revenue_ci"]] = {"district_2" : row["district_2"]} 
        FOCUSDIST_district_dict[row["district"]] = True

    try:
        del FOCUSDIST_village_dict['RIVER']
        del FOCUSDIST_block_dict['JORHAT']
    except:
        pass
    
    FOCUSDIST_villages = list(FOCUSDIST_village_dict.keys())
    FOCUSDIST_blocks = list(FOCUSDIST_block_dict.keys())
    FOCUSDIST_subdistricts = list(FOCUSDIST_subdistrict_dict.keys())
    FOCUSDIST_revcircles = list(FOCUSDIST_revcircle_dict.keys())
    
    ## GEO-CODE VILLAGES, BLOCKS, REVENUE-CIRCLES
    idea_frm_tenders_df_FOCUSDISTRICT = geocoded_districts_df[geocoded_districts_df["DISTRICT_FINALISED_V2"] == FOCUS_DISTRICT]
    for idx, row in idea_frm_tenders_df_FOCUSDISTRICT.iterrows():
        tender_villages = []
        tender_village_id = ""
        tender_block = ""
        tender_revenueci = ""
        tender_subdistrict = ""

        tender_slug = str(row['tender_externalreference']) + ' ' + str(row['tender_title']) + ' ' + str(row['Work Description'])
        tender_slug = re.sub('[^a-zA-Z0-9 \n\.]', ' ', tender_slug)

        for village in FOCUSDIST_villages:
            if not re.search('[a-zA-Z]', village):
                continue 
            village = re.sub(r"[\[\]]?", "", village)
            if village in VILLAGE_CORRECTION_DICT:
                village = VILLAGE_CORRECTION_DICT[village]
            if re.findall(r'\b%s\b'%village.lower().strip(), tender_slug.lower()):
                tender_villages.append(village)
                tender_village_id = FOCUSDIST_village_dict[village]['village_id']
                tender_block = FOCUSDIST_village_dict[village]['block_name']
                tender_revenueci = FOCUSDIST_village_dict[village]['revenuecircle']
                tender_subdistrict = FOCUSDIST_village_dict[village]['subdistrict']

        for block in FOCUSDIST_blocks:
            if re.findall(r'\b%s\b'%block.lower().strip(), tender_slug.lower()):
                tender_block = block
                tender_revenueci = FOCUSDIST_block_dict[block]['revenuecircle']
                tender_subdistrict = FOCUSDIST_block_dict[block]['subdistrict']
                break

        for revenue_circle in FOCUSDIST_revcircles:
            if re.findall(r'\b%s\b'%revenue_circle.lower().strip(), tender_slug.lower()):
                tender_revenueci = revenue_circle
                break

        for subdistrict in FOCUSDIST_subdistricts:
            if re.findall(r'\b%s\b'%subdistrict.lower().strip(), tender_slug.lower()):
                tender_subdistrict = subdistrict
                break


        idea_frm_tenders_df_FOCUSDISTRICT.loc[idx,'tender_villages'] = str(tender_villages)[1:-1]
        idea_frm_tenders_df_FOCUSDISTRICT.loc[idx,'tender_block'] = tender_block
        idea_frm_tenders_df_FOCUSDISTRICT.loc[idx,'tender_subdistrict'] = tender_subdistrict
        idea_frm_tenders_df_FOCUSDISTRICT.loc[idx,'tender_revenueci'] = tender_revenueci
        
    MASTER_DFs.append(idea_frm_tenders_df_FOCUSDISTRICT)    

KOKRAJHAR


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


DHUBRI


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


GOALPARA


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


BARPETA


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


BAJALI
MORIGAON


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

NAGAON


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


SONITPUR


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


LAKHIMPUR


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


DHEMAJI


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


TINSUKIA


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


DIBRUGARH


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


SIVSAGAR


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


JORHAT


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


GOLAGHAT


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


K.ANGLONG


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


DIMA HASAO
CACHAR


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

KARIMGANJ


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


HAILAKANDI


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


BONGAIGAON


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


CHIRANG


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


KAMRUP


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


KAMRUP METRO


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


NALBARI


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


BAKSA


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


TAMULPUR


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


DARRANG


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


UDALGURI


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


CHARAIDEO


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


BISWANATH


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


HOJAI


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

WEST KARBI ANGLONG
SOUTH SALMARA MANCACHAR


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


MAJULI


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


None


In [60]:
MASTER_DFs.append(geocoded_districts_df[geocoded_districts_df["DISTRICT_FINALISED_V5"] == 'NA'])

## Identify river names in flood related tender titles.

In [None]:
import enchant
d = enchant.Dict("en_UK")

In [None]:
def extract_river_name_from_title(title):
    title = title.replace(',', ' ').replace('_', ' ').replace('(', ' ').replace(')', ' '). \
                    replace('.', ' ').lower().split()    
    try:
        #This checks if there is 'river' in the title.
        title.index('river')
        
        try:
            #This checks if there is no suffix to the word 'river' in title (river word comes at last)
            suffix = title[title.index('river')+1]
            # If suffix is a english word, then prefix is the river name.  Vice versa.
            if d.check(suffix):  #If there is no suffix, prefix has to be river name.
                river_identified = prefix
            else:
                river_identified = suffix
            
            return river_identified
        
        except:
            prefix =  title[title.index('river')-1]
            if not d.check(prefix):  #If prefix is not an English word, it has to be river name.
                river_identified = prefix
            else:
                river_identified = None
            return river_identified
        
    except:
        return None

In [None]:
tender_titles = flood_df['tender_title'].astype(str) + ' ' + flood_df['Work Description'].astype(str)
tender_titles = pd.DataFrame(tender_titles,columns=['tender_title'])

In [None]:
flood_df['river_names'] = tender_titles.tender_title.apply(extract_river_name_from_title)
flood_df.head()

In [None]:
def standardise_river_names(river_names):

    r1 = list(set(river_names).copy() - set([None]))
    r1.sort(reverse=True)
    r2 = list(set(river_names).copy() - set([None]))
    r2.sort()
    
    def remove_spelling_mistakes(r1, r2):
        for i in r1:
            scores = []
            for j in r2:
                scores.append(fuzz.ratio(i.lower(), j.lower()))
            while max(scores) == 100:
                scores[scores.index(100)] = -1

            if max(scores) >= 80:
                change = r2[scores.index(max(scores))]
                to_delete = r1[r1.index(i)]
                r1[r1.index(i)] = change
                r2[r2.index(to_delete)] = change
        return r1, r2
    
    
    k = True
    elbow = []
    while k:
        r1, r2 = remove_spelling_mistakes(r1, r2)
        elbow.append(len(set(r2)))
        if (len(elbow) >= 2):
            if (elbow[-1] == elbow[-2]):
                k = False
    # Elbow reached in two iterations.

    # Manually removing remaining bad elements
    river_names_std = set(r1) - set(['Bank', 'Training', 'Erosion', 'Front', 'Course', 'District','EROSION','BANK','TRAINING','Side',
                                     'River', 'Embankment','Aesthetic','Construction','Western','Recoupment','Protection',
                                     'MPWSS','Guwahati','Group','Emabnkment','None','b/b','l/b','nan','nala','pk1','pk2'])
    return river_names_std

In [None]:
assam_rivers  = pd.DataFrame(standardise_river_names(flood_df.river_names.astype(str)))
assam_rivers.columns=['river_name']
assam_rivers.to_csv('assam_rivers.csv',index=False)
# Manually check for any discrepancies. 

In [None]:
assam_rivers = pd.read_csv('assam_rivers.csv')
assam_rivers

In [None]:
river_match_dict = dict()

#river_match_df = pd.read_csv("river_match.csv")
#river_match_dict = river_match_df.set_index('messy_name').to_dict()['original_name']

def match_rivers(query):
    if query in river_match_dict.keys():
        return river_match_dict[query]
    assam_rivers['key'] = query.lower()
    assam_rivers['key'] = assam_rivers['key'].astype('str')
    #print(assam_rivers.river_name[assam_rivers.apply(similar, axis=1).argmax()])
    if assam_rivers.apply(similar, axis=1).max() > 0.77:
        river_match_dict[query] = assam_rivers.river_name[assam_rivers.apply(similar, axis=1).argmax()]
        return assam_rivers.river_name[assam_rivers.apply(similar, axis=1).argmax()]
    else:
        river_match_dict[query] = None
        return None

In [None]:
flood_df['std_name'] = flood_df.river_names.astype(str).apply(match_rivers)

In [None]:
flood_df.std_name.shape[0]-flood_df.std_name.replace('None',np.nan).replace('nan',np.nan).isnull().sum()

In [None]:
flood_df

In [None]:
river_match_df = pd.DataFrame.from_dict(river_match_dict,orient='index').reset_index()

river_match_df.columns=['messy_name','original_name']
river_match_df.to_csv('river_match.csv',index=False)