In [19]:
import pandas as pd
import xlrd
import arcpy
from pandas import ExcelWriter
from datetime import datetime
import os

In [20]:
now = datetime.now().strftime('%Y%m%d%H%M%S')
temp_path = r'C:\temp\data_processing_exports'
questionnaire_file = r'C:\git\hh_survey\R3 questionnaire_GeoPoll_final.xlsx'
coded_values_file = os.path.join(temp_path, "coded_values_%s.xlsx" % now)
writer = pd.ExcelWriter(coded_values_file, engine='xlsxwriter')
field_names_list = []
max_counter = 3000 #for testing purposes, we may need to limit the execution only to some items

In [21]:
def importallsheets(in_excel, out_gdb):
    ###this function import all sheets of an xlsx file into a separate table inside a GDB (with same name)
    print("max_counter: %s" % max_counter)
    counter = 0
    workbook = xlrd.open_workbook(in_excel)
    sheets = [sheet.name for sheet in workbook.sheets()]

    print('{} sheets found: {}'.format(len(sheets), ','.join(sheets)))
    for sheet in sheets:
        counter +=1
        if counter <= max_counter:
            # The out_table is based on the input excel file name
            # a underscore (_) separator followed by the sheet name
            out_table = os.path.join(
                out_gdb,
                arcpy.ValidateTableName(
                    "{0}".format(sheet),
                    out_gdb))

            print('Converting {} to {}'.format(sheet, out_table))

            # Perform the conversion
            arcpy.ExcelToTable_conversion(in_excel, out_table, sheet)
            

def make_attribute_dict(fc, code_field, value_field):
    #this function creates a dictionary based on a GDB table
    attdict = {}
    with arcpy.da.SearchCursor(fc, [code_field, value_field]) as cursor:
        for row in cursor:
            attdict[row[0]] = row[1]
    return attdict

def fix_category_formatting(category):
     return category.replace("[","(").replace("]",")").replace("(specify)","").replace("/ ",", ").capitalize().replace("adps","ADPs").replace("idp","IDP").replace("covid","COVID")


In [22]:
##this section of the script reads the survey excel file and creates an excel file with multiple sheets:
### each sheet contains the coded value and description for a "Single choice" or "Open Ended-Select All That Apply" question.

print("Opening questionnaire DF")
quest_df = pd.read_excel(open(questionnaire_file, 'rb'), sheet_name='Questionnaire HH',skiprows=2)
#create a list of all possible numbering
numbering = ["%s)" % n for n in range(1,200)] ## 1), 2), ... 200)
# initialize list of lists that will store the results
dict_derived_fieldnames = {} #this dict will group all derived fields in case of "Select All That Apply" type of questions
##iterate the following for each row (so each question of the questionnaire)
for index, row in quest_df.iterrows():
    try:
        first_derived_fieldname = ""
        all_derived_fieldnames = []
        codes_and_labels = []
        categories = str(row['English']).replace("\t","")
        question_name = row['Suggested Qname']  #Q Name
        skip_pattern = row['Skip Pattern']
        question_type = row['Q Type']
        programming_instructions = row['Programming Instructions']
        print("\n\n----%s----" % question_name)
        #only for questions with pre-defined categories
        if question_type in ("StartRecording","Single Choice","Open Ended-Single Choice", "Open Ended-Select All That Apply","Select All That Apply"):
            if question_name == 'crp_main':
                programming_lines = programming_instructions.splitlines()
                for programming_line in programming_lines:
                    if ")" in programming_line:
                        print(programming_line)
                        index, category = programming_line.split(")")
                        category = fix_category_formatting(category)
                        codes_and_labels.append([index, category])
            else:
                #find all numbering present in the category string
                numbering_in_text = [n for n in numbering if n in categories]
                print(numbering_in_text)
                ##the following loop creates a list "codes_and_labels" with all available codes&labels for each question
                for index in range(0,len(numbering_in_text)):
                    start = categories.find(numbering_in_text[index]) + len(numbering_in_text[index])
                    try:
                        end = categories.find(numbering_in_text[index + 1])
                        substring = categories[start:end].strip()
                    except:
                        # it fails during the last loop -> the last option is usually at the end of the string
                        substring = categories[start:].strip()
                    print(substring)
                    category = fix_category_formatting(substring)
                    codes_and_labels.append([index +1, category])
                
            if question_type not in ["Open Ended-Select All That Apply","Select All That Apply"]:
                field_names_list.append(question_name)
                codes_and_labels_df = pd.DataFrame(codes_and_labels, columns=['code', 'label'])
                codes_and_labels_df.to_excel(writer, sheet_name=question_name)
            else:
                numbering_in_qname = [n for n in numbering if n in question_name]
                for index in range(0, len(numbering_in_qname)):
                    start = question_name.find(numbering_in_qname[index]) + len(numbering_in_qname[index])
                    try:
                        end = question_name.find(numbering_in_qname[index + 1])
                        derived_field_name = question_name[start:end].strip()
                    except:
                        # it fails during the last loop -> the last option is usually at the end of the string
                        derived_field_name = question_name[start:].strip()
                    if index == 0:
                        first_derived_fieldname = derived_field_name
                        codes_and_labels_df = pd.DataFrame(codes_and_labels, columns=['code', 'label'])
                        codes_and_labels_df.to_excel(writer, sheet_name=derived_field_name)
                    all_derived_fieldnames.append(derived_field_name)
                    field_names_list.append(derived_field_name)
                dict_derived_fieldnames[first_derived_fieldname] = all_derived_fieldnames
    except:
        pass
        print("failed for some reasons")


print("Saving codes and labels %s" % coded_values_file)
# Close the Pandas Excel writer and output the Excel file.
writer.save()

Opening questionnaire DF


----nan----


----nan----


----calldispo----
['1)', '2)', '3)', '4)', '5)', '6)', '7)']
Someone answers
Answering machine
No answer
Hang up or refusal
Call back
Under review
Disconnected


----calldispo_answeringmachine----


----calldispo_noanswer----


----calldispo_underreview----


----calldispo_disconnected----


----resp_language----
['1)', '2)']
English
other options [add as many as necessary]


----introduction----
['1)']
CONTINUE


----resp_agree----
['1)', '2)', '3)']
Yes
Not now but another time in the week
No


----resp_refusalwhy----


----resp_whencallback----


----callbackmessage_en----


----nan----


----NA ----


----nan----


----resp_age----


----resp_gender----
['1)', '2)', '3)', '4)']
Male
Female
DON'T KNOW
REFUSED


----hh_admin1----
['1)', '2)', '3)', '4)']
…
…
DON'T KNOW
REFUSED


----hh_admin2_1----
['1)', '2)', '3)', '4)']
…
…
DON'T KNOW
REFUSED


----hh_admin2_2----
['1)', '2)', '3)', '4)']
…
…
DON'T KNOW
REFUSED


----quotareac

['1)', '2)', '3)', '4)', '5)', '6)', '7)', '8)', '9)', '10)', '11)', '12)', '13)', '14)', '15)', '16)', '17)', '18)', '19)', '20)', '21)', '22)', '23)', '24)', '25)', '26)', '27)']
No shock
Sickness or death of household member[s] [sickness of breadwinner / unusually high medical expenditure / funeral expenses / death of breadwinner]
Lost employment or working opportunities
Other intra household shock
Much higher than usual food prices
Much higher than usual fuel prices
External event impeding the continuation of work or business affecting all - laws closing markets / businesses / road infrastructure to markets destroyed/ etc.
Other economic shock
Pest outbreak
Plant disease
Animal disease affecting many animals
Lack of physical access to pasture [because of conflict / infrastructure damage / COVID-19 restrictions]
Other crop and livestock shock
Cold temperatures or hail
Flood
Hurricane / cyclone
Drought
Earthquake
Landslides
Fire from natural disaster
Other natural hazard
Violence and

Same
Less
A lot less [less than half as much]
DON'T KNOW
REFUSED


----nan----


----fies----
['1)']
NEXT


----fies_worried----
['1)', '2)', '3)', '4)']
YES
NO
DON'T KNOW
REFUSED


----fies_healthy----
['1)', '2)', '3)', '4)']
YES
NO
DON'T KNOW
REFUSED


----fies_fewfoods----
['1)', '2)', '3)', '4)']
YES
NO
DON'T KNOW
REFUSED


----fies_skipped----
['1)', '2)', '3)', '4)']
YES
NO
DON'T KNOW
REFUSED


----fies_ateless----
['1)', '2)', '3)', '4)']
YES
NO
DON'T KNOW
REFUSED


----fies_ranout----
['1)', '2)', '3)', '4)']
YES
NO
DON'T KNOW
REFUSED


----fies_ranout_hhs----
['1)', '2)', '3)', '4)', '5)']
Rarely [once or twice]
Sometimes [in some weeks but not every week]
Often [every week]
DON'T KNOW
REFUSED


----fies_hungry----
['1)', '2)', '3)', '4)']
YES
NO
DON'T KNOW
REFUSED


----fies_hungry_hhs----
['1)', '2)', '3)', '4)', '5)']
Rarely [once or twice]
Sometimes [in some weeks but not every week]
Often [every week]
DON'T KNOW
REFUSED


----fies_whlday----
['1)', '2)', '3)', '4)']
YES


In [23]:
###this section creates a new GDB, and import each table with coded values and description.
###it can take up to 5 seconds per table - and we normally process hundreds of table.
####for testing purposes edit variable max_counter in the first cell of this notebook

gdb_name = "fGDB_with_coded_values_%s.gdb" % now
output_gdb = os.path.join(temp_path, gdb_name)
arcpy.CreateFileGDB_management(temp_path,gdb_name)
importallsheets(coded_values_file, output_gdb)

max_counter: 3000
72 sheets found: calldispo,resp_language,introduction,resp_agree,resp_gender,hh_admin1,hh_admin2_1,hh_admin2_2,quotareached,hh_agricactivity,hh_gender,hh_education,hh_wealth,hh_residencetype,hh_maritalstat,income_main,income_main_amount_conf,income_main_comp,income_sec,income_sec_amount_conf,income_sec_comp,income_third,income_third_amount_conf,income_third_comp,shock_noshock,crp_main,crp_landsize,crp_area_change,crp_harv_change,crp_proddif,crp_salesdif,crp_salesprice,crp_proc,crp_proc_state,ls_proddif,ls_salesmain,ls_salesdif,ls_salesprice,ls_proc,ls_proc_fac_state,fish_main_coastal,fish_change,fish_proddif,fish_salesdif_1,fish_salesprice,fies,fies_worried,fies_healthy,fies_fewfoods,fies_skipped,fies_ateless,fies_ranout,fies_ranout_hhs,fies_hungry,fies_hungry_hhs,fies_whlday,fies_whlday_hhs,copingstrategies,cs_spentsavings,cs_borrowmoney,cs_purchasedfoodcredit,cs_soldhhassetsgoods,cs_soldprodassets,cs_eatplantingseeds,cs_withdrewchild,cs_soldlandhouse,cs_begging,cs_s

Converting cs_soldhhassetsgoods to C:\temp\data_processing_exports\fGDB_with_coded_values_20210430160428.gdb\cs_soldhhassetsgoods
Converting cs_soldprodassets to C:\temp\data_processing_exports\fGDB_with_coded_values_20210430160428.gdb\cs_soldprodassets
Converting cs_eatplantingseeds to C:\temp\data_processing_exports\fGDB_with_coded_values_20210430160428.gdb\cs_eatplantingseeds
Converting cs_withdrewchild to C:\temp\data_processing_exports\fGDB_with_coded_values_20210430160428.gdb\cs_withdrewchild
Converting cs_soldlandhouse to C:\temp\data_processing_exports\fGDB_with_coded_values_20210430160428.gdb\cs_soldlandhouse
Converting cs_begging to C:\temp\data_processing_exports\fGDB_with_coded_values_20210430160428.gdb\cs_begging
Converting cs_soldfemanimals to C:\temp\data_processing_exports\fGDB_with_coded_values_20210430160428.gdb\cs_soldfemanimals
Converting hdds_confirmation to C:\temp\data_processing_exports\fGDB_with_coded_values_20210430160428.gdb\hdds_confirmation
Converting need 

In [24]:
###this section is temporary, to be used untill we won't have the final table structure from geopoll
###it reads the survey file and creates an excel file with all field names of the outputs table.
###then it imports the empty output table template to the GDB, and calls it survey_data
###coded values will be enforced there.

survey_empty_table_df = pd.DataFrame(columns=[field_names_list])
survey_empty_table_xlsx = os.path.join(temp_path, "survey_empty_table_%s.xlsx" % now)
writer = pd.ExcelWriter(survey_empty_table_xlsx, engine='xlsxwriter')
survey_empty_table_df.to_excel(writer, sheet_name="survey_data")
writer.save()

importallsheets(survey_empty_table_xlsx, output_gdb)

max_counter: 3000
1 sheets found: survey_data
Converting survey_data to C:\temp\data_processing_exports\fGDB_with_coded_values_20210430160428.gdb\survey_data


In [48]:
###in order to enforce coded values, all fields shuld be integer. This section alters each fields type in the survey master table

GDB_survey_table = os.path.join(output_gdb,"survey_data")
field_names = [f.name for f in arcpy.ListFields(GDB_survey_table)]
for field in field_names:
    if field in field_names_list:
        print("changing type for field %s" % field)
        try:
            if field == 'crp_main':
                arcpy.management.AlterField(GDB_survey_table, field, field_type = "TEXT")
            else:
                arcpy.management.AlterField(GDB_survey_table, field, field_type = "SHORT")
        except:
            pass

changing type for field calldispo
changing type for field resp_language
changing type for field introduction
changing type for field resp_agree
changing type for field resp_gender
changing type for field hh_admin1
changing type for field hh_admin2_1
changing type for field hh_admin2_2
changing type for field quotareached
changing type for field hh_agricactivity
changing type for field hh_gender
changing type for field hh_education
changing type for field hh_wealth
changing type for field hh_residencetype
changing type for field hh_maritalstat
changing type for field income_main
changing type for field income_main_amount_conf
changing type for field income_main_comp
changing type for field income_sec
changing type for field income_sec_amount_conf
changing type for field income_sec_comp
changing type for field income_third
changing type for field income_third_amount_conf
changing type for field income_third_comp
changing type for field shock_noshock
changing type for field shock_sickness

In [52]:
def create_and_apply_domain(master_table, domain_table, field_name, domain_already_exists):
    # Process: Create the coded value domain
    domName = '%s_dom' % domain_table
    if not domain_already_exists:
        if domain_table == 'crp_main':
            field_type = "TEXT"
        else:
            field_type = "SHORT"
        try:
            arcpy.CreateDomain_management(output_gdb, domName, field_type= field_type)
        except:
            print('domain exists')
        # Store all the domain values in a dictionary with the domain code as the "key" 
        # and the domain description as the "value" (domDict[code])

        domDict = make_attribute_dict(domain_table, 'code', 'label')
        print(domDict)

        #   dict example:  {"CI":"Cast iron", "DI": "Ductile iron", "PVC": "PVC", 
        #                "ACP": "Asbestos concrete", "COP": "Copper"}

        # Process: Add valid material types to the domain
        # use a for loop to cycle through all the domain codes in the dictionary
        for code in domDict:
            try:
                arcpy.AddCodedValueToDomain_management(output_gdb, domName, code, domDict[code])
            except:
                print("AddCodedValueToDomain_management failed")

        # Process: Constrain the material value of distribution mains

    arcpy.AssignDomainToField_management(master_table, field_name, domName) 

In [55]:
##THis section of the script searches for all domains tables in the GDB
##then, for each table  it creates a new domain,
###it stores all domain values in a dictionary, it add each code domain, and finally assign the domain to the right field (having same name)

# Set the current workspace
arcpy.env.workspace = output_gdb

# Get and print a list of tables
tables = arcpy.ListTables()
master_table = os.path.join(output_gdb,'survey_data')
for table in tables:
    if 'survey_data' not in table:
        #print(table)
        print ("Creating and Applying (new) domain for %s" % table)
        create_and_apply_domain(master_table, table, table, False) #domain_table has the same name of the field where it should be applied
        if table in dict_derived_fieldnames:
            derived_fields_list = dict_derived_fieldnames[table]
            for derived_field in derived_fields_list:
                print ("Creating and Applying (existing) domain for derivate field %s" % derived_field)
                create_and_apply_domain(master_table, table, derived_field, True)


        

Creating and Applying (new) domain for calldispo
domain exists
{1: 'Someone answers', 2: 'Answering machine', 3: 'No answer', 4: 'Hang up or refusal', 5: 'Call back', 6: 'Under review', 7: 'Disconnected'}
Creating and Applying (new) domain for resp_language
domain exists
{1: 'English', 2: 'Other options (add as many as necessary)'}
Creating and Applying (new) domain for introduction
domain exists
{1: 'Continue'}
Creating and Applying (new) domain for resp_agree
domain exists
{1: 'Yes', 2: 'Not now but another time in the week', 3: 'No'}
Creating and Applying (new) domain for resp_gender
domain exists
{1: 'Male', 2: 'Female', 3: "Don't know", 4: 'Refused'}
Creating and Applying (new) domain for hh_admin1
domain exists
{1: '…', 2: '…', 3: "Don't know", 4: 'Refused'}
Creating and Applying (new) domain for hh_admin2_1
domain exists
{1: '…', 2: '…', 3: "Don't know", 4: 'Refused'}
Creating and Applying (new) domain for hh_admin2_2
domain exists
{1: '…', 2: '…', 3: "Don't know", 4: 'Refused'}

{1: 'No shock', 2: 'Sickness or death of household member(s) (sickness of breadwinner , unusually high medical expenditure , funeral expenses , death of breadwinner)', 3: 'Lost employment or working opportunities', 4: 'Other intra household shock', 5: 'Much higher than usual food prices', 6: 'Much higher than usual fuel prices', 7: 'External event impeding the continuation of work or business affecting all - laws closing markets , businesses , road infrastructure to markets destroyed, etc.', 8: 'Other economic shock', 9: 'Pest outbreak', 10: 'Plant disease', 11: 'Animal disease affecting many animals', 12: 'Lack of physical access to pasture (because of conflict , infrastructure damage , COVID-19 restrictions)', 13: 'Other crop and livestock shock', 14: 'Cold temperatures or hail', 15: 'Flood', 16: 'Hurricane , cyclone', 17: 'Drought', 18: 'Earthquake', 19: 'Landslides', 20: 'Fire from natural disaster', 21: 'Other natural hazard', 22: 'Violence and insecurity , conflict', 23: 'Theft o

Creating and Applying (new) domain for fies_hungry_hhs
{1: 'Rarely (once or twice)', 2: 'Sometimes (in some weeks but not every week)', 3: 'Often (every week)', 4: "Don't know", 5: 'Refused'}
Creating and Applying (new) domain for fies_whlday
{1: 'Yes', 2: 'No', 3: "Don't know", 4: 'Refused'}
Creating and Applying (new) domain for fies_whlday_hhs
{1: 'Rarely (once or twice)', 2: 'Sometimes (in some weeks but not every week)', 3: 'Often (every week)', 4: "Don't know", 5: 'Refused'}
Creating and Applying (new) domain for copingstrategies
{1: 'Next'}
Creating and Applying (new) domain for cs_spentsavings
{1: 'Yes', 2: 'No - because it wasn’t necessary', 3: 'No - because you already sold those assets or did this activity within the last 12 months and you cannot continue to do it', 4: 'Not applicable', 5: "Don't know", 6: 'Refused'}
Creating and Applying (new) domain for cs_borrowmoney
{1: 'Yes', 2: 'No - because it wasn’t necessary', 3: 'No - because you already sold those assets or did th