In [1]:
import pandas as pd
import xlrd, re
import arcpy
from pandas import ExcelWriter
from datetime import datetime
import os

In [2]:
# --- Run configuration -------------------------------------------------------
startTime = datetime.now()
now = datetime.now().strftime('%Y%m%d%H%M%S')  # timestamp suffix shared by every output of this run
temp_path = r'C:\temp\data_processing_exports'
# Every output (workbooks and the file GDB) is written below temp_path, so make sure it exists
# before anything tries to write into it.
os.makedirs(temp_path, exist_ok=True)
#questionnaire_file = r'C:\git\hh_survey\household_questionnaire_geopoll_202105.xlsx' #the questionnaire file that we use for creating the table
questionnaire_file = r'C:\git\hh_survey\household_questionnaire_geopoll_EN_template_20210720_ISO3.xlsx'
coded_values_file = os.path.join(temp_path, "coded_values_%s.xlsx" % now) #intermediary output file with all categories and codes extracted from the questionnaire
# This writer is filled and then closed by the questionnaire-parsing cell; a closed writer cannot be
# written to again, so re-run this cell before re-running that one.
writer = pd.ExcelWriter(coded_values_file, engine='xlsxwriter')
field_names_list = []
max_counter = 3000 #for testing purposes, we may need to limit the execution only to some items

In [3]:
def importallsheets(in_excel, out_gdb):
    """Import the worksheets of an xlsx file as separate tables of a GDB.

    Each table is named after its worksheet, passed through
    ``arcpy.ValidateTableName`` for ``out_gdb``. At most ``max_counter``
    (module-level setting) worksheets are converted, in workbook order.

    Parameters:
        in_excel: path of the Excel workbook to read.
        out_gdb: path of the geodatabase that receives the tables.
    """
    print("max_counter: %s" % max_counter)
    book = xlrd.open_workbook(in_excel)
    sheet_names = [worksheet.name for worksheet in book.sheets()]

    print('{} sheets found: {}'.format(len(sheet_names), ','.join(sheet_names)))
    for position, sheet_name in enumerate(sheet_names, start=1):
        if position > max_counter:
            # Worksheets beyond the limit are never converted, so stop here.
            break

        # The output table takes the (validated) worksheet name.
        table_name = arcpy.ValidateTableName("{0}".format(sheet_name), out_gdb)
        out_table = os.path.join(out_gdb, table_name)

        print('Converting {} to {}'.format(sheet_name, out_table))

        # Perform the conversion
        arcpy.ExcelToTable_conversion(in_excel, out_table, sheet_name)
            

def make_attribute_dict(fc, code_field, value_field):
    """Build a lookup dictionary from two fields of a GDB table.

    Parameters:
        fc: table (or feature class) read through ``arcpy.da.SearchCursor``.
        code_field: name of the field whose values become the keys.
        value_field: name of the field whose values become the values.

    Returns:
        dict mapping each ``code_field`` value to the ``value_field`` value of
        the same row; when a code occurs more than once the last row wins.
    """
    with arcpy.da.SearchCursor(fc, [code_field, value_field]) as rows:
        return {code: value for code, value in rows}

def fix_category_formatting(category):
    """Standardize the formatting of a category description.

    Steps, in order: drop ``--...--`` annotations, turn square brackets into
    parentheses, remove "(specify)", turn "/ " into ", ", capitalize the text
    (first letter upper case, everything else lower case) and finally restore
    a few acronyms / fix known typos in the lower-cased text.
    """
    cleaned = re.sub('--[^--]+--', '', category)

    # Replacements applied to the raw text (case-sensitive, before capitalizing).
    before_capitalize = (("[", "("), ("]", ")"), ("(specify)", ""), ("/ ", ", "))
    for old, new in before_capitalize:
        cleaned = cleaned.replace(old, new)

    cleaned = cleaned.capitalize()

    # Replacements applied to the capitalized text: a term at the very start of
    # the text now begins with an upper-case letter and is therefore not matched.
    after_capitalize = (("adps", "ADPs"), ("idp", "IDP"), ("covid", "COVID"),
                        (" , ", ", "), ("staplec", "staple"))
    for old, new in after_capitalize:
        cleaned = cleaned.replace(old, new)
    return cleaned

def insert_element_in_list_after_element(old_list, new_item,after_item):
    """Return a copy of ``old_list`` with ``new_item`` placed after ``after_item``.

    ``new_item`` is added right after every element equal to ``after_item``.
    The input list is left untouched. If ``after_item`` does not occur, a
    warning is printed and an unchanged copy is returned.
    """
    insertions = 0
    expanded = []
    for element in old_list:
        expanded.append(element)
        if element == after_item:
            expanded.append(new_item)
            insertions += 1
    if not insertions:
        print('WARNING ------------------------------------ELEMENT %s NOT FOUND------ITEM %s NOT ADDED !!' % (after_item, new_item))
    return expanded

In [4]:
# Coded values of the interview language as [code, label] pairs; codes are
# consecutive and start at 1, following the order of the names below.
language_domains = [
    [code, language]
    for code, language in enumerate(
        ['English', 'French', 'Spanish', 'Arabic', 'Portuguese', 'Dari', 'Pashto', 'Sangho', 'Shona', 'Ndebele',
         'Swahili', 'Tshiluba', 'Lingala', 'Kikongo', 'Fula', 'Songhai', 'Dogon', 'Hausa', 'Zarma', 'Kanouri',
         'Somali', 'Xichangana', 'Xichope', 'Ndau', 'Xicena', 'Machuabo', 'Chichewa', 'Lomwe', 'Macua', 'Makonde'],
        start=1)
]

In [8]:
## This section reads the survey Excel file and fills the coded-values workbook (`writer`) with several sheets:
##  - one sheet per question that has predefined categories and no derived fields, holding its codes and labels;
##  - a 'yes_no' sheet, the domain used by the fields derived from "Select All That Apply" questions;
##  - a 'derived_fields' sheet listing those derived fields.
## It also creates and populates the lists that later cells use to define the names, types and domains
## of the fields of the final table.

print("Opening questionnaire DF")
# Read through a context manager so the file handle is closed as soon as the sheet is parsed.
with open(questionnaire_file, 'rb') as questionnaire_handle:
    quest_df = pd.read_excel(questionnaire_handle, sheet_name='survey', skiprows=2)
#create a list of all possible numbering
numbering = ["%s)" % n for n in range(1,200)] ## 1), 2), ... 199)
# initialize list of lists that will store the results
dict_derived_fieldnames = {} #this dict groups the derived fields of each "Select All That Apply" question, keyed by the first derived field
field_names_list = [] ##this list will contain all fields of the final table
text_type_fields = [] ##this list will contain all fields of the final table with TEXT type
range_type_fields = [] ##this list will contain all fields of the final table storing RANGE data  (will be LONG type)
double_type_fields = [] ##this list will contain all fields of the final table storing DOUBLE data
all_derived_fieldnames = [] ##this list will contain the derived fields of all "Select All That Apply" questions
all_answers_with_other_option = [] ##this list will contain all "Other: specify" fields in Select all that apply type of questions
single_choice_questions_with_other_categories = [] ##this list will contain all "Other: specify" fields in single choice type of questions
quest_df = quest_df[quest_df['Suggested Qname'].notna()] #rows without a question name are ignored
yes_no_reclassified_list = [] ##this list will contain the Yes/No questions whose codes were reclassified
##iterate the following for each row (so each question of the questionnaire)
for index, row in quest_df.iterrows():
    try:
        first_derived_fieldname = "" #the name of the first derived field will be the main of the domain table
        derived_fieldnames = []
        codes_and_labels = []
        categories = str(row['English']).replace("\t","")
        question_name = row['Suggested Qname'].strip()  #Q Name

        #renaming some fields
        if question_name == 'language2':
            question_name = 'language'

        question_type = row['Q Type']
        programming_instructions = row['Programming Instructions'] #this field contains coded values for crop_main
        print("\n\n----%s----" % question_name)
        #only for questions with pre-defined categories need domains
        if question_type in ("StartRecording","Single Choice","Open Ended-Single Choice", "Open Ended - Single Choice", "Open Ended-Select All That Apply",
                             "Select All That Apply","Open Ended - Select All That Apply "):
            if question_name == 'crp_main': #for this question only, coded values should be taken from field programming_instructions
                for programming_line in programming_instructions.splitlines():
                    if ")" in programming_line:
                        # Split on the first ")" only, so a label that itself contains ")" does not
                        # make the whole question fail.
                        raw_code, category = programming_line.split(")", 1)
                        category = fix_category_formatting(category)
                        # Codes are decimals ("1.1", "7,101") scaled by 1000; round before converting,
                        # because the float product can land just below the intended integer.
                        code = int(round(float(raw_code.replace(",", ".")) * 1000))
                        codes_and_labels.append([code, category])
                crp_final = codes_and_labels
            else:
                #find all numbering present in the category string
                numbering_in_text = [n for n in numbering if n in categories]
                ##the following loop creates a list "codes_and_labels" with all available codes&labels for each question
                for position, marker in enumerate(numbering_in_text):
                    start = categories.find(marker) + len(marker)
                    if position + 1 < len(numbering_in_text):
                        #the label ends where the next numbering starts
                        end = categories.find(numbering_in_text[position + 1])
                        substring = categories[start:end].strip()
                    else:
                        #the last option runs until the end of the string
                        substring = categories[start:].strip()
                    category = fix_category_formatting(substring)
                    codes_and_labels.append([position + 1, category])
                print("codes_and_labels: ", codes_and_labels)
                if codes_and_labels == [[1, 'Yes'], [2, 'No'], [3, "Don't know"], [4, 'Refused']]:
                    #in order to accomodate Amandine request, Yes No questions need to be reclassified
                    yes_no_reclassified_list.append(question_name)
                    codes_and_labels = [[0, 'No'], [1, 'Yes'], [2, "Don't know"], [3, 'Refused']]
                if question_name == 'language': #domains for languages are not included in the questionnaire
                    codes_and_labels = language_domains

            if question_type not in ["Open Ended-Select All That Apply","Select All That Apply","Open Ended - Select All That Apply "]:
                #so questions with NO derived fields
                field_names_list.append(question_name.strip())
                codes_and_labels_df = pd.DataFrame(codes_and_labels, columns=['code', 'label'])
                codes_and_labels_df.to_excel(writer, sheet_name=question_name)
                ##check if one of the option is other -> the question need 1 additional field for keeping the specified text
                if any(s[1].lower().strip() == "other" for s in codes_and_labels):
                    single_choice_derived_other_field = "%s_otherspecify" % question_name.strip()
                    single_choice_questions_with_other_categories.append(single_choice_derived_other_field)
                    field_names_list.append(single_choice_derived_other_field)

            else:
                #so questions with derived fields: the question name itself is a numbered list of field names
                numbering_in_qname = [n for n in numbering if n in question_name]
                for position, marker in enumerate(numbering_in_qname):
                    start = question_name.find(marker) + len(marker)
                    if position + 1 < len(numbering_in_qname):
                        end = question_name.find(numbering_in_qname[position + 1])
                        derived_field_name = question_name[start:end].strip()
                    else:
                        #the last field name runs until the end of the string
                        derived_field_name = question_name[start:].strip()
                    all_derived_fieldnames.append(derived_field_name)
                    derived_fieldnames.append(derived_field_name)
                    field_names_list.append(derived_field_name)
                    if derived_field_name[-6:] == "_other": ##this field will need to be STRING - with no domain (since it's a 'other specify')
                        all_answers_with_other_option.append(derived_field_name)
                    if position == 0:
                        #no domain sheet is written for derived fields: they use the 'yes_no' domain table
                        first_derived_fieldname = derived_field_name
                # Store this question's own derived fields. The cumulative all_derived_fieldnames list
                # used to be stored here, which made every key point at the same, ever-growing list.
                dict_derived_fieldnames[first_derived_fieldname] = derived_fieldnames
        elif question_type == "Range":
            #these questions will be associated to LONG type fields
            field_names_list.append(question_name.strip())
            range_type_fields.append(question_name)
        elif question_type == "Open Ended":
            #these questions will be associated to TEXT type fields
            if not pd.isnull(question_name): #NaN rows we want to skip (i.e. OptIn question without a name in the survey)
                field_names_list.append(question_name)
                text_type_fields.append(question_name)
        else:
            print("QUESTION SKIPPED ---------", question_name, question_type)
    except Exception as exc:
        # Best effort: keep processing the remaining questions, but say which one failed and why.
        print("failed for some reasons: question %r (row %s): %s: %s"
              % (row.get('Suggested Qname'), index, type(exc).__name__, exc))

#adding Yes No table (for derived fields domain)
d = {1:"Yes",0:"No"}
yesno_df = pd.DataFrame(list(d.items()), columns=['code', 'label'])
yesno_df.to_excel(writer, sheet_name='yes_no')
print("Saving codes and labels %s" % coded_values_file)

#creating additional sheet with all derived fields, that will be used for a script that ensures that also these values are within the domains in the output table, in a later stage
list_of_yes_no_fields = [derived_field for derived_field in all_derived_fieldnames
                         if derived_field not in all_answers_with_other_option]

derived_fields_df = pd.DataFrame(list_of_yes_no_fields)
derived_fields_df.to_excel(writer, sheet_name='derived_fields')

# Close the Pandas Excel writer and output the Excel file.
# close() is used because ExcelWriter.save() is not available in recent pandas releases.
writer.close()

Opening questionnaire DF


----calldispo----
codes_and_labels:  [[1, 'Someone answers'], [2, 'Answering machine'], [3, 'No answer'], [4, 'Hang up or refusal'], [5, 'Call back'], [6, 'Under review'], [7, 'Disconnected']]


----calldispo_answeringmachine----
QUESTION SKIPPED --------- calldispo_answeringmachine nan


----calldispo_noanswer----
QUESTION SKIPPED --------- calldispo_noanswer nan


----calldispo_underreview----
QUESTION SKIPPED --------- calldispo_underreview nan


----calldispo_disconnected----
QUESTION SKIPPED --------- calldispo_disconnected nan


----resp_language----
codes_and_labels:  [[1, 'English'], [2, 'Other options (add as many as necessary)']]


----introduction----
codes_and_labels:  [[1, 'Continue']]


----resp_agree----
codes_and_labels:  [[1, 'Yes'], [2, 'Not now but another time in the week'], [3, 'No']]


----resp_refusalwhy----
codes_and_labels:  [[1, 'Not interested'], [2, 'Do not want to be recorded'], [3, 'Other ']]


----resp_whencallback----


----cal

codes_and_labels:  [[1, 'Farmer, production and sale of staple crops'], [2, 'Farmer, production and sale of vegetable or fruit'], [3, 'Farmer, production and sale of cashcrops'], [4, 'Farmer, production and sale of livestock and livestock products'], [5, 'Farmer, production and sale of fish'], [6, 'Collection and sale of forestry or bush products'], [7, 'Informal agricultural trade excluding producers'], [8, 'Formal agricultural trade excluding producers'], [9, 'Daily wage on farms and other casual employment in agricultural sector'], [10, 'Stable employment in agricultural sector'], [11, 'Non-agricultural self-employed or liberal profession, doctor, architect, lawyer, including restaurant'], [12, 'Off-farm daily wages and other non-agricultural casual employment'], [13, 'Stable employment in non-agricultural sector'], [14, 'Public employment'], [15, 'Income not derived from work, charity'], [16, 'Income not derived from work, welfare transfer, pension, humanitarian aid'], [17, 'Income

8) crp_saledif_ref----
codes_and_labels:  [[1, 'Higher marketing costs (such as transportation)'], [2, 'Damage and losses due to delay or inability to physically access markets'], [3, 'Usual traders or local customers are not buying as much as usual'], [4, 'Prices are too low'], [5, 'Difficulty processing product (lack of processing inputs, equipment, etc)'], [6, 'Other '], [7, "Don't know"], [8, 'Refused']]


----crp_salesprice----
codes_and_labels:  [[1, 'A lot more (increased by more than 50%)'], [2, 'More'], [3, 'Same'], [4, 'Less'], [5, 'A lot less (less than half as much)'], [6, "Don't know"], [7, 'Refused']]


----crp_proc----
codes_and_labels:  [[1, 'Yes'], [2, 'No'], [3, "Don't know"], [4, 'Refused']]


----crp_proc_mach_

1) crp_proc_mach_mill
2) crp_proc_mach_thresher
3) crp_proc_mach_dryingfacilit
4)crp_proc_mach_none 
5) crp_proc_mach_other
6) crp_proc_mach_dk
7) crp_proc_mach_ref----
codes_and_labels:  [[1, 'Mill'], [2, 'Thresher'], [3, 'Drying facilities'], [4, 'None'], 

codes_and_labels:  [[1, 'Fish is more difficult to find than in previous years  in the same season'], [2, 'Concerns and restrictions related to COVID-19 pandemic'], [3, 'Difficulty accessing fuel (prices higher or not available)'], [4, 'Difficulty accessing fishing material and-or other inputs (prices higher or not available)'], [5, 'Labour too expensive or income insufficient to hire labour'], [6, 'Other '], [7, "Don't know"], [8, 'Refused']]


----fish_inputdif_

1) fish_inputdif_bait
2)fish_inputdif_net
3) fish_inputdif_gear
4) fish_inputdif_ice
5) fish_inputdif_fuel
6) fish_inputdif_boatrepairs
7) fish_inputdif_other 
8)fish_inputdif_dk
9) fish_inputdif_ref----
codes_and_labels:  [[1, 'Bait'], [2, 'Net'], [3, 'Gear'], [4, 'Ice'], [5, 'Fuel'], [6, 'Boat repairs'], [7, 'Other '], [8, "Don't know"], [9, 'Refused']]


----fish_salesmain----
codes_and_labels:  [[1, 'Fresh fish (any kind)'], [2, 'Dry fish (any kind)'], [3, 'Smoked fish'], [4, 'Seafood (any kind)'], [5, 'Other '], [6, "Do

codes_and_labels:  [[1, 'Yes'], [2, 'No - because it wasn’t necessary'], [3, 'No - because you already sold those assets or did this activity within the last 12 months and you cannot continue to do it'], [4, 'Not applicable'], [5, "Don't know"], [6, 'Refused']]


----cs_emergency_illegal----
codes_and_labels:  [[1, 'Yes'], [2, 'No - because it wasn’t necessary'], [3, 'No - because you already sold those assets or did this activity within the last 12 months and you cannot continue to do it'], [4, 'Not applicable'], [5, "Don't know"], [6, 'Refused']]


----cs_emergency_sold_last_female----
codes_and_labels:  [[1, 'Yes'], [2, 'No - because it wasn’t necessary'], [3, 'No - because you already sold those assets or did this activity within the last 12 months and you cannot continue to do it'], [4, 'Not applicable'], [5, "Don't know"], [6, 'Refused']]


----cs_emergency_hh_migration----
codes_and_labels:  [[1, 'Yes'], [2, 'No - because it wasn’t necessary'], [3, 'No - because you already sold

  warn("Calling close() on already closed file.")


In [7]:
# Show the crop codes and labels, one [code, label] pair per line.
for crop_code_and_label in crp_final:
    print(crop_code_and_label)

[1100, 'Rice']
[1200, 'Wheat']
[1300, 'Millet']
[1400, 'Sorghum']
[1500, 'Maize']
[1600, 'Barley']
[1990, 'Other cereal']
[2100, 'Cassava']
[2200, 'Potatoe']
[2300, 'Sweet potatoes']
[2400, 'Yams']
[2500, 'Carrots']
[2600, 'Cocoyam']
[2990, 'Other tubers']
[3100, 'Beans']
[3200, 'Lentils']
[3300, 'Peas']
[3400, 'Soybeans']
[3990, 'Other pulses']
[4100, 'Cabbage']
[4200, 'Lettuce']
[4300, 'Spinach-amaranth']
[4400, 'Cassava leaves']
[4500, 'Potatoe leaves']
[4600, 'Moringa']
[4700, 'Sorrel hibiscus, rosselle']
[4990, 'Other leafy vegetables']
[5100, 'Tomatoes']
[5200, 'Cucumber']
[5300, 'Pumpkin']
[5400, 'Eggplant/aubergine']
[5500, 'Zucchini']
[5600, 'Okra']
[5700, 'Pepper']
[5800, 'Onions']
[5990, 'Other non leafy vegs']
[6100, 'Orange']
[6200, 'Lemon']
[6300, 'Grapefruit']
[6990, 'Other citrus fruit']
[7100, 'Sweet banana']
[7200, 'Plantains']
[7300, 'Papaya']
[7400, 'Mango']
[7500, 'Guava']
[7600, 'Pineapple']
[7700, 'Avocado']
[7800, 'Watermelon']
[7900, 'Dates']
[7101, 'Figs']
[71

In [7]:
### Create a new file geodatabase for this run inside temp_path.
### Its name carries the run timestamp (`now`); `output_gdb` is its full path.
gdb_name = "fGDB_with_coded_values_%s.gdb" % now
output_gdb = os.path.join(temp_path, gdb_name)
arcpy.management.CreateFileGDB(temp_path, gdb_name)

In [8]:
# print(single_choice_derived_other_field )
# print(single_choice_questions_with_other_categories)
# print(all_answers_with_other_option)


In [9]:
## Import each sheet of the coded-values workbook (codes and descriptions) as a table of the GDB.
### It can take up to 5 seconds per table - and we normally process hundreds of tables.
#### For this reason, when testing, lower the variable max_counter in the configuration cell of this notebook.


importallsheets(coded_values_file, output_gdb)

max_counter: 3000
89 sheets found: calldispo,resp_language,introduction,resp_agree,resp_refusalwhy,resp_gender,hh_admin1,hh_admin2_1,hh_admin2_2,quotareached,hh_agricactivity,hh_gender,hh_education,hh_wealth_water,hh_wealth_toilet,hh_wealth_light,hh_residencetype,hh_maritalstat,hh_age,income_main,income_main_amount_conf,income_main_comp,income_sec,income_sec_amount_conf,income_sec_comp,income_third,income_third_amount_conf,income_third_comp,crp_main,crp_landsize,crp_landright,crp_irrigation,crp_area_change,crp_harv_change,crp_proddif,crp_salesdif,crp_salesprice,crp_proc,crp_proc_state,ls_main,ls_proddif,ls_salesmain,ls_salesdif,ls_salesprice,ls_proc,ls_proc_fac_state,fish_change,fish_proddif,fish_salesmain,fish_salesdif,fish_salesprice,fies,fies_worried,fies_healthy,fies_fewfoods,fies_skipped,fies_ateless,fies_ranout,fies_ranout_hhs,fies_hungry,fies_hungry_hhs,fies_whlday,fies_whlday_hhs,copingstrategies,cs_stress_hh_assets,cs_stress_spent_savings,cs_stress_sold_more_animals,cs_stress_

In [10]:
### This cell completes the list of field names of the output table and writes an Excel file whose only sheet,
### "hh_master_table", has those fields as columns and no rows.
### That empty template is imported into the GDB later and becomes the master table, where coded values are enforced.
### NOTE: this cell extends the lists built by the questionnaire-parsing cell (field_names_list, text_type_fields, ...),
### so re-running it without re-running that cell first duplicates entries.

#insert opening fields that come from Geopoll and are not captured by the questionnaire file
opening_fields = [["survey_id",'TEXT'],["operator_id",'TEXT'],["adm0_name",'TEXT'],["adm0_iso3",'TEXT'],["adm1_pcode",'TEXT'],
                  ["adm1_name",'TEXT'],["adm2_pcode",'TEXT'],["adm2_name",'TEXT'],
                  ["survey_created_date",'TEXT'],["survey_date",'DATE'],["survey_date_text",'TEXT'],["total_case_duration",'SHORT']]#,["weight",'DOUBLE']]


#manage opening fields type
opening_fields_names_text = [i[0] for i in opening_fields if i[1] == 'TEXT']
opening_fields_names_short = [i[0] for i in opening_fields if i[1] == 'SHORT']
opening_fields_names_range = [i[0] for i in opening_fields if i[1] == 'RANGE']
opening_fields_names_date = [i[0] for i in opening_fields if i[1] == 'DATE']
opening_fields_names_double = [i[0] for i in opening_fields if i[1] == 'DOUBLE']

text_type_fields += opening_fields_names_text + single_choice_questions_with_other_categories
#text_type_fields += all_answers_with_other_option
range_type_fields += opening_fields_names_range
double_type_fields += opening_fields_names_double
date_type_fields = opening_fields_names_date

opening_field_names = [i[0] for i in opening_fields]
field_names_list = opening_field_names + field_names_list

#some fields in the questionnaire should be removed from the master table
useless_fields = ['resp_agree', 'callbackmessage_en', 'hh_admin1', 'calldispo', 'hh_admin2_1', 'hh_admin2_2', 'quotareached',
                  'resp_refusalwhy', 'resp_whencallback', 'fies','copingstrategies',
                  "introduction",'hdds_confirmation','income_main_amount_conf','income_sec_amount_conf','income_third_amount_conf',
                  'resp_language','survey_created_date','resp_refusalwhy_otherspecify']


s_useless_fields = set(useless_fields)

field_names_list = [x for x in field_names_list if x not in s_useless_fields]

#adding specific fields in specific places in the table: (new field, existing field it must follow).
#The order of the pairs matters: later pairs use fields added by earlier ones as anchors.
fields_to_insert = [
    ("adm3_pcode", "adm2_name"),
    ("adm3_name", "adm3_pcode"),
    ("round", "survey_date_text"),
    ("resp_age_rng", "resp_age"),
    ("hh_size_rng", "hh_size"),
    ("hh_age_rng", "hh_age"),
    ("ls_num_diff", "ls_num_now"),
    ("tot_income", "income_third_comp"), #double
    ("crp_main_cc", "crp_proc_state"), # long
    ("fies_rawscore", "fies_whlday_hhs"), #short
    ("fies_moderate", "fies_rawscore"),
    ("fies_severe", "fies_moderate"),
    ("fies_mod_and_sev", "fies_severe"),
    ("fies_phase_1", "fies_mod_and_sev"),
    ("fies_phase_2", "fies_phase_1"),
    ("fies_phase_3", "fies_phase_2"),
    ("fies_phase_4", "fies_phase_3"),
    ("fies_phase_5", "fies_phase_4"),
    ("lcsi", "cs_emergency_hh_migration"), #short
    ("hdds_score", "hdds_condiments"), #short
    ("hdds_class", "hdds_score"), #short
    ("fcs", "fies_phase_5"), #short
    ("fcg", "fcs"), #short
    ("hhs", "fcg"), #short
    ("hhg", "hhs"), #short
]
for new_field, anchor_field in fields_to_insert:
    field_names_list = insert_element_in_list_after_element(field_names_list, new_field, anchor_field)

FCS_fields = ["fcs_cereal_days","fcs_tubers_days","fcs_staple_days","fcs_pulses_days","fcs_fruit_days","fcs_meat_fish_days",
              "fcs_dairy_days","fcs_sugar_days","fcs_oil_days","fcs_cereal_source","fcs_tubers_source","fcs_staple_source",
              "fcs_pulses_source","fcs_fruit_source","fcs_meat_fish_source","fcs_dairy_source","fcs_sugar_source",
              "fcs_oil_source"] #they are all short, so no need to specify the field type

#the FCS fields go right after "fcs", each one following the previous one
previous_field = "fcs"
for FCS_field in FCS_fields:
    field_names_list = insert_element_in_list_after_element(field_names_list, FCS_field, previous_field) #short
    previous_field = FCS_field


rcsi_fields = ["rcsi_less_preferred_foods","rcsi_borrowed_food","rcsi_limit_portions","rcsi_restrict_adult_consumption",
               "rcsi_reduce_number_meals","rcsi_score","rcsi_class"] #they are all short, so no need to specify the field type

#the rCSI fields go right after "lcsi", each one following the previous one
previous_field = "lcsi"
for rcsi_field in rcsi_fields:
    field_names_list = insert_element_in_list_after_element(field_names_list, rcsi_field, previous_field) #short
    previous_field = rcsi_field


#declaring type of specific fields just added (if not declared, it will be integer)
text_type_fields.append("adm3_pcode")
text_type_fields.append("adm3_name")
text_type_fields.append("resp_age_rng")
text_type_fields.append("hh_size_rng")
text_type_fields.append("hh_age_rng")
text_type_fields.append("survey_date_text")
double_type_fields.append("ls_num_diff") #double because it can store negative numbers
range_type_fields.append("crp_main_cc")
# NOTE(review): "ls_num_diff" used to be appended a second time here, while "tot_income" - annotated as double
# where it is inserted above - was never declared and so would have been left as integer. Confirm DOUBLE is intended.
double_type_fields.append("tot_income")
double_type_fields.extend(["fies_moderate","fies_severe","fies_mod_and_sev","fies_phase_1","fies_phase_2",
                           "fies_phase_3","fies_phase_4","fies_phase_5"])


#each of these 'other' fields is followed by a text field "<name>specify"
other_fields_list = ['covid_other','crp_proc_mach_other','crp_proc_owner_other',
                     'crp_proddif_other','crp_saledif_other','crp_seed_other','fish_inputdif_other','fish_proddif_other',
                     'fish_saledif_other','ls_food_supply_other','ls_num_inc_dec_other',
                     'ls_proc_fac_other','ls_proc_other','ls_proddif_other','ls_salesdif_other','need_other','need_received_other']
for other_field in other_fields_list:
    field_names_list = insert_element_in_list_after_element(field_names_list, other_field + "specify", other_field)
    text_type_fields.append(other_field + "specify")

field_names_list = insert_element_in_list_after_element(field_names_list, 'crp_otherspecify', "crp_main")
text_type_fields.append("crp_otherspecify")


#insert closing fields that come from Geopoll and are not captured by the questionnaire file
closing_fields = [["weight_base",'DOUBLE'],["weight_quota",'DOUBLE'],["weight_crop_prod",'DOUBLE'],["weight_livestock_prod",'DOUBLE'],
                  ["weight_wealth",'DOUBLE'], ["weight_gender",'DOUBLE'],["weight_educ",'DOUBLE'],["weight_final",'DOUBLE'],
                  ["percentage",'DOUBLE'],
                  ["qc_step0_date",'DATE'],["qc_step1_date",'DATE'],["qc_step1_username",'TEXT'],["qc_step2_date",'DATE'],
                  ["qc_step2_username",'TEXT'],["qc_enumerator",'TEXT'],["qc_method",'TEXT']]

#manage closing fields type
closing_fields_names_text = [i[0] for i in closing_fields if i[1] == 'TEXT']
closing_fields_names_short = [i[0] for i in closing_fields if i[1] == 'SHORT']
closing_fields_names_range = [i[0] for i in closing_fields if i[1] == 'RANGE']
closing_fields_names_date = [i[0] for i in closing_fields if i[1] == 'DATE']
closing_fields_names_double = [i[0] for i in closing_fields if i[1] == 'DOUBLE']

text_type_fields += closing_fields_names_text
#text_type_fields += all_answers_with_other_option
range_type_fields += closing_fields_names_range
double_type_fields += closing_fields_names_double
date_type_fields += closing_fields_names_date

closing_field_names = [i[0] for i in closing_fields]
field_names_list =  field_names_list + closing_field_names

#we need to collect all 'other' fields from select all that apply (they are all followed by another field called
# '_otherspecify'), and apply yes no domain
other_fields_multiple_choice_yes_no_domain = [f for f in field_names_list if f[-6:] == '_other']

# NOTE(review): the field list is passed wrapped in another list, exactly as before; the layout of the
# exported sheet (and what the GDB import makes of it) depends on this, so it is deliberately left unchanged.
survey_empty_table_df = pd.DataFrame(columns=[field_names_list])
survey_empty_table_xlsx = os.path.join(temp_path, "survey_empty_table_%s.xlsx" % now)
# The context manager writes and closes the workbook on exit (ExcelWriter.save() is not available in
# recent pandas releases) and also closes it if to_excel fails.
with pd.ExcelWriter(survey_empty_table_xlsx, engine='xlsxwriter') as writer:
    survey_empty_table_df.to_excel(writer, sheet_name="hh_master_table")

field_names_list


In [11]:
###import every sheet of the empty survey workbook into the output GDB, one table per sheet

importallsheets(in_excel=survey_empty_table_xlsx, out_gdb=output_gdb)


max_counter: 3000
1 sheets found: hh_master_table
Converting hh_master_table to C:\temp\data_processing_exports\fGDB_with_coded_values_20210802172812.gdb\hh_master_table


In [12]:
##there is always an unwanted field (COL_A) to remove from the imported master table
##best effort: a failure is reported but does not stop the notebook
try:
    arcpy.DeleteField_management(os.path.join(output_gdb, "hh_master_table"),"COL_A")
except Exception as e:
    #most likely the field is simply missing, but show the real error so other causes are not hidden
    print ("field COL_A does not exist")
    print (e)

In [13]:
### This section alters the type of each field in the survey master table.
### Domain related fields should be integer, in order to enforce coded values.
##
## The target type is chosen by the first matching rule:
##   TEXT   : fields listed in text_type_fields
##   LONG   : crp_main and fields listed in range_type_fields (i.e. currencies)
##   DOUBLE : fields listed in double_type_fields (i.e. weight)
##   DATE   : fields listed in date_type_fields (i.e. survey date)
##   SHORT  : all other fields (simple code values)

GDB_survey_table = os.path.join(output_gdb,"hh_master_table")
field_names = [f.name for f in arcpy.ListFields(GDB_survey_table)]
counter = 0
for field in field_names:
    if field not in field_names_list:
        continue #table fields that are not in field_names_list are left untouched
    counter +=1
    print("%s) changing type for field %s" % (counter,field))

    #resolve the target type first, so that only the geoprocessing call sits inside the try block
    if field in text_type_fields:
        new_field_type = "TEXT"
    elif field == 'crp_main' or field in range_type_fields: #i.e. currencies
        new_field_type = "LONG"
    elif field in double_type_fields: #i.e. weight
        new_field_type = "DOUBLE"
    elif field in date_type_fields: #i.e. survey date
        new_field_type = "DATE"
    else: #all other fields are short integer (simple  code values)
        new_field_type = "SHORT"

    try:
        arcpy.management.AlterField(GDB_survey_table, field, field_type = new_field_type)
    except Exception as e:
        #keep going with the remaining fields, but do not hide the failure
        print("   could not change field %s to type %s: %s" % (field, new_field_type, e))

1) changing type for field survey_id
2) changing type for field operator_id
3) changing type for field adm0_name
4) changing type for field adm0_iso3
5) changing type for field adm1_pcode
6) changing type for field adm1_name
7) changing type for field adm2_pcode
8) changing type for field adm2_name
9) changing type for field adm3_pcode
10) changing type for field adm3_name
11) changing type for field survey_date
12) changing type for field survey_date_text
13) changing type for field round
14) changing type for field total_case_duration
15) changing type for field resp_age
16) changing type for field resp_age_rng
17) changing type for field resp_gender
18) changing type for field hh_agricactivity
19) changing type for field hh_gender
20) changing type for field hh_education
21) changing type for field hh_wealth_water
22) changing type for field hh_wealth_toilet
23) changing type for field hh_wealth_light
24) changing type for field hh_residencetype
25) changing type for field hh_size
2

348) changing type for field need_fertilizers
349) changing type for field need_pesticides
350) changing type for field need_tools
351) changing type for field need_accesstoirrigationwater
352) changing type for field need_accesstoland
353) changing type for field need_animalfeed
354) changing type for field need_veterinaryservices
355) changing type for field need_veterinaryinputs
356) changing type for field need_animalsalemingarantdprice
357) changing type for field need_restockinganimals
358) changing type for field need_supportforprocessprod
359) changing type for field need_supptransofanimalsorprod
360) changing type for field need_acstomechanisedequipprod
361) changing type for field need_marketingsupport
362) changing type for field need_cashassistance
363) changing type for field need_loans
364) changing type for field need_storageequipmentorfaci
365) changing type for field need_techsupporextensionserv
366) changing type for field need_landrehabilitation
367) changing type fo

In [14]:
def create_and_apply_domain(master_table, domain_table, field_name, domain_already_exists):
    """Assign the domain '<domain_table>_dom' to a field, creating and filling the domain first if needed.

    The domain lives in the module-level ``output_gdb`` geodatabase.

    Args:
        master_table: table that holds the field receiving the domain.
        domain_table: table with 'code' and 'label' columns; its name also gives the
            domain name ('<domain_table>_dom').
        field_name: field of ``master_table`` the domain is assigned to.
        domain_already_exists: when True, skip creating and filling the domain and only assign it.

    Failures while creating the domain or adding a coded value are printed and do not
    stop the function; a failure of the final assignment propagates to the caller.
    """
    domName = '%s_dom' % domain_table
    if not domain_already_exists:
        # Process: Create the coded value domain (LONG for crp_main, SHORT for every other table)
        field_type = "LONG" if domain_table == 'crp_main' else "SHORT"
        try:
            arcpy.CreateDomain_management(output_gdb, domName, field_type= field_type)
        except Exception as e:
            print(e)

        # Store all the domain values in a dictionary with the domain code as the "key"
        # and the domain description as the "value" (domDict[code])
        #   dict example:  {"CI":"Cast iron", "DI": "Ductile iron", "PVC": "PVC",
        #                "ACP": "Asbestos concrete", "COP": "Copper"}
        domDict = make_attribute_dict(domain_table, 'code', 'label')

        # Process: Add every code/label pair to the domain
        for code in domDict:
            try:
                arcpy.AddCodedValueToDomain_management(output_gdb, domName, code, domDict[code])
            except Exception as e:
                # report which value failed and why, then carry on with the remaining codes
                print("AddCodedValueToDomain_management failed for domain %s, code %s: %s" % (domName, code, e))

    arcpy.AssignDomainToField_management(master_table, field_name, domName)

In [15]:
##This section of the script searches for all domain tables in the GDB.
##For each table it creates a new domain, adds every code of the table to it,
##and finally assigns the domain to the field that has the same name as the table.

# Set the current workspace
arcpy.env.workspace = output_gdb

# Get the list of tables in the workspace
tables = arcpy.ListTables()
master_table = os.path.join(output_gdb,'hh_master_table')

print("---APPLYING DOMAINS FOR SINGLE CHOICE FIELDS---")
counter = 0
for table in tables:
    if 'master_table' not in table and table not in useless_fields and  'derived_fields' not in table:
        #print(table)
        if table == "yes_no":
            yes_no_table_for_later = table ##we dont have a yes_no field to which apply the domain
        else:
            counter += 1
            print ("%s) Creating and Applying (new) domain for field %s" % (counter, table))
            create_and_apply_domain(master_table, table, table, False) #domain_table has the same name of the field where it should be applied
            if table == 'crp_main':
                #the crp_main domain was created by the call just above, so it is only assigned here;
                #asking for it to be created again would just repeat the domain creation and the loading of its codes
                print ("%s) Applying existing crp_main domain for field crp_main_cc" % (counter))
                create_and_apply_domain(master_table, table, 'crp_main_cc', True) #same domains should be applied to this field (derived, calculated by Amandine)



---APPLYING DOMAINS FOR SINGLE CHOICE FIELDS---
1) Creating and Applying (new) domain for field resp_gender
2) Creating and Applying (new) domain for field hh_agricactivity
3) Creating and Applying (new) domain for field hh_gender
4) Creating and Applying (new) domain for field hh_education
5) Creating and Applying (new) domain for field hh_wealth_water
6) Creating and Applying (new) domain for field hh_wealth_toilet
7) Creating and Applying (new) domain for field hh_wealth_light
8) Creating and Applying (new) domain for field hh_residencetype
9) Creating and Applying (new) domain for field hh_maritalstat
10) Creating and Applying (new) domain for field hh_age
11) Creating and Applying (new) domain for field income_main
12) Creating and Applying (new) domain for field income_main_comp
13) Creating and Applying (new) domain for field income_sec
14) Creating and Applying (new) domain for field income_sec_comp
15) Creating and Applying (new) domain for field income_third
16) Creating and 

In [16]:
print("---APPLYING DOMAINS FOR MULTIPLE CHOICE DERIVED FIELDS---")
#the YES-NO domain is created by the first call only; every later call reuses it
yes_no_domain_created = False
counter = 0
for derived_field in all_derived_fieldnames:
    if derived_field in all_answers_with_other_option:
        continue #'other' fields, are strings without domains #changed!
    counter +=1
    reuse_existing_domain = yes_no_domain_created
    if reuse_existing_domain:
        print ("%s) Applying existing YES-NO domain for derivate field %s" % (counter,derived_field))
    else:
        print ("%s) Creating and applying new YES-NO domain (table %s) for derivate field %s" % (counter, yes_no_table_for_later,derived_field))
    create_and_apply_domain(master_table, yes_no_table_for_later, derived_field, reuse_existing_domain)
    yes_no_domain_created = True
            

---APPLYING DOMAINS FOR MULTIPLE CHOICE DERIVED FIELDS---
1) Creating and applying new YES-NO domain (table yes_no) for derivate field covid_goodstransp
2) Applying existing YES-NO domain for derivate field covid_marketclosed
3) Applying existing YES-NO domain for derivate field covid_borderclosed
4) Applying existing YES-NO domain for derivate field covid_stayhome
5) Applying existing YES-NO domain for derivate field covid_gatherings
6) Applying existing YES-NO domain for derivate field covid_processclosed
7) Applying existing YES-NO domain for derivate field covid_none
8) Applying existing YES-NO domain for derivate field covid_dk
9) Applying existing YES-NO domain for derivate field covid_ref
10) Applying existing YES-NO domain for derivate field shock_noshock
11) Applying existing YES-NO domain for derivate field shock_sicknessordeathofhh
12) Applying existing YES-NO domain for derivate field shock_lostemplorwork
13) Applying existing YES-NO domain for derivate field shock_otherint

In [17]:
#assign the already created YES-NO domain to every '_other' field of the select-all-that-apply questions
counter = 0
for counter, other_field in enumerate(other_fields_multiple_choice_yes_no_domain, start=1):
    print ("%s) Applying existing YES-NO domain for ''other'' field %s" % (counter,other_field))
    create_and_apply_domain(master_table, yes_no_table_for_later, other_field, True)
    
        
        

        

1) Applying existing YES-NO domain for ''other'' field covid_other
2) Applying existing YES-NO domain for ''other'' field crp_seed_other
3) Applying existing YES-NO domain for ''other'' field crp_proddif_other
4) Applying existing YES-NO domain for ''other'' field crp_saledif_other
5) Applying existing YES-NO domain for ''other'' field crp_proc_mach_other
6) Applying existing YES-NO domain for ''other'' field crp_proc_owner_other
7) Applying existing YES-NO domain for ''other'' field ls_num_inc_dec_other
8) Applying existing YES-NO domain for ''other'' field ls_food_supply_other
9) Applying existing YES-NO domain for ''other'' field ls_proddif_other
10) Applying existing YES-NO domain for ''other'' field ls_salesdif_other
11) Applying existing YES-NO domain for ''other'' field ls_proc_other
12) Applying existing YES-NO domain for ''other'' field ls_proc_fac_other
13) Applying existing YES-NO domain for ''other'' field fish_proddif_other
14) Applying existing YES-NO domain for ''other''

In [18]:
##APPLYING DOMAINS FOR INDICATORS (variables added during the analysis phase and not present in the questionnaire)

#print(indicators_fields) #at the end, this list is used only for copypasting and create the following dictionary

#indicator field -> list of values; only the first and the last item are used, as the min and max of the range domain
indicators_domains_dict = {
    'rcsi_less_preferred_foods': [0,1,2,3,4,5,6,7],
    'rcsi_borrowed_food': [0,1,2,3,4,5,6,7],
    'rcsi_limit_portions': [0,1,2,3,4,5,6,7],
    'rcsi_restrict_adult_consumption': [0,1,2,3,4,5,6,7],
    'rcsi_reduce_number_meals': [0,1,2,3,4,5,6,7],
    'rcsi_score': [0,56],
    'rcsi_class': [0,1,2,3],
    'fies_rawscore': [0,1,2,3,4,5,6,7,8],
    'lcsi': [0,1,2,3],
    'hdds_score': [0,12],
    'hdds_class': [1,2,3],
    'hhs': [0,6],
    'hhg': [0,5],
    'fcs': [0,112],
    'fcg': [1,2,3],
    'fcs_cereal_days': [0,7],
    'fcs_tubers_days': [0,7],
    'fcs_staple_days': [0,7],
    'fcs_pulses_days': [0,7],
    'fcs_fruit_days': [0,7],
    'fcs_meat_fish_days': [0,7],
    'fcs_dairy_days': [0,7],
    'fcs_sugar_days': [0,7],
    'fcs_oil_days': [0,7],
    'fcs_cereal_source': [0,10],
    'fcs_tubers_source': [0,10],
    'fcs_staple_source': [0,10],
    'fcs_pulses_source': [0,10],
    'fcs_fruit_source': [0,10],
    'fcs_meat_fish_source': [0,10],
    'fcs_dairy_source': [0,10],
    'fcs_sugar_source': [0,10],
    'fcs_oil_source': [0,10],
}


for field, rangedomain in indicators_domains_dict.items():
    domName = field + "_dom"
    range_min = rangedomain[0]
    range_max = rangedomain[-1]
    try:
        # Process: Create the range domain
        print("CreateDomain for derived field %s" % field)
        arcpy.CreateDomain_management(output_gdb, domName, field_type = "SHORT", domain_type = "RANGE")

        # Process: Set the minimum and maximum values for the range domain
        arcpy.SetValueForRangeDomain_management(output_gdb, domName, range_min, range_max)

        # Process: Assign the range domain to the indicator field of the master table
        arcpy.AssignDomainToField_management(master_table, field, domName)
    except Exception as e:
        # a failure on one indicator is reported and the loop moves on to the next one
        print(e)
        print("Domain creation failed for derived field %s" % field)



CreateDomain for derived field rcsi_less_preferred_foods
CreateDomain for derived field rcsi_borrowed_food
CreateDomain for derived field rcsi_limit_portions
CreateDomain for derived field rcsi_restrict_adult_consumption
CreateDomain for derived field rcsi_reduce_number_meals
CreateDomain for derived field rcsi_score
CreateDomain for derived field rcsi_class
CreateDomain for derived field fies_rawscore
CreateDomain for derived field lcsi
CreateDomain for derived field hdds_score
CreateDomain for derived field hdds_class
CreateDomain for derived field hhs
CreateDomain for derived field hhg
CreateDomain for derived field fcs
CreateDomain for derived field fcg
CreateDomain for derived field fcs_cereal_days
CreateDomain for derived field fcs_tubers_days
CreateDomain for derived field fcs_staple_days
CreateDomain for derived field fcs_pulses_days
CreateDomain for derived field fcs_fruit_days
CreateDomain for derived field fcs_meat_fish_days
CreateDomain for derived field fcs_dairy_days
Crea

In [19]:
# Announce the step, then run the AddGlobalIDs geoprocessing tool on the survey master table.
print("Adding Global ID")
arcpy.AddGlobalIDs_management(master_table)

Adding Global ID


In [20]:
# Export the master table to a timestamped CSV file in the temp folder
master_table_csv_name = "hh_master_table_%s.csv" % now
arcpy.TableToTable_conversion(master_table, temp_path, master_table_csv_name)

In [21]:
# Report the wall-clock time elapsed since startTime was recorded
elapsed_time = datetime.now() - startTime
print("Execution time: ", elapsed_time)

Execution time:  0:26:59.485545
