In [1]:
import pandas as pd
import xlrd
import arcpy
from pandas import ExcelWriter
from datetime import datetime
import os

In [2]:
# Run-wide configuration: timestamps, input/output locations, the shared Excel writer and test limits.
startTime = datetime.now()
now = datetime.now().strftime('%Y%m%d%H%M%S')  # timestamp suffix used in every output file name
temp_path = r'C:\temp\data_processing_exports'
# The export folder must exist before anything is written into it (coded-values workbook, file GDB, ...)
os.makedirs(temp_path, exist_ok=True)
#questionnaire_file = r'C:\git\hh_survey\household_questionnaire_geopoll_202105.xlsx' #the questionnaire file that we use for creating the table
questionnaire_file = r'C:\git\hh_survey\household_questionnaire_geopoll_EN_template_20210707_ISO3.xlsx'
coded_values_file = os.path.join(temp_path, "coded_values_%s.xlsx" % now) #intermediary output file with all categories and codes extracted from the questionnaire
writer = pd.ExcelWriter(coded_values_file, engine='xlsxwriter')
field_names_list = []
max_counter = 3000 #for testing purposes, we may need to limit the execution only to some items

In [3]:
def importallsheets(in_excel, out_gdb, max_sheets=None):
    """Import the sheets of an Excel workbook into a geodatabase, one table per sheet.

    Each output table is named after its sheet, after the sheet name has been
    passed through ``arcpy.ValidateTableName`` for ``out_gdb``.

    Parameters
    ----------
    in_excel : str
        Path of the workbook to import.
    out_gdb : str
        Path of the geodatabase that receives the tables.
    max_sheets : int, optional
        Maximum number of sheets to convert, in workbook order. When omitted,
        the notebook-level ``max_counter`` is used (the original behaviour).
    """
    if max_sheets is None:
        max_sheets = max_counter
    print("max_counter: %s" % max_sheets)

    # Sheet names are read through pandas rather than xlrd: xlrd 2.x refuses
    # .xlsx files, while pandas selects whichever Excel engine is installed.
    with pd.ExcelFile(in_excel) as workbook:
        sheets = list(workbook.sheet_names)

    print('{} sheets found: {}'.format(len(sheets), ','.join(sheets)))
    # Only the first `max_sheets` sheets are converted (none if the limit is <= 0)
    for sheet in sheets[:max(max_sheets, 0)]:
        # The out_table name is the validated sheet name inside out_gdb
        out_table = os.path.join(
            out_gdb,
            arcpy.ValidateTableName(
                "{0}".format(sheet),
                out_gdb))

        print('Converting {} to {}'.format(sheet, out_table))

        # Perform the conversion
        arcpy.ExcelToTable_conversion(in_excel, out_table, sheet)
            

def make_attribute_dict(fc, code_field, value_field):
    """Return a ``{code: value}`` dictionary read from two fields of a table.

    ``fc`` is handed to ``arcpy.da.SearchCursor`` together with the two field
    names; every row contributes one entry, and a code that appears more than
    once keeps the value of its last row.
    """
    lookup = {}
    with arcpy.da.SearchCursor(fc, [code_field, value_field]) as rows:
        lookup.update((code, value) for code, value in rows)
    return lookup

def fix_category_formatting(category):
    """Standardize the formatting of a category description.

    Steps, in order: square brackets become parentheses, "(specify)" is
    removed, "/ " becomes ", ", the text is capitalized (first character
    upper-case, the rest lower-case), the acronyms ADPs / IDP / COVID are
    restored, " , " is collapsed to ", " and the typo "staplec" is corrected.

    Parameters
    ----------
    category : str
        Raw category text.

    Returns
    -------
    str
        The cleaned-up description.
    """
    text = (category.replace("[", "(")
                    .replace("]", ")")
                    .replace("(specify)", "")
                    .replace("/ ", ", ")
                    .capitalize())
    for lowered, acronym in (("adps", "ADPs"), ("idp", "IDP"), ("covid", "COVID")):
        text = text.replace(lowered, acronym)
        # capitalize() upper-cases the first letter, so an acronym that opens the
        # text ("Covid ...") is missed by the lower-case replacement above.
        leading = lowered.capitalize()
        if text.startswith(leading):
            text = acronym + text[len(leading):]
    return text.replace(" , ", ", ").replace("staplec", "staple")

def insert_element_in_list_after_element(old_list, new_item, after_item):
    """Return a copy of ``old_list`` with ``new_item`` placed after each ``after_item``.

    The input list is not modified. ``new_item`` is inserted after every
    element equal to ``after_item``; if there is none, the copy is unchanged.
    """
    result = []
    for element in old_list:
        if element == after_item:
            result.extend((element, new_item))
        else:
            result.append(element)
    return result

In [4]:
# Coded values for the survey language: codes are assigned 1..30 in the order listed below.
language_domains = [
    [code, language]
    for code, language in enumerate(
        [
            'English', 'French', 'Spanish', 'Arabic', 'Portuguese',
            'Dari', 'Pashto', 'Sangho', 'Shona', 'Ndebele',
            'Swahili', 'Tshiluba', 'Lingala', 'Kikongo', 'Fula',
            'Songhai', 'Dogon', 'Hausa', 'Zarma', 'Kanouri',
            'Somali', 'Xichangana', 'Xichope', 'Ndau', 'Xicena',
            'Machuabo', 'Chichewa', 'Lomwe', 'Macua', 'Makonde',
        ],
        start=1,
    )
]

In [5]:
##this section of the script reads the survey excel file and creates an excel file with multiple sheets:
### each sheet contains the coded value and description for a "Single choice" or "Open Ended-Select All That Apply" question.
###moreover, it creates and populates several lists that will be used later for defining each field of the final table names, types and domains

print("Opening questionnaire DF")
# The path is passed directly so pandas opens and closes the file itself (no dangling open() handle)
quest_df = pd.read_excel(questionnaire_file, sheet_name='survey', skiprows=2)
#create a list of all possible numbering
numbering = ["%s)" % n for n in range(1,200)] ## 1), 2), ... 199)

# "Select All That Apply" style questions are expanded into one derived field per option
select_all_question_types = ("Open Ended-Select All That Apply", "Select All That Apply", "Open Ended - Select All That Apply ")
# question types whose answers are pre-defined categories (these need domains)
coded_question_types = ("StartRecording", "Single Choice", "Open Ended-Single Choice", "Open Ended - Single Choice") + select_all_question_types

# initialize list of lists that will store the results
dict_derived_fieldnames = {} #this dict will group all derived fields in case of "Select All That Apply" type of questions
field_names_list = [] ##this list will contain all fields of the final table
text_type_fields = [] ##this list will contain all fields of the final table with TEXT type
range_type_fields = [] ##this list will contain all fields of the final table storing RANGE data  (will be LONG type)
double_type_fields = [] ##this list will contain all fields of the final table storing DOUBLE data
##iterate the following for each row (so each question of the questionnaire)
all_derived_fieldnames = []
all_answers_with_other_option = [] ##this list will contain all "Other: specify" fields in Select all that apply type of questions
single_choice_questions_with_other_categories = [] ##this list will contain all "Other: specify" fields in single choice type of questions
quest_df = quest_df[quest_df['Suggested Qname'].notna()]
for index, row in quest_df.iterrows():
    try:
        first_derived_fieldname = "" #the name of the first derived field will be the main of the domain table
        derived_fieldnames = [] #derived fields of THIS question only
        codes_and_labels = []
        categories = str(row['English']).replace("\t","")
        question_name = row['Suggested Qname'].strip()  #Q Name

        #renaming some fields
        if question_name == 'language2':
            question_name = 'language'
        if question_name == 'opt_in_date':
            question_name = 'survey_date_time'

        question_type = row['Q Type']
        programming_instructions = row['Programming Instructions'] #this field contains coded values for crop_main
        print("\n\n----%s----" % question_name)
        #only for questions with pre-defined categories need domains
        if question_type in coded_question_types:
            if question_name == 'crp_main': #for this question only, coded values should be taken from field programming_instructions
                for programming_line in programming_instructions.splitlines():
                    if ")" in programming_line:
                        # split on the FIRST ")" only, so a label that itself contains ")" no longer
                        # raises and silently drops the whole question; code and label are trimmed
                        # the same way the labels of every other question are
                        code, label = programming_line.split(")", 1)
                        category = fix_category_formatting(label.strip())
                        codes_and_labels.append([code.strip(), category])
            else:
                #find all numbering present in the category string
                numbering_in_text = [n for n in numbering if n in categories]
                ##the following loop creates a list "codes_and_labels" with all available codes&labels for each question
                for position, marker in enumerate(numbering_in_text):
                    start = categories.find(marker) + len(marker)
                    if position + 1 < len(numbering_in_text):
                        end = categories.find(numbering_in_text[position + 1])
                        substring = categories[start:end].strip()
                    else:
                        # the last option runs until the end of the string
                        substring = categories[start:].strip()
                    category = fix_category_formatting(substring)
                    # codes follow the position of the option, starting at 1
                    codes_and_labels.append([position + 1, category])
                print("codes_and_labels: ", codes_and_labels)
                if codes_and_labels == [[1, 'Yes'], [2, 'No'], [3, "Don't know"], [4, 'Refused']]:
                    #in order to accomodate Amandine request, Yes No questions need to be reclassified
                    codes_and_labels = [[0, 'No'], [1, 'Yes'], [2, "Don't know"], [3, 'Refused']]
                if question_name == 'language': #domains for languages are not included in the questionnaire
                    codes_and_labels = language_domains

            if question_type not in select_all_question_types:
                #so questions with NO derived fields
                field_names_list.append(question_name.strip())
                codes_and_labels_df = pd.DataFrame(codes_and_labels, columns=['code', 'label'])
                codes_and_labels_df.to_excel(writer, sheet_name=question_name)
                ##check if one of the option is other -> the question need 1 additional field for keeping the specified text
                if any(s[1].lower().strip() == "other" for s in codes_and_labels):
                    single_choice_derived_other_field = "%s_otherspecify" % question_name.strip()
                    single_choice_questions_with_other_categories.append(single_choice_derived_other_field)
                    field_names_list.append(single_choice_derived_other_field)

            else:
                #so questions with derived fields: the Q name lists them as "1) name_a 2) name_b ..."
                numbering_in_qname = [n for n in numbering if n in question_name]
                for position, marker in enumerate(numbering_in_qname):
                    start = question_name.find(marker) + len(marker)
                    if position + 1 < len(numbering_in_qname):
                        end = question_name.find(numbering_in_qname[position + 1])
                        derived_field_name = question_name[start:end].strip()
                    else:
                        # the last derived field runs until the end of the string
                        derived_field_name = question_name[start:].strip()
                    all_derived_fieldnames.append(derived_field_name)
                    derived_fieldnames.append(derived_field_name)
                    field_names_list.append(derived_field_name)
                    if derived_field_name[-6:] == "_other": ##this field will need to be STRING - with no domain (since it's a 'other specify')
                        all_answers_with_other_option.append(derived_field_name)
                    if position == 0:
                        first_derived_fieldname = derived_field_name
                        #we don't need domain table for derived fields, since they will use YES NOT domain table
                # group THIS question's derived fields under its first derived field
                # (previously the cumulative all_derived_fieldnames list was stored under every key)
                if first_derived_fieldname:
                    dict_derived_fieldnames[first_derived_fieldname] = derived_fieldnames
        elif question_type == "Range":
            #these questions will be associated to LONG type fields
            field_names_list.append(question_name.strip())
            range_type_fields.append(question_name)
        elif question_type == "Open Ended":
            #these questions will be associated to TEXT type fields
            if not pd.isnull(question_name): #NaN rows we want to skip (i.e. OptIn question without a name in the survey)
                field_names_list.append(question_name)
                text_type_fields.append(question_name)
        else:
            print("QUESTION SKIPPED ---------", question_name, question_type)
    except Exception as exc:
        # keep processing the remaining questions, but say which one failed and why
        print("failed for some reasons: question %r -> %r" % (row.get('Suggested Qname'), exc))

#adding Yes No table (for derived fields domain)
d = {1:"Yes",0:"No"}
yesno_df = pd.DataFrame(list(d.items()), columns=['code', 'label'])
yesno_df.to_excel(writer, sheet_name='yes_no')
print("Saving codes and labels %s" % coded_values_file)

#creating additional sheet with all derived fields, that will be used for a script that ensures that also these values are within the domains in the output table, in a later stage
list_of_yes_no_fields = [
    derived_field
    for derived_field in all_derived_fieldnames
    if derived_field not in all_answers_with_other_option
]

derived_fields_df = pd.DataFrame(list_of_yes_no_fields)
derived_fields_df.to_excel(writer, sheet_name='derived_fields')

# Close the Pandas Excel writer and output the Excel file.
# close() writes the workbook in every pandas version; save() was removed in pandas 2.0.
writer.close()

Opening questionnaire DF


----calldispo----
codes_and_labels:  [[1, 'Someone answers'], [2, 'Answering machine'], [3, 'No answer'], [4, 'Hang up or refusal'], [5, 'Call back'], [6, 'Under review'], [7, 'Disconnected']]


----calldispo_answeringmachine----
QUESTION SKIPPED --------- calldispo_answeringmachine nan


----calldispo_noanswer----
QUESTION SKIPPED --------- calldispo_noanswer nan


----calldispo_underreview----
QUESTION SKIPPED --------- calldispo_underreview nan


----calldispo_disconnected----
QUESTION SKIPPED --------- calldispo_disconnected nan


----resp_language----
codes_and_labels:  [[1, 'English'], [2, 'Other options (add as many as necessary)']]


----introduction----
codes_and_labels:  [[1, 'Continue']]


----resp_agree----
codes_and_labels:  [[1, 'Yes'], [2, 'Not now but another time in the week'], [3, 'No']]


----resp_refusalwhy----
codes_and_labels:  [[1, 'Not interested'], [2, 'Do not want to be recorded'], [3, 'Other ']]


----resp_whencallback----


----cal

codes_and_labels:  [[1, 'Farmer, production and sale of staple crops'], [2, 'Farmer, production and sale of vegetable or fruit'], [3, 'Farmer, production and sale of cashcrops'], [4, 'Farmer, production and sale of livestock and livestock products'], [5, 'Farmer, production and sale of fish'], [6, 'Collection and sale of forestry or bush products'], [7, 'Informal agricultural trade excluding producers'], [8, 'Formal agricultural trade excluding producers'], [9, 'Daily wage on farms and other casual employment in agricultural sector'], [10, 'Stable employment in agricultural sector'], [11, 'Non-agricultural self-employed or liberal profession, doctor, architect, lawyer, including restaurant'], [12, 'Off-farm daily wages and other non-agricultural casual employment'], [13, 'Stable employment in non-agricultural sector'], [14, 'Public employment'], [15, 'Income not derived from work, charity'], [16, 'Income not derived from work, welfare transfer, pension, humanitarian aid'], [17, 'Income

codes_and_labels:  [[1, 'Mill'], [2, 'Thresher'], [3, 'Drying facilities'], [4, 'None'], [5, 'Other '], [6, "Don't know"], [7, 'Refused']]


----crp_proc_owner_

1) crp_proc_owner_yours
2) crp_proc_owner_farmin
3) crp_proc_owner_farmout
4) crp_proc_owner_millin
5) crp_proc_owner_millout
6) crp_proc_owner_other
7) crp_proc_owner_dk
8) crp_proc_owner_ref----
codes_and_labels:  [[1, 'Yourself'], [2, 'Farm cooperative within community'], [3, 'Farm cooperative outside of community'], [4, 'Private shop or mill within community'], [5, 'Private mill outside community'], [6, 'Other '], [7, "Don't know"], [8, 'Refused']]


----crp_proc_state----
codes_and_labels:  [[1, 'Yes'], [2, 'No'], [3, "Don't know"], [4, 'Refused']]


----ls_main----
codes_and_labels:  [[1, 'Cattle (cow, beef, veal, yak, buffalo)'], [2, 'Goats'], [3, 'Sheeps'], [4, 'Swine'], [5, 'Equine (donkey, horse, etc)'], [6, 'Small domesticated mammals (rabbits, minks, guinea pigs, etc)'], [7, 'Poultry (chicken, guineafowl, duck, etc

codes_and_labels:  [[1, 'Fresh fish (any kind)'], [2, 'Dry fish (any kind)'], [3, 'Smoked fish'], [4, 'Seafood (any kind)'], [5, 'Other '], [6, "Don't know"], [7, 'Refused']]


----fish_salesdif_1----
codes_and_labels:  [[1, 'Yes'], [2, 'No'], [3, "Don't know"], [4, 'Refused']]


----fish_saledif_

1) fish_saledif_smallerprofits
2) fish_saledif_damageandlosses
3) fish_saledif_lowdemand
4) fish_saledif_lowprices
5) fish_saledif_processing
6) fish_saledif_other
7) fish_saledif_dk
8) fish_saledif_ref----
codes_and_labels:  [[1, 'Higher marketing costs (such as transportation costs, fuel costs, etc)'], [2, 'Damage and losses due to delay or inability to physically access markets (including limited storage capacity and closure of markets)'], [3, 'Usual traders or local customers are not buying as much as usual'], [4, 'Prices are too low'], [5, 'Difficulties to process product (lack of access to processing inputs, equipment, etc)'], [6, 'Other '], [7, "Don't know"], [8, 'Refused']]


----fis

codes_and_labels:  [[1, 'Yes'], [2, 'No - because it wasn’t necessary'], [3, 'No - because you already sold those assets or did this activity within the last 12 months and you cannot continue to do it'], [4, 'Not applicable'], [5, "Don't know"], [6, 'Refused']]


----hdds

1) hdds_cereals
2) hdds_rootstubers
3) hdds_vegetables
4) hdds_fruits
5) hdds_meat
6) hdds_eggs
7) hdds_fish
8) hdds_legumes
9) hdds_milkdairy
10) hdds_oils
11)hdds_sugar
12) hdds_condiments----
codes_and_labels:  [[1, 'Cereals (sorghum, millet, corn, wheat, rice, spaghetti, bread or insert any other locally available grain)'], [2, 'Roots and tubers (white sweet potato, potato, yam, cassava or other tubers)'], [3, 'Vegetables (all)'], [4, 'Fruits (all)'], [5, 'Meat, poultry and offal (beef, pork, lamb, goat, rabbit, wild game, chicken, duck, liver, kidney, heart, or other organ meats)'], [6, 'Eggs'], [7, 'Fish or shellfish (fresh or dried fish, shellfish)'], [8, 'Legumes and nuts (beans, peas, peanut, lentils, almond

In [6]:
### Create a new, timestamped file geodatabase inside the export folder
gdb_name = "fGDB_with_coded_values_{}.gdb".format(now)
arcpy.CreateFileGDB_management(temp_path, gdb_name)
# full path of the geodatabase just created, used by the following cells
output_gdb = os.path.join(temp_path, gdb_name)

In [7]:
# print(single_choice_derived_other_field )
# print(single_choice_questions_with_other_categories)
# print(all_answers_with_other_option)


In [8]:
## Here we import each sheet of the coded-values workbook (codes and descriptions) into the GDB as a table.
### It can take up to 5 seconds per table - and we normally process hundreds of tables.
#### For this reason, for testing purposes, lower the variable max_counter defined in the configuration cell of this notebook.


importallsheets(coded_values_file, output_gdb)

max_counter: 3000
89 sheets found: calldispo,resp_language,introduction,resp_agree,resp_refusalwhy,resp_gender,hh_admin1,hh_admin2_1,hh_admin2_2,quotareached,hh_agricactivity,hh_gender,hh_education,hh_wealth_water,hh_wealth_toilet,hh_wealth_light,hh_residencetype,hh_maritalstat,hh_age,income_main,income_main_amount_conf,income_main_comp,income_sec,income_sec_amount_conf,income_sec_comp,income_third,income_third_amount_conf,income_third_comp,crp_main,crp_landsize,crp_landright,crp_irrigation,crp_area_change,crp_harv_change,crp_proddif,crp_salesdif,crp_salesprice,crp_proc,crp_proc_state,ls_main,ls_proddif,ls_salesmain,ls_salesdif,ls_salesprice,ls_proc,ls_proc_fac_state,fish_change,fish_proddif,fish_salesmain,fish_salesdif_1,fish_salesprice,fies,fies_worried,fies_healthy,fies_fewfoods,fies_skipped,fies_ateless,fies_ranout,fies_ranout_hhs,fies_hungry,fies_hungry_hhs,fies_whlday,fies_whlday_hhs,copingstrategies,cs_stress_hh_assets,cs_stress_spent_savings,cs_stress_sold_more_animals,cs_stres

Converting fies_ranout_hhs to C:\temp\data_processing_exports\fGDB_with_coded_values_20210708151543.gdb\fies_ranout_hhs
Converting fies_hungry to C:\temp\data_processing_exports\fGDB_with_coded_values_20210708151543.gdb\fies_hungry
Converting fies_hungry_hhs to C:\temp\data_processing_exports\fGDB_with_coded_values_20210708151543.gdb\fies_hungry_hhs
Converting fies_whlday to C:\temp\data_processing_exports\fGDB_with_coded_values_20210708151543.gdb\fies_whlday
Converting fies_whlday_hhs to C:\temp\data_processing_exports\fGDB_with_coded_values_20210708151543.gdb\fies_whlday_hhs
Converting copingstrategies to C:\temp\data_processing_exports\fGDB_with_coded_values_20210708151543.gdb\copingstrategies
Converting cs_stress_hh_assets to C:\temp\data_processing_exports\fGDB_with_coded_values_20210708151543.gdb\cs_stress_hh_assets
Converting cs_stress_spent_savings to C:\temp\data_processing_exports\fGDB_with_coded_values_20210708151543.gdb\cs_stress_spent_savings
Converting cs_stress_sold_more

In [9]:
###it reads the survey file and creates an excel file with all field names of the outputs table.
###then it imports the empty output table template to the GDB, and calls it master_table. this table will be the future master table
###coded values will be enforced there.
#insert opening fields that come from Geopoll and are not captured by the questionnaire file
opening_fields = [["survey_id",'TEXT'],["operator_id",'TEXT'],["adm0_name",'TEXT'],["adm0_ISO3",'TEXT'],["adm1_pcode",'TEXT'], ["adm1_name",'TEXT'],["adm2_pcode",'TEXT'],["adm2_name",'TEXT'],
["survey_created_date",'TEXT'],["opt_in_date",'TEXT'],["total_case_duration",'TEXT'],["weight",'DOUBLE']]


#manage opening fields type
opening_fields_names_text = [i[0] for i in opening_fields if i[1] == 'TEXT']
opening_fields_names_short = [i[0] for i in opening_fields if i[1] == 'SHORT']
opening_fields_names_range = [i[0] for i in opening_fields if i[1] == 'RANGE']
opening_fields_names_date = [i[0] for i in opening_fields if i[1] == 'DATE']
opening_fields_names_double = [i[0] for i in opening_fields if i[1] == 'DOUBLE']

# NOTE: this cell extends lists built by the questionnaire-parsing cell in place;
# re-run that cell first if this one has to be executed again, otherwise entries are duplicated.
text_type_fields += opening_fields_names_text + single_choice_questions_with_other_categories
#text_type_fields += all_answers_with_other_option
range_type_fields += opening_fields_names_range
double_type_fields += opening_fields_names_double

opening_field_names = [i[0] for i in opening_fields]
field_names_list = opening_field_names + field_names_list

#some fields in the questionnaire should be removed from the master table
useless_fields = ['resp_agree', 'callbackmessage_en', 'hh_admin1', 'calldispo', 'hh_admin2_1', 'hh_admin2_2', 'quotareached', 'resp_refusalwhy', 'resp_whencallback', 'fies','copingstrategies',
                 "introduction",'hdds_confirmation','income_main_amount_conf','income_sec_amount_conf','income_third_amount_conf', 'resp_language','survey_created_date']


s_useless_fields = set(useless_fields)

field_names_list = [x for x in field_names_list if x not in s_useless_fields]

#adding specific fields in specific places in the table: (new field, existing field it must follow)
#the order matters: adm3_name is anchored on adm3_pcode, which is itself inserted just before
extra_fields_after = [
    ("adm3_pcode", "adm2_name"),
    ("adm3_name", "adm3_pcode"),
    ("round", "opt_in_date"),
    ("percent", "weight"),
    ("resp_age_rng", "resp_age"),
    ("hh_size_rng", "hh_size"),
    ("hh_age_rng", "hh_age"),
    ("ls_num_diff", "ls_num_now"),
]
for new_field, anchor_field in extra_fields_after:
    field_names_list = insert_element_in_list_after_element(field_names_list, new_field, anchor_field)


#each "..._other" field gets a free-text "..._otherspecify" companion right after it
other_fields_list = ['covid_other','crp_irrigation_other','crp_landright_other','crp_proc_mach_other','crp_proc_owner_other',
                     'crp_proddif_other','crp_saledif_other','crp_seed_other','fish_inputdif_other','fish_proddif_other',
                     'fish_saledif_other','fish_salesmain_other','ls_food_supply_other','ls_main_other','ls_num_inc_dec_other',
                     'ls_proc_fac_other','ls_proc_other','ls_proddif_other','ls_salesdif_other','need_other','need_received_other']
for other_field in other_fields_list:
    field_names_list = insert_element_in_list_after_element(field_names_list, other_field + "specify", other_field)
    text_type_fields.append(other_field + "specify")

field_names_list = insert_element_in_list_after_element(field_names_list, 'crp_otherspecify', "crp_main")
text_type_fields.append("crp_otherspecify")


#declaring type of specific fields just added (if not declared, it will be integer)
text_type_fields.extend(["adm3_pcode", "adm3_name", "resp_age_rng", "hh_size_rng", "hh_age_rng"])
double_type_fields.append("percent")
double_type_fields.append("ls_num_diff") #double because it can store negative numbers

# NOTE(review): field_names_list is wrapped in an outer list exactly as before; the layout this
# produces in the workbook is what the following import cells were run against - confirm before simplifying.
survey_empty_table_df = pd.DataFrame(columns=[field_names_list])
survey_empty_table_xlsx = os.path.join(temp_path, "survey_empty_table_%s.xlsx" % now)
# The context manager writes and closes the workbook on exit; ExcelWriter.save() was removed in pandas 2.0.
with pd.ExcelWriter(survey_empty_table_xlsx, engine='xlsxwriter') as writer:
    survey_empty_table_df.to_excel(writer, sheet_name="hh_master_table")

In [19]:
# Display the final, ordered list of master-table field names as a sanity check
field_names_list

['survey_id', 'operator_id', 'adm0_name', 'adm0_ISO3', 'adm1_pcode', 'adm1_name', 'adm2_pcode', 'adm2_name', 'adm3_pcode', 'adm3_name', 'opt_in_date', 'round', 'total_case_duration', 'weight', 'percent', 'resp_refusalwhy_otherspecify', 'resp_age', 'resp_age_rng', 'resp_gender', 'hh_agricactivity', 'hh_gender', 'hh_education', 'hh_wealth_water', 'hh_wealth_toilet', 'hh_wealth_light', 'hh_residencetype', 'hh_size', 'hh_size_rng', 'hh_maritalstat', 'hh_age', 'hh_age_rng', 'income_main', 'income_main_amount', 'income_main_comp', 'income_sec', 'income_sec_amount', 'income_sec_comp', 'income_third', 'income_third_amount', 'income_third_comp', 'covid_goodstransp', 'covid_marketclosed', 'covid_borderclosed', 'covid_stayhome', 'covid_gatherings', 'covid_processclosed', 'covid_other', 'covid_otherspecify', 'covid_none', 'covid_dk', 'covid_ref', 'shock_noshock', 'shock_sicknessordeathofhh', 'shock_lostemplorwork', 'shock_otherintrahhshock', 'shock_higherfoodprices', 'shock_higherfuelprices', 'sho

In [11]:
### This section imports the Excel file just created (the empty master-table template, sheet hh_master_table) into the GDB

importallsheets(survey_empty_table_xlsx, output_gdb)


max_counter: 3000
1 sheets found: hh_master_table
Converting hh_master_table to C:\temp\data_processing_exports\fGDB_with_coded_values_20210708151543.gdb\hh_master_table


In [12]:
##there is always an unwanted field to remove
# NOTE(review): COL_A presumably comes from the index column written by DataFrame.to_excel - confirm.
master_table_path = os.path.join(output_gdb, "hh_master_table")
# Look the field up first, so "does not exist" is only reported when that is really the case
existing_field_names = [f.name for f in (arcpy.ListFields(master_table_path) or [])]
if "COL_A" in existing_field_names:
    try:
        arcpy.DeleteField_management(master_table_path, "COL_A")
    except Exception as exc:
        # e.g. a schema lock: report the real cause instead of claiming the field is missing
        print("could not delete field COL_A: %s" % exc)
else:
    print ("field COL_A does not exist")

In [13]:
### This section alters the type of each field in the survey master table
### domain-related fields should be integer, in order to enforce coded values, except crp_main

##string fields: open ended questions + crp_main
##long fields: range and numbers related questions (age, currency, hh size...)
##double fields: weight, percent, ls_num_diff
##short fields: coded values questions

GDB_survey_table = os.path.join(output_gdb,"hh_master_table")
field_names = [f.name for f in arcpy.ListFields(GDB_survey_table)]
counter = 0
for field in field_names:
    if field in field_names_list:
        counter +=1
        print("%s) changing type for field %s" % (counter,field))
        if field == 'crp_main' or field in text_type_fields: #crp_main is associated to text codes since they have possible dots (.) for subcategories
            target_type = "TEXT"
        elif field in range_type_fields: #i.e. currencies
            target_type = "LONG"
        elif field in double_type_fields: #i.e. weight
            target_type = "DOUBLE"
        else: #all other fields are short integer (simple  code values)
            target_type = "SHORT"
        try:
            arcpy.management.AlterField(GDB_survey_table, field, field_type = target_type)
        except Exception as exc:
            # keep going with the other fields, but do not hide that this one kept its old type
            print("    could not change field %s to %s: %s" % (field, target_type, exc))

1) changing type for field survey_id
2) changing type for field operator_id
3) changing type for field adm0_name
4) changing type for field adm0_ISO3
5) changing type for field adm1_pcode
6) changing type for field adm1_name
7) changing type for field adm2_pcode
8) changing type for field adm2_name
9) changing type for field adm3_pcode
10) changing type for field adm3_name
11) changing type for field opt_in_date
12) changing type for field round
13) changing type for field total_case_duration
14) changing type for field weight
15) changing type for field percent
16) changing type for field resp_refusalwhy_otherspecify
17) changing type for field resp_age
18) changing type for field resp_age_rng
19) changing type for field resp_gender
20) changing type for field hh_agricactivity
21) changing type for field hh_gender
22) changing type for field hh_education
23) changing type for field hh_wealth_water
24) changing type for field hh_wealth_toilet
25) changing type for field hh_wealth_light

176) changing type for field ls_proddif_constaccesstopasture
177) changing type for field ls_proddif_constaccesstowater
178) changing type for field ls_proddif_difaccessvetserv
179) changing type for field ls_proddif_difaccessvetinp
180) changing type for field ls_proddif_diseases
181) changing type for field ls_proddif_theftorinsecurity
182) changing type for field ls_proddif_pooraccesstolmarket
183) changing type for field ls_proddif_noaccesstocredit
184) changing type for field ls_proddif_lackoflabour
185) changing type for field ls_proddif_other
186) changing type for field ls_proddif_otherspecify
187) changing type for field ls_proddif_dk
188) changing type for field ls_proddif_ref
189) changing type for field ls_salesmain
190) changing type for field ls_salesdif
191) changing type for field ls_salesdif_smallerprofits
192) changing type for field ls_salesdif_damageandlosses
193) changing type for field ls_salesdif_lowdemand
194) changing type for field ls_salesdif_lowprices
195) c

342) changing type for field callback
343) changing type for field language


In [14]:
def create_and_apply_domain(master_table, domain_table, field_name, domain_already_exists):
    """Create (when needed) a coded-value domain and assign it to a field.

    The domain is named ``<domain_table>_dom`` and is created in the
    module-level ``output_gdb`` workspace. Its coded values come from the
    ``code`` / ``label`` columns of ``domain_table``.

    Args:
        master_table: Table whose field receives the domain.
        domain_table: Name of the lookup table holding the codes and labels;
            it also determines the domain name.
        field_name: Field of ``master_table`` the domain is assigned to.
        domain_already_exists: When true, creation and population are skipped
            and the existing domain is only assigned to ``field_name``.

    Failures while creating the domain or while adding a single coded value
    are reported on stdout and skipped (best effort). A failure of the final
    assignment is not caught and propagates to the caller.
    """
    domName = '%s_dom' % domain_table
    if not domain_already_exists:
        # 'crp_main' is the only lookup table given a TEXT domain; every
        # other table gets a SHORT domain.
        field_type = "TEXT" if domain_table == 'crp_main' else "SHORT"
        try:
            arcpy.CreateDomain_management(output_gdb, domName, field_type=field_type)
        except Exception as err:
            # Best effort: creation may fail simply because the domain is
            # already there, so keep going, but show the real reason instead
            # of assuming it. `Exception` (rather than a bare except) lets
            # KeyboardInterrupt / SystemExit stop a long run.
            print('domain %s not created (it may already exist): %s' % (domName, err))

        # Code -> label mapping read from the lookup table,
        # e.g. {1: 'Male', 2: 'Female', 3: "Don't know", 4: 'Refused'}
        domDict = make_attribute_dict(domain_table, 'code', 'label')
        print(domDict)

        # Add every code/label pair to the domain; one failing code must not
        # prevent the remaining ones from being added.
        for code, label in domDict.items():
            try:
                arcpy.AddCodedValueToDomain_management(output_gdb, domName, code, label)
            except Exception as err:
                print("AddCodedValueToDomain_management failed for code %r in domain %s: %s"
                      % (code, domName, err))

    arcpy.AssignDomainToField_management(master_table, field_name, domName)

In [15]:
# This cell walks over the tables found in the GDB. For every lookup table it
# creates a new coded-value domain, fills it with the table's code/label pairs
# and assigns it to the master-table field that has the same name as the table.

# Make the output geodatabase the current workspace
arcpy.env.workspace = output_gdb

# Tables available in the workspace, and the full path of the master table
tables = arcpy.ListTables()
master_table = os.path.join(output_gdb, 'hh_master_table')

print("---APPLYING DOMAINS FOR SINGLE CHOICE FIELDS---")
counter = 0
for table in tables:
    # Skip the master table, anything listed in useless_fields and the
    # derived-fields tables.
    if 'master_table' in table or table in useless_fields or 'derived_fields' in table:
        continue
    if table == "yes_no":
        # There is no yes_no field to apply this domain to; keep the table
        # name so it can be used later.
        yes_no_table_for_later = table
        continue
    counter += 1
    print ("%s) Creating and Applying (new) domain for field %s" % (counter, table))
    # The lookup table has the same name as the field it must be applied to
    create_and_apply_domain(master_table, table, table, False)




---APPLYING DOMAINS FOR SINGLE CHOICE FIELDS---
1) Creating and Applying (new) domain for field resp_gender
{1: 'Male', 2: 'Female', 3: "Don't know", 4: 'Refused'}
2) Creating and Applying (new) domain for field hh_agricactivity
{1: 'Yes - crop production', 2: 'Yes - livestock production', 3: 'Yes - both crop and livestock production', 4: 'No', 5: "Don't know", 6: 'Refused'}
3) Creating and Applying (new) domain for field hh_gender
{1: 'Male', 2: 'Female', 3: "Don't know", 4: 'Refused'}
4) Creating and Applying (new) domain for field hh_education
{1: 'None or did not complete primary school', 2: 'Completed primary school', 3: 'Completed secondary school', 4: 'Completed higher education (university, college) degree', 5: 'Islamic or other religious education', 6: "Don't know", 7: 'Refused'}
5) Creating and Applying (new) domain for field hh_wealth_water
{1: 'Private tap from piped water', 2: 'Public tap', 3: 'Protected well', 4: 'Bottled water', 5: 'Other safe source', 6: 'River', 7: 'Un

{'1,1': 'Rice', '1,2': 'Wheat', '1,3': 'Millet', '1,4': 'Sorghum', '1,5': 'Maize', '1,6': 'Barley', '1,99': 'Other cereal', '2,1': 'Cassava', '2,2': 'Potatoe', '2,3': 'Sweet potatoes', '2,4': 'Yams', '2,5': 'Carrots', '2,6': 'Cocoyam', '2,99': 'Other tubers', '3,1': 'Beans', '3,2': 'Lentils', '3,3': 'Peas', '3,4': 'Soybeans', '3,99': 'Other pulses', '4,1': 'Cabbage', '4,2': 'Lettuce', '4,3': 'Spinach-amaranth', '4,4': 'Cassava leaves', '4,5': 'Potatoe leaves', '4,6': 'Moringa', '4,7': 'Sorrel hibiscus', '4,8': 'Rosselle', '4,99': 'Other leafy vegetables', '5,1': 'Tomatoes', '5,2': 'Cucumber', '5,3': 'Pumpkin', '5,4': 'Eggplant/aubergine', '5,5': 'Zucchini', '5,6': 'Okra', '5,7': 'Pepper', '5,8': 'Onions', '5,99': 'Other non leafy vegs', '6,1': 'Orange', '6,2': 'Lemon', '6,3': 'Grapefruit', '6,99': 'Other citrus fruit', '7,1': 'Sweet banana', '7,2': 'Plantains', '7,3': 'Papaya', '7,4': 'Mango', '7,5': 'Guava', '7,6': 'Pineapple', '7,7': 'Avocado', '7,8': 'Water melon', '7,9': 'Dates', '

{1: 'Yes', 2: 'No - because it wasn’t necessary', 3: 'No - because you already sold those assets or did this activity within the last 12 months and you cannot continue to do it', 4: 'Not applicable', 5: "Don't know", 6: 'Refused'}
55) Creating and Applying (new) domain for field cs_stress_borrowed_or_helped
{1: 'Yes', 2: 'No - because it wasn’t necessary', 3: 'No - because you already sold those assets or did this activity within the last 12 months and you cannot continue to do it', 4: 'Not applicable', 5: "Don't know", 6: 'Refused'}
56) Creating and Applying (new) domain for field cs_stress_credit
{1: 'Yes', 2: 'No - because it wasn’t necessary', 3: 'No - because you already sold those assets or did this activity within the last 12 months and you cannot continue to do it', 4: 'Not applicable', 5: "Don't know", 6: 'Refused'}
57) Creating and Applying (new) domain for field cs_stress_borrowed_money
{1: 'Yes', 2: 'No - because it wasn’t necessary', 3: 'No - because you already sold those

In [16]:

print("---APPLYING DOMAINS FOR MULTIPLE CHOICE DERIVED FIELDS---")
yes_no_domain_created = False
counter = 0
for derived_field in all_derived_fieldnames:
    # 'other' fields are strings without domains
    if derived_field in all_answers_with_other_option:
        continue
    counter += 1
    if yes_no_domain_created:
        print ("%s) Applying existing YES-NO domain for derivate field %s" % (counter,derived_field))
    else:
        print ("%s) Creating and applying new YES-NO domain (table %s) for derivate field %s" % (counter, yes_no_table_for_later,derived_field))
    # The flag is passed as domain_already_exists: it is False only for the
    # first field handled here, so the domain is created exactly once.
    create_and_apply_domain(master_table, yes_no_table_for_later, derived_field, yes_no_domain_created)
    yes_no_domain_created = True
        
        

        

---APPLYING DOMAINS FOR MULTIPLE CHOICE DERIVED FIELDS---
1) Creating and applying new YES-NO domain (table yes_no) for derivate field covid_goodstransp
{1: 'Yes', 0: 'No'}
2) Applying existing YES-NO domain for derivate field covid_marketclosed
3) Applying existing YES-NO domain for derivate field covid_borderclosed
4) Applying existing YES-NO domain for derivate field covid_stayhome
5) Applying existing YES-NO domain for derivate field covid_gatherings
6) Applying existing YES-NO domain for derivate field covid_processclosed
7) Applying existing YES-NO domain for derivate field covid_none
8) Applying existing YES-NO domain for derivate field covid_dk
9) Applying existing YES-NO domain for derivate field covid_ref
10) Applying existing YES-NO domain for derivate field shock_noshock
11) Applying existing YES-NO domain for derivate field shock_sicknessordeathofhh
12) Applying existing YES-NO domain for derivate field shock_lostemplorwork
13) Applying existing YES-NO domain for derivate 

107) Applying existing YES-NO domain for derivate field ls_proddif_dk
108) Applying existing YES-NO domain for derivate field ls_proddif_ref
109) Applying existing YES-NO domain for derivate field ls_salesdif_smallerprofits
110) Applying existing YES-NO domain for derivate field ls_salesdif_damageandlosses
111) Applying existing YES-NO domain for derivate field ls_salesdif_lowdemand
112) Applying existing YES-NO domain for derivate field ls_salesdif_lowprices
113) Applying existing YES-NO domain for derivate field ls_salesdif_slaughterhouse
114) Applying existing YES-NO domain for derivate field ls_salesdif_processing
115) Applying existing YES-NO domain for derivate field ls_salesdif_dk
116) Applying existing YES-NO domain for derivate field ls_salesdif_ref
117) Applying existing YES-NO domain for derivate field ls_proc_slaughter
118) Applying existing YES-NO domain for derivate field ls_proc_meatprocess
119) Applying existing YES-NO domain for derivate field ls_proc_dairyprocess
120)

In [17]:
# Run the Add Global IDs geoprocessing tool on the master table.
print("Adding Global ID")
arcpy.AddGlobalIDs_management(master_table)

Adding Global ID


In [18]:
# Time elapsed since startTime was recorded at the top of the notebook
elapsed = datetime.now() - startTime
print("Execution time: ", elapsed)

Execution time:  0:38:21.586874
