In [253]:
import pandas as pd
import xlrd
import arcpy
from pandas import ExcelWriter
from datetime import datetime
import os

In [230]:
def importallsheets(in_excel, out_gdb):
    """Import every sheet of an Excel workbook as a separate table in a geodatabase.

    Each output table is named after its sheet, made valid for ``out_gdb`` with
    ``arcpy.ValidateTableName``. At most ``max_counter`` sheets are converted;
    ``max_counter`` is read from the notebook's global namespace and must be
    defined before this function is called.

    Args:
        in_excel: Path of the Excel workbook to import.
        out_gdb: Path of the geodatabase that receives the tables.
    """
    print("max_counter: %s" % max_counter)
    # Sheet names are listed through pandas rather than xlrd: xlrd 2.x only opens
    # legacy .xls workbooks, while pandas selects a suitable engine for both
    # .xls and .xlsx. The context manager releases the workbook once read.
    with pd.ExcelFile(in_excel) as workbook:
        sheets = [str(name) for name in workbook.sheet_names]

    print('{} sheets found: {}'.format(len(sheets), ','.join(sheets)))
    for position, sheet in enumerate(sheets, start=1):
        if position > max_counter:
            # Testing limit reached: the remaining sheets are skipped.
            break
        # The output table is named after the sheet only.
        out_table = os.path.join(out_gdb, arcpy.ValidateTableName(sheet, out_gdb))

        print('Converting {} to {}'.format(sheet, out_table))

        # Perform the conversion
        arcpy.ExcelToTable_conversion(in_excel, out_table, sheet)
            

def make_attribute_dict(fc, code_field, value_field):
    """Build a ``{code: value}`` lookup from two fields of a table.

    Rows are read with ``arcpy.da.SearchCursor`` over ``fc``; when a code occurs
    more than once, the value of the last row read is kept.
    """
    with arcpy.da.SearchCursor(fc, [code_field, value_field]) as rows:
        return {code: value for code, value in rows}

def fix_category_formatting(category):
    """Standardize the formatting of a category description.

    Square brackets become parentheses, "(specify)" is dropped and "/ " becomes
    ", "; the text is then capitalized (first character upper-case, the rest
    lower-case) and a few known terms are restored or corrected.
    """
    # Replacements applied to the raw text, before the case is changed.
    before_capitalize = (("[", "("), ("]", ")"), ("(specify)", ""), ("/ ", ", "))
    # Replacements applied afterwards: they match the lower-cased text produced
    # by capitalize(), and must run in this order.
    after_capitalize = (
        ("adps", "ADPs"),
        ("idp", "IDP"),
        ("covid", "COVID"),
        (" , ", ", "),
        ("staplec", "staple"),
    )

    formatted = category
    for old, new in before_capitalize:
        formatted = formatted.replace(old, new)
    formatted = formatted.capitalize()
    for old, new in after_capitalize:
        formatted = formatted.replace(old, new)
    return formatted

def _list_non_empty_names(questionnaire_file, column):
    """Return the non-empty values of ``column`` from the 'survey' sheet, in sheet order."""
    # The path is handed straight to pandas so that it opens and closes the
    # workbook itself; wrapping it in pd.ExcelFile left the file open.
    survey_df = pd.read_excel(questionnaire_file, sheet_name='survey', skiprows=2)
    return survey_df[column].dropna().tolist()


def count_number_of_questions_qname(questionnaire_file):
    """List and count the questions that have a value in the 'Q Name' column.

    Args:
        questionnaire_file: Path of the questionnaire workbook; the 'survey'
            sheet is read, skipping its first two rows.

    Returns:
        Tuple ``(list_questions, n_of_questions)``.
    """
    list_questions = _list_non_empty_names(questionnaire_file, 'Q Name')
    return list_questions, len(list_questions)


def count_number_of_questions_sqname(questionnaire_file):
    """List and count the questions that have a value in the 'Suggested Qname' column.

    Args:
        questionnaire_file: Path of the questionnaire workbook; the 'survey'
            sheet is read, skipping its first two rows.

    Returns:
        Tuple ``(list_questions, n_of_questions)``.
    """
    list_questions = _list_non_empty_names(questionnaire_file, 'Suggested Qname')
    return list_questions, len(list_questions)
    

def _split_numbered_items(text, markers):
    """Return the stripped chunks of ``text`` that follow each marker found in it.

    Every marker of ``markers`` that occurs in ``text`` opens a chunk running up
    to the first occurrence of the next found marker; the last chunk runs to the
    end of ``text``. Markers are matched as plain substrings.
    """
    present = [marker for marker in markers if marker in text]
    items = []
    for position, marker in enumerate(present):
        start = text.find(marker) + len(marker)
        if position + 1 < len(present):
            end = text.find(present[position + 1])
            items.append(text[start:end].strip())
        else:
            # Last marker: its chunk extends to the end of the text.
            items.append(text[start:].strip())
    return items


def read_questionnaire(input_questionnaire_file, writer):
    """Extract the coded values of a questionnaire into an Excel workbook.

    Reads the 'survey' sheet of ``input_questionnaire_file`` (skipping its first
    two rows) and, for every row with a 'Suggested Qname':

    * choice questions without derived fields get one sheet, named after the
      question, holding their ``code``/``label`` pairs;
    * "Select All That Apply" questions contribute their derived field names,
      parsed from the numbered list inside 'Suggested Qname'.

    A 'yes_no' sheet and a 'derived_fields' sheet (derived fields not ending in
    "_other") are appended, then ``writer`` is closed, which writes the file.
    A question that raises is reported with ``print`` and skipped.

    Args:
        input_questionnaire_file: Path of the questionnaire workbook.
        writer: ``pd.ExcelWriter`` receiving the sheets; closed by this function.
    """
    # The handle is closed as soon as the sheet has been parsed.
    with open(input_questionnaire_file, 'rb') as questionnaire_handle:
        quest_df = pd.read_excel(questionnaire_handle, sheet_name='survey', skiprows=2)

    # All numbering markers that may prefix an option: "1)", "2)", ... "199)"
    numbering = ["%s)" % n for n in range(1, 200)]
    # Question types producing one derived field per option.
    select_all_types = ("Open Ended-Select All That Apply", "Select All That Apply",
                        "Open Ended - Select All That Apply ")
    # Question types with pre-defined categories (only these need domains).
    choice_types = ("StartRecording", "Single Choice", "Open Ended-Single Choice",
                    "Open Ended - Single Choice") + select_all_types

    # NOTE: the collections below are filled while parsing but are not returned.
    dict_derived_fieldnames = {}  # first derived field -> derived fields of the same question
    field_names_list = []  # all fields of the final table
    text_type_fields = []  # fields of the final table with TEXT type
    range_type_fields = []  # fields of the final table storing RANGE data (LONG type)
    all_derived_fieldnames = []  # derived fields of every "Select All That Apply" question
    all_answers_with_other_option = []  # all "Other: specify" fields

    quest_df = quest_df[quest_df['Suggested Qname'].notna()]
    for _, row in quest_df.iterrows():
        # Placeholders reported by the error message if the row fails early.
        question_name = []
        question_type = []
        try:
            categories = str(row['English']).replace("\t", "")
            question_name = row['Suggested Qname'].strip()
            question_type = row['Q Type']
            programming_instructions = row['Programming Instructions']  # holds the coded values for crp_main

            if question_type in choice_types:
                if question_name == 'crp_main':
                    # For this question only, coded values come from the programming
                    # instructions, one "code)label" pair per line. Only the first
                    # ")" is split on, so labels may contain parentheses.
                    codes_and_labels = []
                    for programming_line in programming_instructions.splitlines():
                        if ")" in programming_line:
                            code, label = programming_line.split(")", 1)
                            codes_and_labels.append([code, fix_category_formatting(label)])
                else:
                    # Codes are the 1-based positions of the options found in the text.
                    codes_and_labels = [
                        [position, fix_category_formatting(option)]
                        for position, option in enumerate(
                            _split_numbered_items(categories, numbering), start=1)
                    ]

                if question_type not in select_all_types:
                    # Questions with no derived fields: one domain sheet each.
                    field_names_list.append(question_name)
                    codes_and_labels_df = pd.DataFrame(codes_and_labels, columns=['code', 'label'])
                    codes_and_labels_df.to_excel(writer, sheet_name=question_name)
                else:
                    # Questions with derived fields: no domain sheet is written, since
                    # derived fields use the yes/no domain table.
                    derived_fieldnames = _split_numbered_items(question_name, numbering)
                    all_derived_fieldnames.extend(derived_fieldnames)
                    field_names_list.extend(derived_fieldnames)
                    # "_other" fields are 'other specify' answers: STRING, with no domain.
                    all_answers_with_other_option.extend(
                        name for name in derived_fieldnames if name.endswith("_other"))
                    if derived_fieldnames:
                        dict_derived_fieldnames[derived_fieldnames[0]] = derived_fieldnames
            elif question_type == "Range":
                # These questions will be associated to LONG type fields.
                field_names_list.append(question_name)
                range_type_fields.append(question_name)
            elif question_type == "Open Ended":
                # These questions will be associated to TEXT type fields.
                field_names_list.append(question_name)
                text_type_fields.append(question_name)
        except Exception as e:
            print("Failed question %s (type: %s) for error: %s " % (question_name, question_type, e))

    # Yes/No table (domain of the derived fields)
    yesno_df = pd.DataFrame([[1, "Yes"], [0, "No"]], columns=['code', 'label'])
    yesno_df.to_excel(writer, sheet_name='yes_no')

    # Additional sheet listing the derived fields that use the yes/no domain.
    list_of_yes_no_fields = [
        derived_field for derived_field in all_derived_fieldnames
        if derived_field not in all_answers_with_other_option
    ]
    derived_fields_df = pd.DataFrame(list_of_yes_no_fields)
    derived_fields_df.to_excel(writer, sheet_name='derived_fields')

    # Close the writer to output the Excel file. ExcelWriter.save() was removed
    # in pandas 2.0; close() also writes the file on earlier versions.
    writer.close()

In [231]:
print("Reading TEMPLATE questionnaire and creating recap excel file")
startTime = datetime.now()
now = startTime.strftime('%Y%m%d%H%M%S')  # one timestamp shared by both output file names
temp_path = r'C:\temp\data_processing_exports'
# Writing the recap workbooks fails when the export folder is missing, so create it up front.
os.makedirs(temp_path, exist_ok=True)
questionnaire_file = r'C:\git\hh_survey\household_questionnaire_geopoll_202105.xlsx' #the questionnaire file that we use for creating the table
coded_values_file = os.path.join(temp_path, "coded_values_%s_template.xlsx" % now) #intermediary output file with all categories and codes extracted from the questionnaire
writer = pd.ExcelWriter(coded_values_file, engine='xlsxwriter')
field_names_list = []
max_counter = 3000 #for testing purposes, we may need to limit the execution only to some items
read_questionnaire(questionnaire_file, writer)

print("Reading COUNTRY questionnaire and creating recap excel file")
country_questionnaire_file = r'C:\temp\household_questionnaire_geopoll_20210525_PH.xlsx' #the country questionnaire compared against the template
country_coded_values_file = os.path.join(temp_path, "coded_values_%s_country.xlsx" % now) #intermediary output file with all categories and codes extracted from the country questionnaire
country_writer = pd.ExcelWriter(country_coded_values_file, engine='xlsxwriter')
read_questionnaire(country_questionnaire_file, country_writer)

Reading TEMPLATE questionnaire and creating recap excel file
Reading COUNTRY questionnaire and creating recap excel file


In [232]:
print ("COUNTING NUMBER OF QUESTIONS")

# Question names and counts per questionnaire (T = template, C = country) and
# per name column (sqname = 'Suggested Qname', qname = 'Q Name').
T_sqname_list_questions, T_sqname_n_of_questions = count_number_of_questions_sqname(questionnaire_file)
T_qname_list_questions, T_qname_n_of_questions = count_number_of_questions_qname(questionnaire_file)
C_sqname_list_questions, C_sqname_n_of_questions = count_number_of_questions_sqname(country_questionnaire_file)
C_qname_list_questions, C_qname_n_of_questions = count_number_of_questions_qname(country_questionnaire_file)

# Report lines, printed in this order.
for report_line, question_count in (
    ("QName Field - Number of questions in the Template Questionnaire: %s", T_qname_n_of_questions),
    ("QName Field - Number of questions in the Country Questionnaire: %s\n", C_qname_n_of_questions),
    ("SuggestedQName Field - Number of questions in the Template Questionnaire: %s", T_sqname_n_of_questions),
    ("SuggestedQName Field - Number of questions in the Country Questionnaire: %s", C_sqname_n_of_questions),
):
    print(report_line % question_count)

QName Field - Number of questions in the Template Questionnaire: 126
QName Field - Number of questions in the Country Questionnaire: 129

SuggestedQName Field - Number of questions in the Template Questionnaire: 124
SuggestedQName Field - Number of questions in the Country Questionnaire: 124


In [244]:
print ("DETECTING DIFFERENCES IN FIELD: Suggested QName")

# Set differences in both directions between the two lists of 'Suggested Qname' values.
sqname_only_in_country = list(set(C_sqname_list_questions) - set(T_sqname_list_questions))
sqname_only_in_template = list(set(T_sqname_list_questions) - set(C_sqname_list_questions))

print("\nQuestions in country questionnaire and not in template questionnaire: ", sqname_only_in_country)
print("Questions in template questionnaire and not in country questionnaire: ", sqname_only_in_template)

DETECTING DIFFERENCES IN FIELD: Suggested QName

Questions in country questionnaire and not in template questionnaire:  ['hh_wealth_water']
Questions in template questionnaire and not in country questionnaire:  ['hh_wealth_water\n\n1)hh_wealth_water_pvttap\n2)hh_wealth_water_publictap\n3)hh_wealth_water_protectwell\n4)hh_wealth_water_bottle\n5)hh_wealth_water_othersafe\n6)hh_wealth_water_river\n7)hh_wealth_water_unprotectwell\n8)hh_wealth_water_spring\n9)hh_wealth_water_canalsurface\n10)hh_wealth_water_otherunsafe\n11)hh_wealth_water_dk\n12)hh_wealth_water_ref']


In [245]:
print ("DETECTING DIFFERENCES IN FIELD: Q Name")

# Set differences in both directions between the two lists of 'Q Name' values.
qname_only_in_country = list(set(C_qname_list_questions) - set(T_qname_list_questions))
qname_only_in_template = list(set(T_qname_list_questions) - set(C_qname_list_questions))

print("\nQuestions in country questionnaire and not in template questionnaire: ", qname_only_in_country)
print("Questions in template questionnaire and not in country questionnaire: ", qname_only_in_template)

DETECTING DIFFERENCES IN FIELD: Q Name

Questions in country questionnaire and not in template questionnaire:  ['crp_landdoc', 'hh_head']
Questions in template questionnaire and not in country questionnaire:  []


In [247]:
print ("CHECKING THAT ALL MANDATORY QUESTIONS ARE INCLUDED IN THE COUNTRY QUESTIONNAIRE")

# Close the template workbook as soon as the sheet has been parsed.
with open(questionnaire_file, 'rb') as template_handle:
    quest_df = pd.read_excel(template_handle, sheet_name='survey', skiprows=2)
# The mandatory flag is read from the column pandas labels 'Unnamed: 19'
# (presumably a column without header in the sheet - confirm against the template).
quest_df = quest_df.rename(columns={'Unnamed: 19': 'Mandatory'})
# 'Q Name' of every row flagged "yes", selected with a single .loc call.
list_mandatory_questions = quest_df.loc[quest_df["Mandatory"] == "yes", "Q Name"].values.tolist()
print ("\nFound", len(list_mandatory_questions), "mandatory questions in the template questionnaire")

result = all(elem in C_qname_list_questions for elem in list_mandatory_questions)
if result:
    print("Yes, country questionnaire contains all mandatory questions")
else:
    print("No, country questionnaire does not contains all mandatory questions. Missing question: %s " % list(set(list_mandatory_questions) - set(C_qname_list_questions)))

CHECKING THAT ALL MANDATORY QUESTIONS ARE INCLUDED IN THE COUNTRY QUESTIONNAIRE

Found 63 mandatory questions in the template questionnaire
Yes, country questionnaire contains all mandatory questions


In [248]:
print ("CHECKING THAT RULES ON CS QUESTIONS ARE RESPECTED\n")

# Country questions whose 'Q Name' starts with 'cs', then split by group prefix.
cs_questions_in_country = [name for name in C_qname_list_questions if name.startswith('cs')]
if not cs_questions_in_country:
    print("No CS questions in the country questionnaire")
else:
    cs_stress_questions_in_country = [name for name in cs_questions_in_country if name.startswith('cs_stress')]
    cs_crisis_questions_in_country = [name for name in cs_questions_in_country if name.startswith('cs_crisis')]
    cs_emergency_questions_in_country = [name for name in cs_questions_in_country if name.startswith('cs_emergency')]
    print("%s CS questions found in the country questionnaire\n" % len(cs_questions_in_country))
    print ("There should be 3 questions for each group.")
    # The counts are only reported here; they are not compared against the rule.
    for group_report, group_questions in (
        ("Number of STRESS questions: %s", cs_stress_questions_in_country),
        ("Number of CRISIS questions: %s", cs_crisis_questions_in_country),
        ("Number of EMERGENCY questions: %s", cs_emergency_questions_in_country),
    ):
        print(group_report % len(group_questions))

CHECKING THAT RULES ON CS QUESTIONS ARE RESPECTED

17 CS questions found in the country questionnaire

There should be 3 questions for each group.
Number of STRESS questions: 6
Number of CRISIS questions: 6
Number of EMERGENCY questions: 5


In [251]:
print ("CHECKING THAT RULES ON HDDS QUESTIONS ARE RESPECTED")

# Country questions whose 'Q Name' starts with 'hdds'.
hdds_questions_in_country = [name for name in C_qname_list_questions if name.startswith('hdds')]
print("\nThere should be either 0 or 2 HDDS questions")
if not hdds_questions_in_country:
    print("No HDDS questions in the country questionnaire")
else:
    hdds_count = len(hdds_questions_in_country)
    print("Number of HDDS questions: %s  - %s" % (hdds_count, hdds_questions_in_country))

CHECKING THAT RULES ON HDDS QUESTIONS ARE RESPECTED

There should be either 0 or 2 HDDS questions
Number of HDDS questions: 2  - ['hdds', 'hdds_confirmation']


In [252]:
print("Comparing domains and derived fields: ")
# Read the domain tables back from the two recap workbooks written earlier in
# this notebook: sheet_name=None returns one DataFrame per sheet, keyed by sheet
# name, so the cell no longer depends on variables left over in the kernel.
# NOTE(review): this loading is inferred from how the dicts are used below (the
# 'Unnamed: 0' index column, column 0 of the 'derived_fields' sheet) - confirm.
dict_with_dfs_template = pd.read_excel(coded_values_file, sheet_name=None)
dict_with_dfs_country = pd.read_excel(country_coded_values_file, sheet_name=None)

all_domains_respected = True
for template_question, template_domains in dict_with_dfs_template.items():
    if template_question not in dict_with_dfs_country:
        # Sheets missing from the country workbook are not compared here.
        continue
    country_domains = dict_with_dfs_country[template_question]
    if template_question == 'derived_fields':
        print("Derived fields: ")
        derived_fields_country = country_domains[0].tolist()
        derived_fields_template = template_domains[0].tolist()
        derived_fields_only_country = list(set(derived_fields_country) - set(derived_fields_template))
        derived_fields_only_template = list(set(derived_fields_template) - set(derived_fields_country))
        if len(derived_fields_only_country) == 0 and len(derived_fields_only_template) == 0:
            print ("Derived fields have been respected")
        else:
            print ("Derived field only in template: %s" % derived_fields_only_template)
            print ("Derived field only in country: %s" % derived_fields_only_country)
    elif template_question == 'crp_main':
        # crp domains are going to be edited by the script - no need to check them now
        continue
    else:
        # Rows present in only one of the two tables survive drop_duplicates(keep=False);
        # the saved index column takes part in the comparison and is dropped afterwards.
        df_diff = pd.concat([template_domains, country_domains]).drop_duplicates(keep=False)
        df_diff = df_diff.drop(columns='Unnamed: 0')
        if not df_diff.empty:
            print("-------")
            all_domains_respected = False
            print("Domains have been changed for question: ", template_question)
            with pd.option_context('display.max_rows', None, 'display.max_columns', None):
                print(df_diff)


if all_domains_respected:
    print ("All domains have been respected")

    

Comparing domains and derived fields: 
-------
Domains have been changed for question:  hh_education
   code                                 label
4     5  Islamic or other religious education
4     5                     Islamic education
-------
Domains have been changed for question:  hh_wealth_toilet
   code                              label
0     1  Flush latrine (toilet with water)
5     6                         Don't know
6     7                            Refused
0     1    Flush latrine/toilet with water
-------
Domains have been changed for question:  hh_wealth_light
   code       label
5     6  Don'tk now
6     7     Refused
-------
Domains have been changed for question:  income_sec
   code                                        label
0     1  Farmer, production and sale of cereal crops
0     1  Farmer, production and sale of staple crops
-------
Domains have been changed for question:  income_third
    code                                        label
0      1  Farmer, pr

In [239]:
# To add: find_and_replace_strings_in_df

In [240]:
# To add: sort_crop_list_by_selection

In [241]:
# To add: insert_sheet_with_adm2_reference

In [242]:
# Time elapsed since startTime was recorded.
elapsed_time = datetime.now() - startTime
print("Execution time: ", elapsed_time)

Execution time:  0:00:01.807133
