In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
import pprint
import json

In [2]:
# Load the CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("output_data.csv")
# Last expression of the cell: preview the first rows as the cell output.
data.head()

Unnamed: 0,Record ID,Sex,Race/Ethnicity (choice=White (Caucasian)),Tobacco Usage,Marijuana Use,Weight (kg):,How many cm tall are you?,"1) As of present, and looking back over the past two weeks how would you describe your general well being?",2) Over the course of 2 weeks how many of these days did you experience obstructive pain?,"3) If none to question 2, have you had any obstructive pain over the past 2 months and for how many days?",...,"14) Did you experience any nausea over the last 2 weeks? If so, how many days of the past two weeks?","15) Did you experience any nausea over the last 2 months? If so, how many days of the past two months?",16) Out of the days where you experienced nausea how severe would you say it was on average over the past two weeks?,17) Out of the days where you experienced nausea how severe would you say it was on average over the past 2 months?,"18) Did you experience any vomiting over the last 2 weeks? If so, how many days of the past two weeks?","19) Did you experience any vomiting over the last 2 months? If so, how many days of the past two months?",20) Out of the days where you experienced vomiting how severe would you say it was on average over the past two weeks?,21) Out of the days where you experienced vomiting how severe would you say it was on average over the past two months?,22) Have you ever been hospitalized due to the above symptoms mentioned?,Patient Group
0,3,Female,Checked,Past,Never,55.8,164.0,Poor,Between 4 and 7 days,,...,11-14 days,4-7 days,Moderate,Moderate,,,No vomiting at all,No vomiting at all,Yes,Definitive Stricture
1,4,Male,Checked,Never,Never,82.0,180.0,Slightly Below Par,< 4 days,,...,,,No nausea at all,No nausea at all,,,No vomiting at all,No vomiting at all,No,Probably Stricture
2,5,Female,Checked,Never,Never,78.0,164.0,Slightly Below Par,Between 4 and 7 days,,...,1-3 days,1-3 days,Mild,Mild,,,No vomiting at all,No vomiting at all,No,Probably Stricture
3,6,Male,Unchecked,Active,,,,Generally Well,,,...,,,No nausea at all,No nausea at all,,,No vomiting at all,No vomiting at all,No,Definitive Stricture
4,7,Male,Checked,,,,,Slightly Below Par,,,...,,1-3 days,No nausea at all,Moderate,,1-3 days,Moderate,Moderate,No,Probably Stricture


In [3]:
######### Do not touch below ###

## Functions to label-encode answers, store and load the label mapping, and filter for specific symptoms.

# get a list of all questions related to symptoms

def symptom_to_questions(data, symptom_list,ambiguous_symptoms ):
    """
    Group the question columns of ``data`` by the symptom they mention.

    A column belongs to a symptom when the symptom text occurs in the column
    name (case-sensitive substring match). Column labels that are not strings
    are skipped.

    Parameters
    ----------
    data : object exposing ``columns`` (e.g. a pandas DataFrame)
        Only the column labels are read.
    symptom_list : iterable of str
        Symptoms to look for in the column names.
    ambiguous_symptoms : dict
        ``{symptom: qualifier}``. Questions of ``symptom`` that also contain
        ``qualifier`` are moved out of the symptom's list into a new entry
        named ``f"{qualifier}_{symptom}"``.

    Returns
    -------
    dict
        ``{symptom: [column, ...]}`` in ``symptom_list`` order, followed by
        one ``"<qualifier>_<symptom>"`` entry per ambiguous symptom.

    Raises
    ------
    KeyError
        If a key of ``ambiguous_symptoms`` is not part of ``symptom_list``.
    """
    # Non-string labels (e.g. integer column names) cannot contain a symptom.
    text_columns = [col for col in data.columns if isinstance(col, str)]

    symptom_questions = {
        symptom: [col for col in text_columns if symptom in col]
        for symptom in symptom_list
    }

    # address ambiguous cases: split the qualified questions into their own group
    for symptom, qualifier in ambiguous_symptoms.items():
        if symptom not in symptom_questions:
            raise KeyError(
                f"Ambiguous symptom {symptom!r} is not part of symptom_list"
            )

        questions = symptom_questions[symptom]
        qualified = [question for question in questions if qualifier in question]

        symptom_questions[symptom] = [
            question for question in questions if qualifier not in question
        ]
        symptom_questions[f"{qualifier}_{symptom}"] = qualified

    return symptom_questions





def mapping_per_symp(symptom_questions, symptom, data=None):
    """
    Build the label-encoding lookup for every question of one symptom.

    Parameters
    ----------
    symptom_questions : dict
        Value returned by ``symptom_to_questions``:
        ``{symptom: [question column, ...]}``.
    symptom : str
        Key of ``symptom_questions`` whose questions are encoded.
    data : table indexable by column name, optional
        Holds the answers the encoders are fitted on. When omitted, the
        notebook-level ``data`` variable is used so existing two-argument
        calls keep working.

    Returns
    -------
    dict
        ``{question: {answer: code}}`` where every code is a built-in ``int``.

    Raises
    ------
    NameError
        If ``data`` is omitted and no notebook-level ``data`` exists.
    """
    if data is None:
        # Legacy behaviour: fall back to the global frame.
        try:
            data = globals()["data"]
        except KeyError:
            raise NameError("name 'data' is not defined") from None

    mapping = {}
    for col in symptom_questions[symptom]:
        label_encoder = LabelEncoder()
        label_encoder.fit(data[col])

        codes = label_encoder.transform(label_encoder.classes_)
        # Cast to built-in int so the mapping can be written with json.dump.
        mapping[col] = {
            answer: int(code)
            for answer, code in zip(label_encoder.classes_, codes)
        }

    return mapping



def encode_sym_questions(data, symptom_list,ambiguous_symptoms ):
    """
    Fit a label mapping for every symptom-related question.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw answers; it is not modified.
    symptom_list, ambiguous_symptoms
        Forwarded unchanged to ``symptom_to_questions``.

    Returns
    -------
    tuple
        ``(encode_all_symp_questions, encoded_data, symptom_questions)``:
        the ``{question: {answer: code}}`` mapping, a copy of ``data`` whose
        missing values are replaced by the text ``'NaN'`` (answers are NOT
        yet replaced by codes), and the symptom -> questions grouping.
    """
    # change missing values nan to text nan for encoding and labeling
    # (fillna returns a new frame, so the caller's frame stays untouched)
    encoded_data = data.fillna('NaN')

    symptom_questions = symptom_to_questions(encoded_data, symptom_list, ambiguous_symptoms)

    encode_all_symp_questions = {}
    for symp in symptom_questions:
        # Fit on the filled frame so 'NaN' is a regular class and the mapping
        # keys match the values that encode_data will look up.
        encode_all_symp_questions.update(
            mapping_per_symp(symptom_questions, symp, encoded_data)
        )

    return encode_all_symp_questions, encoded_data, symptom_questions


def save__new_mapping(encode_all_symp_questions, file_name = 'symptom_label_mapping.json'):
    """
    Write the label mapping to ``file_name`` as JSON.

    The file is opened in write mode, so an existing file is overwritten.
    """
    with open(file_name, mode='w') as mapping_file:
        json.dump(obj=encode_all_symp_questions, fp=mapping_file)
        
        
        
def load_mapping(file_name = 'symptom_label_mapping.json'):
    """
    Load a label mapping that was stored as JSON.

    Parameters
    ----------
    file_name : str or path-like
        Location of the JSON file.

    Returns
    -------
    The decoded JSON content (for a file written by ``save__new_mapping``:
    ``{question: {answer: code}}``).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(file_name, encoding='utf-8') as f:
        return json.load(f)
    
    
    
    
def encode_data(encoded_data, mapping):
    """
    Replace the answers of every mapped column by their integer codes.

    ``mapping`` is ``{column: {answer: code}}``. The columns are overwritten
    on ``encoded_data`` itself, and that same object is returned. An answer
    without an entry in its column's lookup raises ``KeyError``.
    """
    for column, lookup in mapping.items():
        answers = encoded_data[column]
        encoded_data[column] = answers.apply(lookup.__getitem__)
    return encoded_data



        
def get_related_questions(encoded_data, symptom_questions):
    """
    Keep only 'Patient Group' and the symptom-related question columns.

    The columns come out as 'Patient Group' first, followed by the questions
    of each entry of ``symptom_questions`` in dictionary order.
    """
    selected_columns = ['Patient Group'] + [
        question
        for questions in symptom_questions.values()
        for question in questions
    ]
    return encoded_data[selected_columns]



def save_to_csv(data, file_loc_name):
    """Write ``data`` to ``file_loc_name`` as CSV without the index column."""
    data.to_csv(path_or_buf=file_loc_name, index=False)

In [4]:
# Symptoms to look for; each is matched as a substring of the column names.
symptom_list = ['nausea', 'pain','bloat','vomit']

# symptom -> qualifier: questions of the symptom that also contain the
# qualifier are split into their own "<qualifier>_<symptom>" group.
ambiguous_symptoms ={
    'pain': 'obstructive'
    # ,'my_symptom': 'ambiguous_symptom'
                }     

# Fill missing answers with 'NaN' and group the question columns by symptom.
# NOTE(review): the freshly fitted mapping (encode_all_symp_questions) is not
# used further in this cell; the encoding below comes from the saved JSON file.
encode_all_symp_questions, encoded_data, symptom_questions = encode_sym_questions(data, symptom_list,ambiguous_symptoms )

# Requires 'symptom_label_mapping.json' to exist in the working directory.
mapping = load_mapping()

# Replace answers by their integer codes (overwrites the columns of encoded_data).
encoded_data = encode_data(encoded_data, mapping)

# Keep 'Patient Group' plus the symptom-related questions only.
encoded_data = get_related_questions(encoded_data, symptom_questions)


encoded_data
# save_to_csv(encoded_data, 'data/encoded_data.csv')

Unnamed: 0,Patient Group,"14) Did you experience any nausea over the last 2 weeks? If so, how many days of the past two weeks?","15) Did you experience any nausea over the last 2 months? If so, how many days of the past two months?",16) Out of the days where you experienced nausea how severe would you say it was on average over the past two weeks?,17) Out of the days where you experienced nausea how severe would you say it was on average over the past 2 months?,"13) Over the course of 2 weeks from now, did you experience increasing pain after a meal?","14) Over the course of 2 months from now, did you experience increasing pain after a meal?","6) In the past 2 weeks did you experience abdominal bloating and if so, how severe?","7) In the past 2 months did you experience abdominal bloating and if so, how severe?","18) Did you experience any vomiting over the last 2 weeks? If so, how many days of the past two weeks?","19) Did you experience any vomiting over the last 2 months? If so, how many days of the past two months?",20) Out of the days where you experienced vomiting how severe would you say it was on average over the past two weeks?,21) Out of the days where you experienced vomiting how severe would you say it was on average over the past two months?,2) Over the course of 2 weeks how many of these days did you experience obstructive pain?,"3) If none to question 2, have you had any obstructive pain over the past 2 months and for how many days?",4) On average how severe was your obstructive pain over the last two weeks?,5) On average how severe was your obstructive pain over the last two months?
0,Definitive Stricture,1,2,1,1,1,1,1,1,1,1,2,2,1,3,1,3
1,Probably Stricture,3,3,2,2,2,2,0,2,1,1,2,2,0,3,0,0
2,Probably Stricture,0,0,0,0,2,2,0,0,1,1,2,2,1,3,0,0
3,Definitive Stricture,3,3,2,2,2,2,2,2,1,1,2,2,4,3,3,3
4,Probably Stricture,3,0,2,1,3,3,2,1,1,0,1,1,4,3,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,Inflammatory,3,3,2,2,2,2,1,1,1,1,2,2,4,3,3,3
70,Definitive Stricture,3,3,2,2,0,0,3,3,1,1,2,2,4,3,3,3
71,Probably Stricture,0,3,1,2,2,2,1,1,1,1,2,2,0,1,0,0
72,Definitive Stricture,3,0,2,2,2,2,2,2,1,1,2,2,2,0,1,1
