# Reproduce Table 1
- Requires a Python kernel.
- Verify the results against [Table 1 in the original analysis paper](https://www.nature.com/articles/s41598-021-87029-w?proof=t%25C2%25A0) and against the output of `python3 demographics_day_night.py` ([original version](https://github.com/usc-sail/tiles-day-night/blob/main/code/ground_truth/demographics_day_night.py)). Be sure to configure your file paths first.

In [1]:
import pandas as pd
import numpy as np
import pingouin as pg

from scipy import stats

# Parameters

In [28]:
# Input CSVs: one demographic and one behavioral table for each of the day and night groups.
path_to_demographic_day_file : str = "../synthetic_data/tiles_datasets/table_1_synthetic_data/day_table_1_synthetic_demographic_data.csv"
path_to_demographic_night_file : str = "../synthetic_data/tiles_datasets/table_1_synthetic_data/night_table_1_synthetic_demographic_data.csv"
path_to_behaviorial_day_file : str = "../synthetic_data/tiles_datasets/table_1_synthetic_data/day_table_1_synthetic_behavioral_data.csv"
path_to_behaviorial_night_file : str = "../synthetic_data/tiles_datasets/table_1_synthetic_data/night_table_1_synthetic_behavioral_data.csv"
# Base output paths passed to dict_to_csv / dict_to_json, which add the target variable to the file name.
output_csv : str = "../output.csv"
output_json : str = "../output.json"
# Notebook executed with %run in the "Load Generated Specific Questions" section.
specific_questions_notebook : str = "../generateSpecificQuestions.ipynb"
# The table-one functions below only analyse the ontology entry whose name equals this value.
target_variable : str = "bfi_Neuroticism"

# Load Data

In [3]:
def load_data(file):
    '''Read a CSV file and hand back an independent copy of its contents

    Arguments:
    file -- path (or any other input accepted by pd.read_csv) of the CSV to load

    Return:
    copy of the parsed data -- Pandas DataFrame
    '''

    return pd.read_csv(file).copy()

In [4]:
# Demographic table read from the day file configured in the Parameters section.
demo_day_df = load_data(path_to_demographic_day_file)

In [5]:
# Demographic table read from the night file configured in the Parameters section.
demo_night_df = load_data(path_to_demographic_night_file)

In [6]:
# Behavioral table read from the day file configured in the Parameters section.
behav_day_df = load_data(path_to_behaviorial_day_file)

In [7]:
# Behavioral table read from the night file configured in the Parameters section.
behav_night_df = load_data(path_to_behaviorial_night_file)

# Load Generated Specific Questions

In [8]:
# Later cells use table_1_demographic_sqs, table_1_behavioral_sqs and ontology_mappings, none of
# which are defined in this notebook -- presumably they are created by the notebook run here.
%run $specific_questions_notebook

In [9]:
table_1_demographic_sqs

['what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Gender* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Age* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Educ* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *native_lang* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Female* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Male* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *< 40 Years* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *>= 40 Years* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Some college or College* ?',
 'what are di

In [10]:
table_1_behavioral_sqs

['what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *stai* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *pan_PosAffect* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *pan_NegAffect* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *swls* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *bfi_Neuroticism* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *bfi_Conscientiousness* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *bfi_Extraversion* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *bfi_Agreeableness* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses

In [11]:
# ontology_mappings

In [12]:
# Names of the demographic ontology entries (used later to index demographic_ontology_values).
demographic_ontology_keys = list(ontology_mappings["demographic"])
# NOTE(review): picked by position (7th value of ontology_mappings), so this silently changes if
# entries are added or reordered; presumably the same object as ontology_mappings["demographic"] -- confirm.
demographic_ontology_values = list(ontology_mappings.values())[6]

# The matching demographic line above yields keys while this one yields values, because the two
# entries are stored differently in the ontology_mappings dict in generateSpecificQuestions.ipynb.
behavioral_ontology_values = list(ontology_mappings["behavioral"])

In [13]:
demographic_ontology_values

{'Gender': ['Female', 'Male'],
 'Age': ['< 40 Years', '>= 40 Years'],
 'Educ': ['Some college or College', 'Graduate'],
 'native_lang': ['English', 'non-english']}

In [14]:
behavioral_ontology_values

['stai',
 'pan_PosAffect',
 'pan_NegAffect',
 'swls',
 'bfi_Neuroticism',
 'bfi_Conscientiousness',
 'bfi_Extraversion',
 'bfi_Agreeableness',
 'bfi_Openness',
 'psqi']

# Run Analysis

## Fisher Odds Test - Table 1

In [15]:
def fisher_odds_p_value(day_shift_df, night_shift_df, specific_demographic_ontology_key, demo_group_1, demo_group_2):
    '''Run Fisher's exact test on the 2x2 shift-by-group table using stats.fisher_exact()

    The table has one row per shift (day, night) and one column per group; each
    cell is the number of rows of that shift's DataFrame whose
    specific_demographic_ontology_key column equals the group value.

    Arguments:
    day_shift_df -- Pandas DataFrame
    night_shift_df -- Pandas DataFrame
    specific_demographic_ontology_key -- str, column present in both Pandas DataFrames
    demo_group_1 -- value of that column counted in the first table column
    demo_group_2 -- value of that column counted in the second table column

    Return:
    str of the form 'oddsratio : <x>, pvalue: <y> \\n', both numbers with three decimals
    '''

    def count_group(shift_df, group):
        # Rows of shift_df whose demographic column equals the given group value.
        return int((shift_df[specific_demographic_ontology_key] == group).sum())

    contingency_table = [
        [count_group(shift_df, group) for group in (demo_group_1, demo_group_2)]
        for shift_df in (day_shift_df, night_shift_df)
    ]

    oddsratio, pvalue = stats.fisher_exact(contingency_table)

    return 'oddsratio : %.3f, pvalue: %.3f \n' % (oddsratio, pvalue)

In [16]:
def table_one_demo_ontologies(demo_specific_questions, demographic_ontology_keys, demographic_ontology_values, day_df, night_df, target_variable):
    '''Select the demographic entry named by target_variable and run fisher_odds_p_value() on it

    A key is tested each time a question mentions it between asterisks
    (e.g. '... for *Gender* ?') and it equals target_variable.

    Argument:
    demo_specific_questions -- py list of question strings
    demographic_ontology_keys -- py list of demographic column names
    demographic_ontology_values -- mapping from each key to its two group values
    day_df -- Pandas DataFrame passed through to fisher_odds_p_value()
    night_df -- Pandas DataFrame passed through to fisher_odds_p_value()
    target_variable -- str, the only key that is analysed

    Functions:
    fisher_odds_p_value()

    Return:
    flat list alternating the matched key and the string returned by fisher_odds_p_value()
    '''

    collected = []

    for question in demo_specific_questions:
        # Splitting on "*" isolates the terms the question wraps in asterisks.
        question_terms = question.split("*")

        for ontology_key in demographic_ontology_keys:
            # Ignore keys the question does not mention and every key but the target.
            if ontology_key not in question_terms or ontology_key != target_variable:
                continue

            print(ontology_key)
            groups = demographic_ontology_values[ontology_key]
            first_group = groups[0]
            second_group = groups[1]

            test_summary = fisher_odds_p_value(
                day_df,
                night_df,
                specific_demographic_ontology_key=ontology_key,
                demo_group_1=first_group,
                demo_group_2=second_group,
            )
            collected.extend([ontology_key, test_summary])

    return collected

In [17]:
# Only the demographic key equal to target_variable is tested; when target_variable names a
# behavioral measure (as configured in Parameters) nothing matches and the result is an empty list.
table_one_demo_ontologies(table_1_demographic_sqs, demographic_ontology_keys, demographic_ontology_values, demo_day_df, demo_night_df, target_variable)

[]

## TTest - Table 1

In [18]:
def ttest_p_value(day_df, night_df, specific_behavioral_ontology_value):
    '''Compare one behavioral column between the two DataFrames with pg.ttest()

    Missing values are dropped from each DataFrame's column before the test.

    Arguments:
    day_df -- Pandas DataFrame
    night_df -- Pandas DataFrame
    specific_behavioral_ontology_value -- str which is used as a column in the both Pandas DataFrames

    Return:
    the 'p-val' entry of the first row of the pg.ttest() result -- single value
    '''

    day_scores = day_df[specific_behavioral_ontology_value].dropna()
    night_scores = night_df[specific_behavioral_ontology_value].dropna()

    ttest_table = pg.ttest(np.array(day_scores), np.array(night_scores))

    return ttest_table['p-val'].values[0]

In [19]:
def table_one_behav(behav_specific_questions, behavioral_ontology_values, day_df, night_df, target_variable):
    '''Select the behavioral entry named by target_variable and run ttest_p_value() on it

    An entry is tested when a question mentions it between asterisks
    (e.g. '... for *stai* ?') and it equals target_variable. Each entry is
    tested at most once, and behavioral_ontology_values is left unmodified so
    the call can be repeated with the same list.

    Argument:
    behav_specific_questions -- list of question strings
    behavioral_ontology_values -- list of behavioral column names (not modified)
    day_df -- Pandas DataFrame passed through to ttest_p_value()
    night_df -- Pandas DataFrame passed through to ttest_p_value()
    target_variable -- str, the only entry that is analysed

    Functions:
    ttest_p_value()

    Return:
    dict mapping each tested column name to its p-value from ttest_p_value()
    '''

    store_p_values = {}

    for behav_specific_question in behav_specific_questions:
        # Splitting on "*" isolates the terms the question wraps in asterisks.
        question_terms = behav_specific_question.split("*")

        for specific_behavioral_ontology_value in behavioral_ontology_values:
            if specific_behavioral_ontology_value not in question_terms:
                continue
            if specific_behavioral_ontology_value != target_variable:
                continue
            # Skip entries already tested instead of removing them from the
            # caller's list, which would mutate it while it is being iterated.
            if specific_behavioral_ontology_value in store_p_values:
                continue

            print(specific_behavioral_ontology_value)

            p_value = ttest_p_value(day_df, night_df, specific_behavioral_ontology_value)
            store_p_values[specific_behavioral_ontology_value] = p_value

    return store_p_values

In [20]:
# results maps the tested behavioral column name to the p-value returned by ttest_p_value().
results = table_one_behav(table_1_behavioral_sqs, behavioral_ontology_values, behav_day_df, behav_night_df, target_variable)

bfi_Neuroticism


In [21]:
results

{'bfi_Neuroticism': 0.1934542158101793}

In [31]:
import csv

def dict_to_csv(data, filename, target_variable):
    '''Write a dict to a two-row CSV file: a header row of keys, then a row of values

    The file is written next to `filename`, with "<target_variable>_" inserted
    in front of the file name (e.g. "../output.csv" with target "stai" becomes
    "../stai_output.csv"). This works for any directory and file-name length.

    Arguments:
    data -- dict whose keys become the header row and whose values become the data row
    filename -- str, base output path
    target_variable -- str, prefix added to the file name

    Return:
    None (the file is created or overwritten, and progress is printed)
    '''
    import os  # local import so this cell does not depend on edits to the notebook's import cell

    # Split on the real file-name boundary rather than fixed character offsets,
    # keeping the directory part (including its separator) exactly as given.
    base_name = os.path.basename(filename)
    directory_prefix = filename[:len(filename) - len(base_name)]
    update_filename = directory_prefix + target_variable + '_' + base_name
    print(update_filename)
    keys = data.keys()
    values = data.values()
    print(keys)
    with open(update_filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(keys)
        writer.writerow(values)

    print(f"CSV file '{update_filename}' has been created successfully.")

In [32]:
# Writes results to a CSV whose name is output_csv's file name prefixed with the target variable.
dict_to_csv(results, output_csv, target_variable)

../bfi_Neuroticism_output.csv
dict_keys(['bfi_Neuroticism'])
CSV file '../bfi_Neuroticism_output.csv' has been created successfully.


In [33]:
import json

def dict_to_json(data, filename, target_variable):
    '''Serialise a dict to a JSON file

    The file is written next to `filename`, with "<target_variable>_" inserted
    in front of the file name (e.g. "../output.json" with target "stai" becomes
    "../stai_output.json"). This works for any directory and file-name length.

    Arguments:
    data -- dict passed to json.dump()
    filename -- str, base output path
    target_variable -- str, prefix added to the file name

    Return:
    None (the file is created or overwritten, and progress is printed)
    '''
    import os  # local import so this cell does not depend on edits to the notebook's import cell

    # Split on the real file-name boundary rather than fixed character offsets,
    # keeping the directory part (including its separator) exactly as given.
    base_name = os.path.basename(filename)
    directory_prefix = filename[:len(filename) - len(base_name)]
    update_filename = directory_prefix + target_variable + '_' + base_name
    print(update_filename)
    with open(update_filename, 'w') as file:
        json.dump(data, file)

    print(f"JSON file '{update_filename}' has been created successfully.")

In [34]:
# Writes results to a JSON file whose name is output_json's file name prefixed with the target variable.
dict_to_json(results, output_json, target_variable)

../bfi_Neuroticism_output.json
JSON file '../bfi_Neuroticism_output.json' has been created successfully.
