# Reproduce Table 1
- Python kernel
- Verify the results by comparing them with [Table 1 in the original analysis paper](https://www.nature.com/articles/s41598-021-87029-w?proof=t%25C2%25A0) and by running `python3 demographics_day_night.py` ([original version](https://github.com/usc-sail/tiles-day-night/blob/main/code/ground_truth/demographics_day_night.py)). Be sure to configure your file paths first.

In [1]:
import pandas as pd
import numpy as np
import pingouin as pg

from scipy import stats

# Load Data

In [2]:
# Relative path to the CSV that is loaded into nurse_df below; adjust it to match where the data lives on your machine.
path_to_file = "../data/tiles_datasets/table_1/nurse_data.csv"

In [3]:
def load_data(file):
    '''Read a CSV file and hand back an independent copy of its contents

    Arguments:
    file -- path or file-like object accepted by pd.read_csv()

    Return:
    Pandas DataFrame holding a copy of the parsed CSV data
    '''
    return pd.read_csv(file).copy()

In [4]:
nurse_df = load_data(path_to_file)
# nurse_df

In [5]:
# Boolean mask marking the rows whose 'Shift' column reads 'Day shift', then the matching rows.
day_shift = nurse_df['Shift'].eq('Day shift')
day_df = nurse_df[day_shift]
# day_df.head(5)

In [6]:
# Boolean mask marking the rows whose 'Shift' column reads 'Night shift', then the matching rows.
night_shift = nurse_df['Shift'].eq('Night shift')
night_df = nurse_df[night_shift]
# night_df.head(5)

# Load Generated Specific Questions

In [7]:
%run "../generateSpecificQuestions.ipynb"

In [8]:
table_1_demographic_sqs

['what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Gender* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Age* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Educ* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *native_lang* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Female* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Male* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *< 40 Years* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *>= 40 Years* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Some college or College* ?',
 'what are di

In [9]:
table_1_demographic_sqs, ontology_mappings

(['what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Gender* ?',
  'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Age* ?',
  'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Educ* ?',
  'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *native_lang* ?',
  'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Female* ?',
  'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Male* ?',
  'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *< 40 Years* ?',
  'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *>= 40 Years* ?',
  'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Some college or College* ?',
  'w

In [10]:
# Items obtained by iterating ontology_mappings["demographic"].
demographic_ontology_keys = list(ontology_mappings["demographic"])
# NOTE(review): this takes the 7th value of ontology_mappings by position, so it relies on
# the dict's insertion order. The result is later indexed by demographic key and then by
# [0] / [1] in table_one_demo_ontologies() -- confirm the layout against
# generateSpecificQuestions.ipynb.
demographic_ontology_values = list(ontology_mappings.values())[6]

# The analogous demographic line above is named "..._keys" while this one is named
# "..._values" because the two sections are stored differently in the ontology_mappings
# dict in generateSpecificQuestions.ipynb.
behavioral_ontology_values = list(ontology_mappings["behavioral"])

# Run Analysis

## Fisher's Exact Test (Odds Ratio) - Table 1

In [11]:
def fisher_odds_p_value(day_shift_df, night_shift_df, specific_demographic_ontology_key, demo_group_1, demo_group_2):
    '''Run stats.fisher_exact() on a 2x2 shift-by-group count table and format the result

    The table rows are day shift / night shift. The table columns are the number of rows
    whose specific_demographic_ontology_key column equals demo_group_1 / demo_group_2.

    Arguments:
    day_shift_df -- Pandas DataFrame
    night_shift_df -- Pandas DataFrame
    specific_demographic_ontology_key -- column name looked up in both Pandas DataFrames
    demo_group_1 -- value of that column counted in the first table column
    demo_group_2 -- value of that column counted in the second table column

    Return:
    str of the form 'oddsratio : <x>, pvalue: <y> \\n' with both numbers shown to 3 decimals
    '''

    def count_groups(shift_df):
        # One table row: how many entries of the column match each of the two groups.
        labels = shift_df[specific_demographic_ontology_key]
        return [int((labels == group).sum()) for group in (demo_group_1, demo_group_2)]

    contingency_table = [count_groups(day_shift_df), count_groups(night_shift_df)]
    oddsratio, pvalue = stats.fisher_exact(contingency_table)

    return 'oddsratio : %.3f, pvalue: %.3f \n' % (oddsratio, pvalue)

In [12]:
def table_one_demo_ontologies(demo_specific_questions, demographic_ontology_keys, demographic_ontology_values):
    '''Run fisher_odds_p_value() for every demographic key that appears in a specific question

    A key counts as appearing in a question when it equals one of the pieces obtained by
    splitting the question on "*". The test is run on the notebook-level day_df and night_df.

    Argument:
    demo_specific_questions -- py list of question strings
    demographic_ontology_keys -- py list of column names to look for in each question
    demographic_ontology_values -- indexed by key; items [0] and [1] of each entry are the two groups compared

    Functions:
    fisher_odds_p_value()

    Return:
    flat list alternating each matched key and the result string from fisher_odds_p_value()
    '''

    results = []

    for question in demo_specific_questions:
        # Split once per question; the pieces do not change while the keys are scanned.
        question_tokens = question.split("*")
        matched_keys = [key for key in demographic_ontology_keys if key in question_tokens]

        for key in matched_keys:
            group_pair = demographic_ontology_values[key]
            outcome = fisher_odds_p_value(
                day_df,
                night_df,
                specific_demographic_ontology_key=key,
                demo_group_1=group_pair[0],
                demo_group_2=group_pair[1],
            )
            results.extend([key, outcome])

    return results

In [13]:
table_one_demo_ontologies(table_1_demographic_sqs, demographic_ontology_keys, demographic_ontology_values)

['Gender',
 'oddsratio : 1.188, pvalue: 0.829 \n',
 'Age',
 'oddsratio : 0.517, pvalue: 0.160 \n',
 'Educ',
 'oddsratio : 1.967, pvalue: 0.215 \n',
 'native_lang',
 'oddsratio : nan, pvalue: 1.000 \n']

## TTest - Table 1

In [14]:
table_1_behavioral_sqs

['what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *stai* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *pan_PosAffect* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *pan_NegAffect* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *swls* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *bfi_Neuroticism* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *bfi_Conscientiousness* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *bfi_Extraversion* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *bfi_Agreeableness* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses

In [15]:
def ttest_p_value(day_df, night_df, specific_behavioral_ontology_value):
    '''Compare one behavioral column between the two shifts with pg.ttest()

    Missing values are dropped from each shift's column before the test is run.

    Arguments:
    day_df -- Pandas DataFrame
    night_df -- Pandas DataFrame
    specific_behavioral_ontology_value -- str which is used as a column in the both Pandas DataFrames

    Return:
    two-item list: the column name, then the first entry of the 'p-val' column of the pg.ttest() result
    '''

    column = specific_behavioral_ontology_value
    day_scores = np.array(day_df[column].dropna())
    night_scores = np.array(night_df[column].dropna())

    ttest_table = pg.ttest(day_scores, night_scores)

    return [column, ttest_table['p-val'].values[0]]

In [16]:
def table_one_behav(behav_specific_questions, behavioral_ontology_values):
    '''Run ttest_p_value() once for each behavioral column that appears in a specific question

    A column counts as appearing in a question when it equals one of the pieces obtained by
    splitting the question on "*". The test is run on the notebook-level day_df and night_df.
    Each column is tested at most once, at the first question that mentions it.

    Argument:
    behav_specific_questions -- list of question strings
    behavioral_ontology_values -- list of column names to look for; it is not modified

    Functions:
    ttest_p_value()

    Return:
    list of the [column name, p-value] lists returned by ttest_p_value(), in question order
    '''

    store_p_values = []
    # Track handled columns locally instead of removing them from the caller's list:
    # removing from a list while iterating over it skips the element after each match,
    # and it left the caller's list empty so a second call returned nothing.
    already_tested = set()

    for behav_specific_question in behav_specific_questions:
        question_tokens = behav_specific_question.split("*")

        for specific_behavioral_ontology_value in behavioral_ontology_values:
            if specific_behavioral_ontology_value in already_tested:
                continue
            if specific_behavioral_ontology_value not in question_tokens:
                continue

            p_value = ttest_p_value(day_df, night_df, specific_behavioral_ontology_value)
            store_p_values.append(p_value)
            already_tested.add(specific_behavioral_ontology_value)

    return store_p_values

In [17]:
table_one_behav(table_1_behavioral_sqs, behavioral_ontology_values)

[['stai', 0.13029739484895286],
 ['pan_PosAffect', 0.7641569165374072],
 ['pan_NegAffect', 0.0649083906530571],
 ['swls', 0.030358697165230476],
 ['bfi_Neuroticism', 0.16565089264601718],
 ['bfi_Conscientiousness', 0.6011217803816151],
 ['bfi_Extraversion', 0.6000875364762028],
 ['bfi_Agreeableness', 0.9944573236985323],
 ['bfi_Openness', 0.0582800859451143],
 ['psqi', 0.0040333134698801254]]