# Reproduce Table 1
- Py kernel
- Verify the results by comparing them against [Table 1 in the original analysis paper](https://www.nature.com/articles/s41598-021-87029-w) and by running `python3 demographics_day_night.py` ([original version](https://github.com/usc-sail/tiles-day-night/blob/main/code/ground_truth/demographics_day_night.py)). Be sure to configure your file paths first.

In [1]:
import pandas as pd
import numpy as np
import pingouin as pg

from scipy import stats

# Load Data

In [2]:
# Synthetic demographic data for Table 1, one CSV per shift
# (columns shown below: Gender, Shift, Age, Educ, native_lang).
path_to_demographic_day_file = "../data/tiles_datasets/table_1_synthetic_data/day_table_1_synthetic_demographic_data.csv"
path_to_demographic_night_file = "../data/tiles_datasets/table_1_synthetic_data/night_table_1_synthetic_demographic_data.csv"

# Synthetic behavioral data for Table 1, one CSV per shift
# (columns shown below: stai, pan_*, swls, bfi_*, psqi).
path_to_behaviorial_day_file = "../data/tiles_datasets/table_1_synthetic_data/day_table_1_synthetic_behavioral_data.csv"
path_to_behaviorial_night_file = "../data/tiles_datasets/table_1_synthetic_data/night_table_1_synthetic_behavioral_data.csv"

In [3]:
def load_data(file):
    '''Read a CSV file and return its contents as an independent DataFrame

    Arguments:
    file -- path or file-like object accepted by pd.read_csv()

    Return:
    copy of the parsed data -- Pandas DataFrame
    '''

    return pd.read_csv(file).copy()

In [4]:
demo_day_df = load_data(path_to_demographic_day_file)
demo_day_df

Unnamed: 0,Gender,Shift,Age,Educ,native_lang
0,Female,Day shift,< 40 Years,Graduate,Yes
1,Female,Day shift,< 40 Years,Graduate,Yes
2,Female,Day shift,< 40 Years,Graduate,Yes
3,Female,Day shift,< 40 Years,Graduate,Yes
4,Female,Day shift,< 40 Years,Graduate,Yes
...,...,...,...,...,...
64,Female,Day shift,< 40 Years,Graduate,Yes
65,Female,Day shift,< 40 Years,Graduate,Yes
66,Female,Day shift,< 40 Years,Graduate,Yes
67,Female,Day shift,< 40 Years,Graduate,Yes


In [5]:
demo_night_df = load_data(path_to_demographic_night_file)
demo_night_df

Unnamed: 0,Gender,Shift,Age,Educ,native_lang
0,Male,Night shift,>= 40 Years,Some college or College,No
1,Male,Night shift,>= 40 Years,Some college or College,No
2,Male,Night shift,>= 40 Years,Some college or College,No
3,Male,Night shift,>= 40 Years,Some college or College,No
4,Male,Night shift,>= 40 Years,Some college or College,No
5,Male,Night shift,>= 40 Years,Some college or College,No
6,Male,Night shift,>= 40 Years,Some college or College,No
7,Male,Night shift,>= 40 Years,Some college or College,No
8,Male,Night shift,>= 40 Years,Some college or College,No
9,Male,Night shift,>= 40 Years,Some college or College,No


In [6]:
behav_day_df = load_data(path_to_behaviorial_day_file)
behav_day_df

Unnamed: 0,stai,pan_PosAffect,pan_NegAffect,swls,bfi_Neuroticism,bfi_Conscientiousness,bfi_Extraversion,bfi_Agreeableness,bfi_Openness,psqi
0,41.452762,30.576441,18.211018,6.380579,1.949203,4.192199,1.667048,3.416390,3.462566,5.020155
1,29.452762,37.576441,20.211018,6.180579,0.949203,3.942199,1.917048,3.499723,4.545899,2.020155
2,32.452762,38.576441,23.211018,2.180579,1.865869,4.025533,2.167048,3.749723,3.379233,4.020155
3,33.452762,33.576441,19.211018,5.580579,2.032536,3.025533,2.750381,1.666390,4.045899,7.020155
4,24.452762,43.576441,16.211018,5.580579,1.032536,2.942199,3.583714,3.499723,3.712566,4.020155
...,...,...,...,...,...,...,...,...,...,...
64,43.452762,14.576441,17.211018,7.380579,2.282536,3.275533,1.333714,3.416390,1.545899,8.020155
65,23.452762,36.576441,16.211018,7.380579,1.282536,4.192199,2.583714,3.916390,3.879233,3.020155
66,26.452762,48.576441,22.211018,7.380579,0.782536,4.108866,2.667048,3.666390,3.462566,5.020155
67,42.452762,35.576441,21.211018,4.980579,2.199203,1.858866,2.583714,3.166390,3.295899,7.020155


In [7]:
behav_night_df = load_data(path_to_behaviorial_night_file)
behav_night_df

Unnamed: 0,stai,pan_PosAffect,pan_NegAffect,swls,bfi_Neuroticism,bfi_Conscientiousness,bfi_Extraversion,bfi_Agreeableness,bfi_Openness,psqi
0,26.851446,52.88419,27.189939,6.171653,0.765521,3.27176,5.399209,4.163626,2.884359,7.228901
1,57.851446,38.88419,37.189939,5.371653,2.348854,2.938426,2.815875,3.413626,2.717693,9.228901
2,40.851446,48.88419,19.189939,3.971653,0.765521,4.52176,5.065875,4.413626,3.217693,4.228901
3,22.851446,55.88419,17.189939,7.771653,0.515521,4.52176,5.649209,4.74696,3.467693,7.228901
4,26.851446,46.88419,20.189939,2.571653,0.848854,3.855093,4.649209,4.663626,3.551026,7.228901
5,26.851446,43.88419,19.189939,6.771653,0.932187,3.105093,4.232542,4.163626,2.717693,7.228901
6,29.851446,39.88419,19.189939,6.771653,0.765521,4.105093,4.565875,4.580293,2.717693,8.228901
7,49.851446,33.88419,29.189939,3.971653,1.848854,3.02176,3.482542,4.163626,2.134359,12.228901
8,41.851446,34.88419,25.189939,5.971653,1.515521,4.188426,3.815875,3.663626,1.967693,13.228901
9,35.851446,47.88419,22.189939,6.371653,1.182187,3.355093,4.565875,4.080293,3.301026,9.228901


# Load Generated Specific Questions

In [8]:
%run "../generateSpecificQuestions.ipynb"

In [9]:
table_1_demographic_sqs

['what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Gender* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Age* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Educ* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *native_lang* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Female* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Male* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *< 40 Years* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *>= 40 Years* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Some college or College* ?',
 'what are di

In [10]:
table_1_demographic_sqs, ontology_mappings

(['what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Gender* ?',
  'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Age* ?',
  'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Educ* ?',
  'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *native_lang* ?',
  'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Female* ?',
  'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Male* ?',
  'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *< 40 Years* ?',
  'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *>= 40 Years* ?',
  'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *Some college or College* ?',
  'w

In [11]:
# Demographic names to test; these are matched against the starred terms of each specific question.
demographic_ontology_keys = list(ontology_mappings["demographic"])
# NOTE(review): positional lookup -- this takes the 7th value stored in ontology_mappings and will
# silently pick a different entry if that dict's insertion order changes. It is later indexed as
# demographic_ontology_values[key][0] and [key][1], so it is presumably a mapping from each
# demographic key to its two groups -- TODO confirm and replace the index with an explicit key.
demographic_ontology_values = list(ontology_mappings.values())[6]

# The demographic section above needs separate lookups for its keys and its values, whereas a single
# list is enough here, because the two sections are stored differently in the ontology_mappings dict
# defined in generateSpecificQuestions.ipynb.
behavioral_ontology_values = list(ontology_mappings["behavioral"])

# Run Analysis

## Fisher Odds Test - Table 1

In [12]:
def fisher_odds_p_value(day_shift_df, night_shift_df, specific_demographic_ontology_key, demo_group_1, demo_group_2):
    '''Run stats.fisher_exact() on the 2x2 table of shift (day/night) by demographic group

    Arguments:
    day_shift_df -- Pandas DataFrame
    night_shift_df -- Pandas DataFrame
    specific_demographic_ontology_key -- str naming a column present in both Pandas DataFrames
    demo_group_1 -- value of that column counted in the first column of the table
    demo_group_2 -- value of that column counted in the second column of the table

    Return:
    oddsratio and pvalue, formatted to three decimals -- str
    '''

    column = specific_demographic_ontology_key
    groups = (demo_group_1, demo_group_2)

    # Rows are day then night; columns are demo_group_1 then demo_group_2.
    contingency = np.array([
        [int((shift_df[column] == group).sum()) for group in groups]
        for shift_df in (day_shift_df, night_shift_df)
    ])

    oddsratio, pvalue = stats.fisher_exact(contingency)

    return 'oddsratio : %.3f, pvalue: %.3f \n' % (oddsratio, pvalue)

In [13]:
def table_one_demo_ontologies(demo_specific_questions, demographic_ontology_keys, demographic_ontology_values, day_df, night_df):
    '''Run fisher_odds_p_value() for every demographic key named in a specific question

    Argument:
    demo_specific_questions -- py list of str, with the terms of interest wrapped in "*"
    demographic_ontology_keys -- py list of column names to look for in the questions
    demographic_ontology_values -- indexed by key; positions 0 and 1 are the two groups to compare
    day_df -- Pandas DataFrame passed through to fisher_odds_p_value()
    night_df -- Pandas DataFrame passed through to fisher_odds_p_value()

    Functions:
    fisher_odds_p_value()

    Return:
    flat list alternating each matched key with the str returned by fisher_odds_p_value() -- list
    '''

    results = []

    for question in demo_specific_questions:
        # Splitting on "*" isolates the starred terms as separate list items.
        starred_terms = question.split("*")

        for key in demographic_ontology_keys:
            if key not in starred_terms:
                continue

            group_pair = demographic_ontology_values[key]
            first_group, second_group = group_pair[0], group_pair[1]

            results.extend([
                key,
                fisher_odds_p_value(
                    day_df,
                    night_df,
                    specific_demographic_ontology_key=key,
                    demo_group_1=first_group,
                    demo_group_2=second_group,
                ),
            ])

    return results

In [14]:
table_one_demo_ontologies(table_1_demographic_sqs, demographic_ontology_keys, demographic_ontology_values, demo_day_df, demo_night_df)

['Gender',
 'oddsratio : inf, pvalue: 0.000 \n',
 'Age',
 'oddsratio : inf, pvalue: 0.000 \n',
 'Educ',
 'oddsratio : 0.000, pvalue: 0.000 \n',
 'native_lang',
 'oddsratio : nan, pvalue: 1.000 \n']

## TTest - Table 1

In [15]:
table_1_behavioral_sqs

['what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *stai* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *pan_PosAffect* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *pan_NegAffect* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *swls* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *bfi_Neuroticism* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *bfi_Conscientiousness* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *bfi_Extraversion* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *bfi_Agreeableness* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses

In [16]:
def ttest_p_value(day_df, night_df, specific_behavioral_ontology_value):
    '''Compare one behavioral column between day and night shift using the pg.ttest() function

    Arguments:
    day_df -- Pandas DataFrame
    night_df -- Pandas DataFrame
    specific_behavioral_ontology_value -- str naming a column present in both Pandas DataFrames

    Return:
    [column name, p-value of the first row of the pg.ttest() result] -- list
    '''

    column = specific_behavioral_ontology_value

    # Missing values are dropped from each sample independently before testing.
    day_scores = np.array(day_df[column].dropna())
    night_scores = np.array(night_df[column].dropna())

    p_val = pg.ttest(day_scores, night_scores)['p-val'].values[0]

    return [column, p_val]

In [17]:
def table_one_behav(behav_specific_questions, behavioral_ontology_values, day_df, night_df):
    '''Run ttest_p_value() for every behavioral measure named in a specific question

    Each measure is tested at most once, the first time a question names it.
    The caller's behavioral_ontology_values list is left untouched, so the
    function can be called repeatedly with the same list.

    Argument:
    behav_specific_questions -- list of str, with the terms of interest wrapped in "*"
    behavioral_ontology_values -- list of column names to look for in the questions
    day_df -- Pandas DataFrame passed through to ttest_p_value()
    night_df -- Pandas DataFrame passed through to ttest_p_value()

    Functions:
    ttest_p_value()

    Return:
    one ttest_p_value() result per matched measure, in the order matched -- list
    '''

    store_p_values = []

    # Work on a private, de-duplicated copy: removing items from the caller's
    # list while iterating over it skips elements and empties the list for
    # any later call.
    remaining_values = list(dict.fromkeys(behavioral_ontology_values))

    for behav_specific_question in behav_specific_questions:

        # Splitting on "*" isolates the starred terms as separate list items.
        starred_terms = behav_specific_question.split("*")
        still_remaining = []

        for specific_behavioral_ontology_value in remaining_values:

            if specific_behavioral_ontology_value in starred_terms:
                store_p_values.append(ttest_p_value(day_df, night_df, specific_behavioral_ontology_value))
            else:
                still_remaining.append(specific_behavioral_ontology_value)

        remaining_values = still_remaining

    return store_p_values

In [18]:
table_one_behav(table_1_behavioral_sqs, behavioral_ontology_values, behav_day_df, behav_night_df)

[['stai', 0.020308341516518114],
 ['pan_PosAffect', 3.7205324990342146e-08],
 ['pan_NegAffect', 9.511573995076765e-05],
 ['swls', 0.5061522953572187],
 ['bfi_Neuroticism', 0.1934542158101793],
 ['bfi_Conscientiousness', 0.5803018463984015],
 ['bfi_Extraversion', 2.203683011689185e-19],
 ['bfi_Agreeableness', 6.863540699203524e-19],
 ['bfi_Openness', 1.4469393051328919e-06],
 ['psqi', 3.4536974136942344e-11]]