# Table 1
- Py kernel

In [32]:
import pandas as pd
import numpy as np
import pingouin as pg
from statsmodels.formula.api import ols
import statsmodels.api as sm
from scipy import stats

# Load Data

In [2]:
base = "/Users/brinkley97/Documents/development/lab-kcad/"
path_to_file = "datasets/tiles_dataset/table_1/"
name_of_file = "nurse_data.csv"
file = base + path_to_file + name_of_file

In [3]:
def load_data(file):
    original_data = pd.read_csv(file)
    # original_data = pd.DataFrame(file)
    copy_of_data = original_data.copy()
    return copy_of_data

In [4]:
nurse_df = load_data(file)
# nurse_df

In [5]:
day_shift = nurse_df['Shift'] == 'Day shift'
day_df = nurse_df.loc[day_shift]
# day_df.head(5)

In [6]:
night_shift = nurse_df['Shift'] == 'Night shift'
night_df = nurse_df.loc[night_shift]
# night_df.head(5)

# Load Generated Specific Questions

In [7]:
path_to_questions = "tiles-day-night/my_code"
name_of_questions = "generateSpecificQuestions.ipynb"
questions = base + path_to_questions + name_of_questions

In [8]:
# %load questions
/Users/brinkley97/Documents/development/lab-kcad/tiles-day-night/my_codegenerateSpecificQuestions.ipynb

In [9]:
%run "generateSpecificQuestions.ipynb"

In [10]:
demo_specific_questions = list(t_test_questions.values())[0]
# demo_specific_questions

# Run analysis

## Fisher Odds Test - Table 1

In [11]:
def fisher_odds_p_value(first_df, second_df, demo, demo_option1, demo_option2, print_col):
    tabel_df = pd.DataFrame(index=['day', 'night'], columns=[demo_option1, demo_option2])
    tabel_df.loc['day', demo_option1] = len(first_df.loc[first_df[demo] == demo_option1])
    tabel_df.loc['day', demo_option2] = len(first_df.loc[first_df[demo] == demo_option2])
    tabel_df.loc['night', demo_option1] = len(second_df.loc[second_df[demo] == demo_option1])
    tabel_df.loc['night', demo_option2] = len(second_df.loc[second_df[demo] == demo_option2])
    oddsratio, pvalue = stats.fisher_exact(np.array(tabel_df))
    # print(tabel_df)
    # print()
    
    return 'oddsratio : %.3f, pvalue: %.3f \n' % (oddsratio, pvalue)

In [12]:
def table_one_demo(demo_specific_questions):
    key_words = []
    p_values = []

    for demo_specific_question in demo_specific_questions:
        # print(demo_specific_question)
        demos = list(key_words_in_study["demographic"])
        # print(demos)
        demo_values = list(key_words_in_study.values())[2]
        # print("demo_values : ", demo_values)
        for demo in demos:
            # print(demo)
            if demo in demo_specific_question.split("*"):
                # print(demo, True)
                # if demo != "Age":
                demo_option1 = demo_values[demo][0]
                # print("demo_option1 : ", demo_option1)
                demo_option2 = demo_values[demo][1]
                # print("demo_option2 : ", demo_option2)
                p_value = fisher_odds_p_value(day_df, night_df, demo=demo, demo_option1=demo_option1, demo_option2=demo_option2, print_col=demo)
                # print(p_value)
                p_values.append(demo)
                p_values.append(p_value)
                # else:
                #     p_value = fisher_odds_p_value(day_df, night_df, demo="age", demo_option1='', demo_option2=demo_option2, print_col=demo)
                #     p_values.append(demo)
                #     p_values.append(p_value)
            else:
                continue
        # print("") 
    return p_values

In [13]:
table_one_demo(demo_specific_questions)

['Gender',
 'oddsratio : 1.188, pvalue: 0.829 \n',
 'Age',
 'oddsratio : 0.517, pvalue: 0.160 \n',
 'Educ',
 'oddsratio : 1.967, pvalue: 0.215 \n',
 'native_lang',
 'oddsratio : nan, pvalue: 1.000 \n']

In [14]:
behav_specific_questions = list(t_test_questions.values())[1]
behav_specific_questions

['what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *age* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *stai* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *pan_PosAffect* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *pan_NegAffect* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *swls* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *bfi_Neuroticism* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *bfi_Conscientiousness* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *bfi_Extraversion* ?',
 'what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *bfi_Agre

In [15]:
def ttest_p_value(all_df, first_df, second_df, demo, print_col, end=False, stat='num'):
    
    p_values = []
    result = pg.ttest(np.array(first_df[demo].dropna()), np.array(second_df[demo].dropna()))
    p_values.append(demo)
    
    p = result['p-val'].values[0]
    p_values.append(p)
    
    return p_values

In [39]:
def anova(igtb_df, col, factor1, factor2):
    data_df = igtb_df[[col]+['shift', factor1, factor2]]
    data_df = data_df.dropna()

    data_df[factor1] = pd.get_dummies(data_df[factor1], drop_first=True)
    data_df[factor2] = pd.get_dummies(data_df[factor2], drop_first=True)
    data_df['shift'] = pd.get_dummies(data_df['shift'], drop_first=True)

    print(col)
    lm = ols(col + ' ~ shift +' + factor1 + ' + ' + factor2, data = data_df).fit()
    
    table = sm.stats.anova_lm(lm, typ=2)  # Type 2 ANOVA DataFrame
    print(table)

    print()

In [47]:
def table_one_behav(behav_specific_questions):
    p_values = []
    behav_values = list(key_words_in_study.values())[3]
    # print("behav_values : ", behav_values)
    
    behavs = list(key_words_in_study["behavioral"])
    # print(behavs)
    
    for behav_specific_question in behav_specific_questions:
        print("\n", behav_specific_question)
        
        for behav in behavs:
            print("\nbehav : ", behav)
            
            if behav in behav_specific_question.split("*"):
                print("\nbehav : ", behav)
                p_value = ttest_p_value(nurse_df, day_df, night_df, behav, behav, end=False, stat='num')
                # print(p_value)
                p_values.append(p_value)
                behavs.remove(behav)
                # print("\n", behavs)
                if behav != "age":
                    anov = anova(nurse_df, col=behav, factor1='Age', factor2='Gender')
                    # print("anov: ", anov)
                    
                
    return p_values

In [48]:
table_one_behav(behav_specific_questions)


 what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *age* ?

behav :  age

behav :  pan_PosAffect

behav :  pan_NegAffect

behav :  swls

behav :  bfi_Neuroticism

behav :  bfi_Conscientiousness

behav :  bfi_Extraversion

behav :  bfi_Agreeableness

behav :  bfi_Openness

behav :  psqi

 what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *stai* ?

behav :  stai
stai
               sum_sq     df         F    PR(>F)
shift      107.480852    1.0  1.557112  0.214787
Age         59.308490    1.0  0.859222  0.356023
Gender       6.651424    1.0  0.096361  0.756839
Residual  7454.784921  108.0       NaN       NaN


behav :  pan_NegAffect

behav :  swls

behav :  bfi_Neuroticism

behav :  bfi_Conscientiousness

behav :  bfi_Extraversion

behav :  bfi_Agreeableness

behav :  bfi_Openness

behav :  psqi

 what are differences in primarily *day-shift* nurses and primarily *night-shift* nurses for *pan_PosAffect* ?

[['age', 0.06689772023038969],
 ['stai', 0.13029739484895286],
 ['pan_PosAffect', 0.7641569165374072],
 ['pan_NegAffect', 0.0649083906530571],
 ['swls', 0.030358697165230476],
 ['bfi_Neuroticism', 0.16565089264601718],
 ['bfi_Conscientiousness', 0.6011217803816151],
 ['bfi_Extraversion', 0.6000875364762028],
 ['bfi_Agreeableness', 0.9944573236985323],
 ['bfi_Openness', 0.0582800859451143],
 ['psqi', 0.0040333134698801254]]

## Integration with R

In [2]:
affect_cols = ['pan_NegAffect', 'swls', 'psqi']
affect_cols = ['stai', 'pan_PosAffect', 'pan_NegAffect', 'swls', 'bfi_Neuroticism', 'bfi_Conscientiousness', 'bfi_Extraversion', 'bfi_Agreeableness', 'bfi_Openness', 'psqi']

In [3]:
affect_cols

['stai',
 'pan_PosAffect',
 'pan_NegAffect',
 'swls',
 'bfi_Neuroticism',
 'bfi_Conscientiousness',
 'bfi_Extraversion',
 'bfi_Agreeableness',
 'bfi_Openness',
 'psqi']

In [None]:
def anova_behavioral(behav_specific_questions):
    
    