In [3]:
import math

import numpy as np
import pandas as pd
import statsmodels as sm
import statsmodels.stats.weightstats

In [111]:
class Assari2019Baseline:
    DEFAULT_PAPER_ATTRIBUTES = {
        'id': 'assari2019baseline',
        'length_pages': 15,
        'authors': ['Shervin Assari', 'Mohsen Bazargan'],
        'journal': 'International Journal of Environmental Research and Public Health',
        'year': 2019,
        'current_citations': 9, #number of citations the paper has or how many people have cited it?
        'base_dataframe_pickle': 'assari2019ability_dataframe.pickle'
    }

    RACE_MAP = {
        1: "White",
        2: "Black"
    }

    GENDER_MAP = {
        1: "Man",
        2: "Woman"
    }

    FILENAME = 'assari2019baseline'

    COLUMN_MAP =  {"V2102": "Race", "V103": "Gender", "V2000": "Age", "V2007": "Education", "V2020": "Income", "V2637": "Smoking", "V2623": "BMI", "V2681": "HTN", "V13214": "Exercise", "V2203": "Depressive symptoms", "V915": "Health", "V1860": "Weight", "V15003": "Response pattern", "V836": "Stroke wave 1", "V4838": "Stroke wave 2", "V10225": "Stroke wave 3", "V12305": "Stroke wave 4", "V15944": "Stroke wave 5", "V12302": "Any stroke"}

    corr_df = None
    means = None
    dead = None

    def __init__(self):
        self.dataframe = self._recreate_dataframe()

    def _get_any_stroke_if_died(self, x):
        response_pattern = str(x["Response pattern"])
        if "4" not in response_pattern:
            return 0 # patient did not die
        for i in range(5):
            if x[f"Stroke wave {i + 1}"] == 1:
                return 1
        return 0
    
    def _recreate_dataframe(self, filename='assari2019baseline_dataframe.pickle'):
        data = pd.read_csv('data/DS0001/04690-0001-Data.tsv', sep='\t')

        data = data[self.COLUMN_MAP.keys()]
        data.rename(columns=self.COLUMN_MAP, inplace=True)

        data = data[(data["Race"] == 1) | (data["Race"] == 2)] # 1 = white, 2 = Black
        data["Educational attainment"] = data.apply(lambda x: 1 if x["Education"] >= 12 else 0, axis=1)
        data["Obesity"] = data.apply(lambda x: 1 if x["BMI"] > 30 else 0, axis=1)
        data["Health binary"] = data.apply(lambda x: 1 if x["Health"] in [1, 2, 3] else 0, axis=1)
        data["Death to cerebrovascular disease"] = data.apply(lambda x: self._get_any_stroke_if_died(x), axis=1)
        data.drop(columns=['Stroke wave 1', 'Stroke wave 2', 'Stroke wave 3', 'Stroke wave 4', 'Stroke wave 5','Response pattern', 'Any stroke'], inplace=True)

        data.to_pickle(filename)
        return data
    
    def get_corr(self):
        if self.corr_df is None:
            corr_df = self.dataframe[['Race', 'Age', 'Gender', 'Education', 'Income', 'Smoking', 'Exercise', 'Depressive symptoms', 'HTN', 'Obesity', 'Death to cerebrovascular disease']]
            self.corr_df = corr_df.corr()
        return self.corr_df
    
    def get_race_pools_with_means(self):
        if self.means is None:
            black_pool = self.dataframe.loc[self.dataframe['Race'] == 2]
            white_pool = self.dataframe.loc[self.dataframe['Race'] == 1]

            black_pool_means, white_pool_means = self._get_adjusted_means(black_pool), self._get_adjusted_means(white_pool)
            means = pd.concat([black_pool_means, white_pool_means])
            means['Race'] = ['Black', 'White']
            means.set_index('Race', inplace=True)

            self.means = means
        return self.means
    
    def _get_adjusted_means(self, data_sample):
        temp_means = np.around(sm.stats.weightstats.DescrStatsW(data_sample, weights=data_sample['Weight']).mean, 4)
        return pd.DataFrame(data=[temp_means], columns=data_sample.columns)
    
    def get_dead(self):
        if self.dead is None:
            self.dead = self.dataframe.loc[self.dataframe['Death to cerebrovascular disease'] == 1]
        return self.dead

    def finding_5_1(self):
        """Blacks were younger, had higher number of chronic medical conditions at baseline in comparison to Whites."""
        means = self.get_race_pools_with_means()
        soft_finding = means['Age']['Black'] < means['Age']['White'] and means['HTN']['Black'] > means['HTN']['White']
        return soft_finding

    def finding_5_2(self):
        """Relative to White people, Black individuals had also lower educational attainment (p < 0.05 for all)."""
        means = self.get_race_pools_with_means()
        soft_finding = means['Education']['Black'] < means['Education']['White']
        return soft_finding

    def finding_5_3(self):
        """Blacks also reported worse self-rated health (SRH) than Whites (Table 1)."""
        means = self.get_race_pools_with_means()
        soft_finding = means['Health']['Black'] > means['Health']['White'] # note 1 = excellent, 5 = poor
        return soft_finding

    def finding_5_4(self):
        """The overall prevalence of DM was 5.73%, (95%CI = 4.80-6.82)."""
        pass
    
    def finding_5_5(self):
        """DM was more common in Blacks (9.22%, 95%CI = 7.75-10.95) than Whites (5.25%, 95%CI = 4.2.4-6.50)."""
        pass

    def finding_5_6(self):
        """Similarly, overall, people had 12.53 years of schooling at baseline (95%CI = 12.34-12.73)."""
        means = self._get_adjusted_means(self.dataframe)
        soft_finding = round(means['Education'][0], 2) == 12.53
        return soft_finding

    def finding_5_7(self):
        """A comparison of racial groups showed higher educational attainment in Whites (12.69, 95%CI=12.48-12.90) than Blacks (11.37,95%CI = 10.90-11.84). Thus, on average, Whites had more than 1.3 years higher years [sic] of schooling than Blacks..."""
        means = self.get_race_pools_with_means()
        soft_finding = means['Education']['White'] > means['Education']['Black'] + 1.3
        return soft_finding

    def finding_5_8(self):
        """Of the 177 that died, 121 were White (68.36%) and 56 were Black (31.64%)."""
        dead = self.get_dead()
        total = dead.shape[0]
        black_count = dead.loc[dead['Race'] == 2].shape[0]
        white_count = dead.loc[dead['Race'] == 1].shape[0]
        soft_finding = total == 177 and white_count == 121 and black_count == 56
        return soft_finding

    def finding_5_9(self):
        """Of the 177 that died, 33 were obese (18.64%) and 144 were not obese (81.36%) at baseline."""    
        dead = self.get_dead()
        total = dead.shape[0]
        obese_count = dead.loc[dead['Obesity'] == 1].shape[0]
        not_obese_count = dead.loc[dead['Obesity'] == 0].shape[0]
        soft_finding = total == 177 and obese_count == 33 and not_obese_count == 144
        return soft_finding

    def finding_6_1(self):
        """In bivariate association, race was not associated with death due to cerebrovascular (unadjusted HR for Blacks compared to Whites = 0.78, 95% CI = 0.55-1.11), suggesting that Whites and Blacks had similar risk of future cerebrovascular mortality over 25 years."""
        corr_df = self.get_corr()
        soft_finding = abs(corr_df['Race'].loc['Death to cerebrovascular disease']) < 0.05
        return soft_finding

    def finding_6_2(self):
        """In bivariate association, baseline obesity was not associated with future risk of cerebrovascular mortality (Unadjusted HR for Blacks compared to Whites = 0.84, 95% CI = 0.45-1.56), suggesting that Whites and Blacks had a similar risk of future cerebrovascular mortality over 25 years."""
        corr_df = self.get_corr()
        soft_finding = abs(corr_df['Obesity'].loc['Death to cerebrovascular disease']) < 0.05
        return soft_finding

    # TODO: check that race correlation is for Black
    def finding_6_3(self):
        """Race (Black) was negatively associated with education and income"""
        corr_df = self.get_corr()
        soft_finding = corr_df['Race'].loc['Education'] < 0 and corr_df['Race'].loc['Income'] < 0
        return soft_finding

    # TODO: check that race correlation is for Black
    def finding_6_4(self):
        """[race (Black) was]... positively associated with depressive symptoms, hypertension, and obesity."""
        corr_df = self.get_corr()
        soft_finding = corr_df['Race'].loc['Depressive symptoms'] > 0 and corr_df['Race'].loc['HTN'] > 0 and corr_df['Race'].loc['Obesity'] > 0
        return soft_finding

    # TODO: check that race correlation is for Black
    def finding_6_5(self):
        """Blacks more frequently smoked and less frequently exercised.""" # implies positive correlation with smoking and negative with exercise
        corr_df = self.get_corr()
        soft_finding = corr_df['Race'].loc['Smoking'] > 0 and corr_df['Race'].loc['Exercise'] < 0
        return soft_finding

    # TODO: check that race correlation is for Black
    def finding_6_6(self):
        """Race was not associated with cerebrovascular death.""" # same as finding_6_1?
        corr_df = self.get_corr()
        soft_finding = abs(corr_df['Race'].loc['Death to cerebrovascular disease']) < 0.05
        return soft_finding

    # TODO: check that gender correlation is for female
    def finding_6_7(self):
        """Baseline obesity was associated with female gender and less education, income, smoking, and exercise."""
        corr_df = self.get_corr()
        soft_finding = corr_df['Obesity'].loc['Gender'] > 0 and corr_df['Obesity'].loc['Education'] < 0 and corr_df['Obesity'].loc['Income'] < 0 and corr_df['Obesity'].loc['Smoking'] < 0 and corr_df['Obesity'].loc['Exercise'] < 0
        return soft_finding

    def finding_6_8(self):
        """Obesity at baseline was associated with depressive symptoms and hypertension at baseline."""
        corr_df = self.get_corr()
        soft_finding = corr_df['Obesity'].loc['Depressive symptoms'] > 0 and corr_df['Obesity'].loc['HTN'] > 0
        return soft_finding
    
    def finding_6_9(self):
        """Obesity at baseline was not associated with cerebrovascular death in the pooled sample (Table 2).""" # same as finding_6_2?
        corr_df = self.get_corr()
        soft_finding = abs(corr_df['Obesity'].loc['Death to cerebrovascular disease']) < 0.05
        return soft_finding

    def finding_6_10(self):
        """According to Model 1 in the pooled sample, baseline obesity did not predict cerebrovascular mortality (HR = 0.86, 0.49-1.51), independent of demographic, socioeconomic, health behaviors, and health factors at baseline."""
        pass

    def finding_6_11(self):
        """According to Model 2, race interacted with baseline obesity on outcome (HR = 3.17, 1.09-9.21), suggesting a stronger association between baseline obesity and future risk for cerebrovascular deaths for Blacks, in comparison to Whites (Table 3)."""
        pass

    def finding_6_12(self):
        """As Model 3 shows, obesity did not predict the outcome in Whites (HR = 0.69, 0.31-1.53)."""
        pass

    def finding_6_13(self):
        """Model 4 shows that obesity predicts risk of cerebrovascular mortality for Blacks (HR = 2.51, 1.43-4.39) (Table 4)."""
        pass

In [112]:
test = Assari2019Baseline()
# failed: 6_6, 6_9, 5_8, 5_9

In [114]:
test.corr_df

Unnamed: 0,Race,Age,Gender,Education,Income,Smoking,Exercise,Depressive symptoms,HTN,Obesity,Death to cerebrovascular disease
Race,1.0,-0.058398,0.051344,-0.264512,-0.286588,0.055734,-0.17561,-0.155352,0.084485,0.135792,0.004274
Age,-0.058398,1.0,0.108127,-0.387396,-0.308426,-0.213817,-0.42233,-0.24595,0.507932,0.017712,0.173127
Gender,0.051344,0.108127,1.0,-0.057066,-0.196086,-0.068141,0.004381,-0.155666,0.136775,0.077081,-0.021183
Education,-0.264512,-0.387396,-0.057066,1.0,0.523071,0.001297,0.336711,0.283073,-0.363098,-0.125635,-0.124938
Income,-0.286588,-0.308426,-0.196086,0.523071,1.0,-0.016068,0.343362,0.273791,-0.315807,-0.095488,-0.10066
Smoking,0.055734,-0.213817,-0.068141,0.001297,-0.016068,1.0,-0.019964,-0.053355,-0.105658,-0.052126,-0.020175
Exercise,-0.17561,-0.42233,0.004381,0.336711,0.343362,-0.019964,1.0,0.242355,-0.31892,-0.007201,-0.172113
Depressive symptoms,-0.155352,-0.24595,-0.155666,0.283073,0.273791,-0.053355,0.242355,1.0,-0.283244,-0.119387,-0.121523
HTN,0.084485,0.507932,0.136775,-0.363098,-0.315807,-0.105658,-0.31892,-0.283244,1.0,0.18529,0.199507
Obesity,0.135792,0.017712,0.077081,-0.125635,-0.095488,-0.052126,-0.007201,-0.119387,0.18529,1.0,0.040384


In [119]:
corr_df = test.dataframe[['Race', 'Age', 'Gender', 'Education', 'Income', 'Smoking', 'Exercise', 'Depressive symptoms', 'HTN', 'Obesity', 'Death to cerebrovascular disease']]
corr_df['Race'] = corr_df.apply(lambda x: x['Race'] - 1, axis=1)
corr_df['Gender'] = corr_df.apply(lambda x: x['Gender'] - 1, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corr_df['Race'] = corr_df.apply(lambda x: x['Race'] - 1, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corr_df['Gender'] = corr_df.apply(lambda x: x['Gender'] - 1, axis=1)


In [120]:
corr_df.corr()

Unnamed: 0,Race,Age,Gender,Education,Income,Smoking,Exercise,Depressive symptoms,HTN,Obesity,Death to cerebrovascular disease
Race,1.0,-0.058398,0.051344,-0.264512,-0.286588,0.055734,-0.17561,-0.155352,0.084485,0.135792,0.004274
Age,-0.058398,1.0,0.108127,-0.387396,-0.308426,-0.213817,-0.42233,-0.24595,0.507932,0.017712,0.173127
Gender,0.051344,0.108127,1.0,-0.057066,-0.196086,-0.068141,0.004381,-0.155666,0.136775,0.077081,-0.021183
Education,-0.264512,-0.387396,-0.057066,1.0,0.523071,0.001297,0.336711,0.283073,-0.363098,-0.125635,-0.124938
Income,-0.286588,-0.308426,-0.196086,0.523071,1.0,-0.016068,0.343362,0.273791,-0.315807,-0.095488,-0.10066
Smoking,0.055734,-0.213817,-0.068141,0.001297,-0.016068,1.0,-0.019964,-0.053355,-0.105658,-0.052126,-0.020175
Exercise,-0.17561,-0.42233,0.004381,0.336711,0.343362,-0.019964,1.0,0.242355,-0.31892,-0.007201,-0.172113
Depressive symptoms,-0.155352,-0.24595,-0.155666,0.283073,0.273791,-0.053355,0.242355,1.0,-0.283244,-0.119387,-0.121523
HTN,0.084485,0.507932,0.136775,-0.363098,-0.315807,-0.105658,-0.31892,-0.283244,1.0,0.18529,0.199507
Obesity,0.135792,0.017712,0.077081,-0.125635,-0.095488,-0.052126,-0.007201,-0.119387,0.18529,1.0,0.040384
