In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stat
import math

In [333]:
FHS_data = pd.read_excel('Framingham_data.xlsx')
FHS_data.head()
len(FHS_data.index)

4699

In [344]:
def proportion_confidence_interval(col: str, CHD: int, gender: int, confidence=0.95):
    data = pd.DataFrame(columns=[col, 'chdfate'])
    data[[col, 'chdfate']] = FHS_data[[col, 'chdfate']].where(FHS_data['chdfate'] == CHD).dropna()
    filtered_data = data.where(data['sex'] == gender)[col].dropna()
    
    n = len(FHS_data['sex'])
    print(f'Total recorded datapoints for {col} (n): {n}')
    proportion = len(filtered_data.index) / n
    q = 1 - proportion
    print(f'p = {proportion}, q = {q}')
    # Standard error calculated via sigma / sqrt(n)
    std_dev = np.sqrt((proportion * q) / n)
#     print(data.sem())
    print(f'Standard deviation of proportion: {std_dev}')
    z_score = stat.norm.ppf(((1 + confidence) / 2))
#     print(stat.norm.ppf(.975))
    print(f'z-score: {z_score}')
    lower = proportion - (std_dev * z_score)
    upper = proportion + (std_dev * z_score)
    return lower, proportion, upper

In [345]:
def mean_confidence_interval(col: str, CHD: int, confidence=0.95):
    data = pd.DataFrame(columns=[col, 'chdfate'])
    data[[col, 'chdfate']] = FHS_data[[col, 'chdfate']]
    filtered_data = data.where(data['chdfate'] == CHD)[col].dropna()
    
    n = len(data.index)
    print(f'Total recorded datapoints for {col} (n): {n}')
    mean = filtered_data.mean()
    print(f'Mean: {mean}')
    # Standard error calculated via sigma / sqrt(n)
    std_error = filtered_data.std() / np.sqrt(n)
#     print(data.sem())
    print(f'Standard error of the mean: {std_error}')
    t_score = stat.t.ppf(((1 + confidence) / 2), df=(n-1))
#     print(stat.norm.ppf(.975))
    print(f't-score: {t_score}')
    lower = mean - (std_error * t_score)
    upper = mean + (std_error * t_score)
    return lower, mean, upper

In [346]:
def mean_hypothesis_test():

SyntaxError: unexpected EOF while parsing (<ipython-input-346-c456c83887c3>, line 1)

In [347]:
def proportion_hypothesis_test():

SyntaxError: unexpected EOF while parsing (<ipython-input-347-035b2877abd7>, line 1)

## Problem 1

### Part A

In [349]:
interval_women_has_CHD = proportion_confidence_interval(col='sex', CHD=1, gender=2)
print(f'\nWe are 95% confident that the population proportion (proportion of US population) of women diagosed with CHD')
print(f'falls within the interval of {interval_women_has_CHD[0]:.4f} to {interval_women_has_CHD[-1]:.4f}.')

Total recorded datapoints for sex (n): 4699
p = 0.1383273036816344, q = 0.8616726963183656
Standard deviation of proportion: 0.005036425326323434
z-score: 1.959963984540054

We are 95% confident that the population proportion (proportion of US population) of women diagosed with CHD
falls within the interval of 0.1285 to 0.1482.


In [350]:
interval_men_has_CHD = proportion_confidence_interval(col='sex', CHD=1, gender=1)
print(f'\nWe are 95% confident that the population proportion (proportion of US potppulation) of men diagosed with CHD')
print(f'falls within the interval of {interval_men_has_CHD[0]:.4f} to {interval_men_has_CHD[-1]:.4f}.')

Total recorded datapoints for sex (n): 4699
p = 0.17514364758459247, q = 0.8248563524154076
Standard deviation of proportion: 0.005544771253336928
z-score: 1.959963984540054

We are 95% confident that the population proportion (proportion of US population) of men diagosed with CHD
falls within the interval of 0.1643 to 0.1860.


### Part B

## Problem 2

In [175]:
# Isolate the two variables we want (DBP and CHD)
DBP_CHD_data = pd.DataFrame(FHS_data[['dbp', 'chdfate']])

### Part A

In [273]:
interval_DBP_no_CHD = mean_confidence_interval(col='dbp', CHD=0)
print(f'\nWe are 95% confident that the population mean of diastolic blood pressure (dbp) for individuals NOT diagnosed with CHD')
print(f'falls within the interval of {interval_DBP_no_CHD[0]:.2f} mmHg to {interval_DBP_no_CHD[-1]:.2f} mmHg.')

Total recorded datapoints for dbp (n): 4699
Mean: 81.03099814011159
Standard error of the mean: 0.17840415335534265
t-score: 1.9604690658796426

We are 95% confident that the population mean of diastolic blood pressure (dbp) for individuals NOT diagnosed with CHD
falls within the interval of 80.68 mmHg to 81.38 mmHg.


In [274]:
interval_DBP_CHD = mean_confidence_interval(col='dbp', CHD=1)
print(f'\nWe are 95% confident that the population mean of diastolic blood pressure (dbp) for individuals diagnosed with CHD')
print(f'falls within the interval of {interval_DBP_CHD[0]:.2f} mmHg to {interval_DBP_CHD[-1]:.2f} mmHg.')

Total recorded datapoints for dbp (n): 4699
Mean: 85.8499660556687
Standard error of the mean: 0.19252644536384675
t-score: 1.9604690658796426

We are 95% confident that the population mean of diastolic blood pressure (dbp) for individuals diagnosed with CHD
falls within the interval of 85.47 mmHg to 86.23 mmHg.


### Part B

## Problem 3

### Part A

In [277]:
interval_SCL_no_CHD = mean_confidence_interval(col='scl', CHD=0)
print(f'\nWe are 95% confident that the population mean of systolic blood pressure (scl) for individuals NOT diagnosed with CHD')
print(f'falls within the interval of {interval_SCL_no_CHD[0]:.2f} mmHg to {interval_SCL_no_CHD[-1]:.2f} mmHg.')

Total recorded datapoints for scl (n): 4699
Mean: 223.0009375
Standard error of the mean: 0.618649016831971
t-score: 1.9604690658796426

We are 95% confident that the population mean of systolic blood pressure (scl) for individuals NOT diagnosed with CHD
falls within the interval of 221.79 mmHg to 224.21 mmHg.


### Part B

In [278]:
interval_SCL_has_CHD = mean_confidence_interval(col='scl', CHD=1)
print(f'\nWe are 95% confident that the population mean of systolic blood pressure (scl) for individuals NOT diagnosed with CHD')
print(f'falls within the interval of {interval_SCL_has_CHD[0]:.2f} mmHg to {interval_SCL_has_CHD[-1]:.2f} mmHg.')

Total recorded datapoints for scl (n): 4699
Mean: 239.8431105047749
Standard error of the mean: 0.6836238714998996
t-score: 1.9604690658796426

We are 95% confident that the population mean of systolic blood pressure (scl) for individuals NOT diagnosed with CHD
falls within the interval of 238.50 mmHg to 241.18 mmHg.
