In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

def generate_new_data(df):
    # Load and preprocess data
    df = pd.read_csv('source_data/diabetes_012_health_indicators_BRFSS2015_py_cp.csv')
    df = df.dropna()
    X_train, X_test, y_train, y_test = train_test_split(df[df.columns], df['BMI'], test_size=100)

    # Train a linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Evaluate the model
    score = model.score(X_test, y_test)
    print('Model score:', score)

    # Synthesize data using the fitted model
    y_pred = model.predict(X_test)
    df_synth = pd.DataFrame({'Diabetes_012': X_test['Diabetes_012'],'BMI_pred': y_pred})
    # Compare the synthesized data to the original data
    print('Original data correlation:', df['Diabetes_012'].corr(df['BMI']))
    print('Synthesized data correlation:', df_synth['Diabetes_012'].corr(df_synth['BMI_pred']))
    return df_synth



In [2]:
import statistics
import pandas as pd
import os

df = pd.read_csv("source_data/diabetes_012_health_indicators_BRFSS2015_py_cp.csv")
gen_data = generate_new_data(df)
if "gen_data.csv" not in os.listdir():
    gen_data.to_csv("gen_data.csv")
    gen_data = pd.read_csv("gen_data.csv")
else:
    gen_data = pd.read_csv("gen_data.csv")
df = pd.concat([df,],axis=0,ignore_index=True)

"""
Original data correlation: 0.22437947375839748
Synthesized data correlation: 0.250064562325651
"""

Model score: 1.0
Original data correlation: 0.22437947375839748
Synthesized data correlation: 0.443882578058235


'\nOriginal data correlation: 0.22437947375839748\nSynthesized data correlation: 0.250064562325651\n'

In [None]:
overall_proportion = pd.DataFrame(df[['Diabetes_012',"BMI"]].value_counts(normalize=True))
overall_proportion.sort_values(by="BMI").to_csv("overall_proportion.csv")
print(f"Overall Proportion Difference between smallest and largest: {0.0844489120151371-3.94197414064963E-06}")


Overall Proportion Difference between smallest and largest: 0.08444497004099645


In [None]:
from sklearn.model_selection import train_test_split
# Desired sample size
sample_size = 500
sample_size_per_percentile = int(500 / 5)
if "sample_data.csv" not in os.listdir():

    #df['BMI_percentile'] = pd.cut(df['BMI'], bins=3, labels=False)
    # Randomly sample from each percentile
    #sample = df.groupby('BMI_percentile').apply(lambda x: x.sample(n=sample_size_per_percentile))
    sample_columns = ["Diabetes_012","HighBP","HighChol","BMI","Smoker","Stroke","HeartDiseaseorAttack","PhysActivity","Age"]
    sample, _ = train_test_split(df, train_size=sample_size, stratify=df['Age'], random_state=42)
    
    sample[sample_columns].to_csv("sample_data.csv")
    sample500 = pd.read_csv("sample_data.csv")
else:
    sample500 = pd.read_csv("sample_data.csv")
print(sample500[["BMI","Diabetes_012","Age"]]a.value_counts(normalize=True).sort_values().to_csv("Age_percentile_proportion.csv"))
no_diabetes = sample500[sample500["Diabetes_012"] == 0]
diabetes = sample500[sample500["Diabetes_012"] == 1]
diabetes_bmi = diabetes["BMI"]
no_diabetes_bmi = no_diabetes["BMI"]
diabetes_variance = diabetes["BMI"].var()
no_diabetes_variance = no_diabetes["BMI"].var()
print(diabetes_variance,no_diabetes_variance)

In [None]:
f_stat = diabetes_variance / no_diabetes_variance
df1 = len(diabetes) - 1 
df2 = len(no_diabetes) - 1 
print(f_stat,df1,df2)

In [None]:
from scipy.stats import f
p_value_one_tail = 1 - f.cdf(f_stat, df1, df2)

# For a two-tailed test
p_value_two_tail = 2 * min(p_value_one_tail, 1 - p_value_one_tail)

print(p_value_two_tail)
print("""
This extremely small p-value suggests that the observed F-statistic is highly unlikely under the null hypothesis that the variances are equal.
Conclusion: Reject the null hypothesis (H0) and conclude that the variances are significantly different at any conventional significance level (e.g., 
a=0.05.
""")

In [None]:
from scipy.stats import ttest_ind
"""
So we do Welch's T-test because the variances are different. Indicated by the F-test on the variance.
"""
t_stat, p_value = ttest_ind(diabetes_bmi, no_diabetes_bmi, equal_var=False)
print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")

# Interpretation
alpha = 0.025
if p_value < alpha:
    print("Reject the null hypothesis: The means are significantly different.")
else:
    print("Fail to reject the null hypothesis: No significant difference in means.")

In [None]:
"""
Meaning of Rejecting 
H0: Rejecting means that there is sufficient statistical evidence to conclude that the number of people with diabetes varies with BMI. 
In other words:

BMI is significantly associated with the prevalence of diabetes.

This could imply that as BMI changes (e.g., increases or decreases), the number of people with diabetes also changes.
Interpretation in Practical Terms:
Causal Relationship:

Rejecting H0
Does not prove causation but suggests a strong association between BMI and diabetes.
Further studies might be needed to explore causality.
Public Health Implications:

If the relationship is strong and significant, this finding could motivate interventions targeting BMI to control diabetes prevalence.
"""

In [None]:
"""Next Steps After Rejecting H0

Quantify the Relationship:
Use regression analysis to model the relationship between BMI and diabetes prevalence.
Study Other Factors:
Consider whether other factors (e.g., age, genetics, lifestyle) also influence the observed relationship.
Policy Recommendations:
Develop programs to manage BMI as a potential strategy to reduce diabetes prevalence.
Would you like help interpreting specific data or running an analysis?
"""

In [None]:
import pandas as pd

df = pd.read_csv("diabetes_012_health_indicators_BRFSS2015_py_cp.csv")
df_over_9 = df.query('Age > 9')
df_under_9 = df.query('Age < 9')
mean_over_9 = df_over_9["BMI"].mean()
mean_under_9 = df_under_9["BMI"].mean()
mean_difference = mean_under_9 - mean_over_9
print(mean_difference,mean_under_9,mean_over_9)