#  Correlation with Linear Regression for the Diabetes Treatment Dataset

In [None]:
import seaborn as sns
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats
import warnings
warnings.simplefilter("ignore")

In [None]:
# Load the diabetes treatment data from the local CSV file with pandas
# and preview the first rows.
df = pd.read_csv(filepath_or_buffer="diabetes_treatment.csv")
df.head(n=5)

In [None]:
# Show the column names, non-null counts and dtypes of the frame.
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would only append a stray "None" line.
df.info()

In [None]:
# Display pandas' default summary statistics for the frame's columns
df.describe()

## Plotting HbA1C Followup vs. HbA1C Baseline

In [None]:
# Scatter of the follow-up column against the baseline column.
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally. The trailing semicolon suppresses the Axes repr.
sns.scatterplot(x=df['HbA1c.Baseline'], y=df['HbA1c.Followup']);

## Looking at the Correlations Between HbA1C Followup and HbA1C Baseline

In [None]:
# List every column name in the frame
list(df.columns)

In [None]:
# Names of the two columns compared in this section
features = ['HbA1c.Baseline','HbA1c.Followup']

In [None]:
# Restrict the frame to the columns listed in `features` and preview it
HbA1C_df = df.loc[:, features]
HbA1C_df.head(n=5)

In [None]:
# Pairwise relationship grid for the selected columns
sns.pairplot(data=HbA1C_df)

In [None]:
# Annotated heatmap of the pairwise correlation matrix of the selected columns
plt.figure(figsize=(6, 5))
sns.heatmap(HbA1C_df.corr(), annot=True)
# Render the figure. plt.plot() with no data draws nothing and only leaves
# an empty list as the cell output.
plt.show()

In [None]:
# Pearson correlation matrix of the selected columns, shown as a table
# and then as a heatmap on a diverging colour map.
corr_df = HbA1C_df.corr(method="pearson")
display(corr_df)

diverging_cmap = sns.diverging_palette(220, 10, as_cmap=True)
axis_labels = corr_df.columns.values
sns.heatmap(
    corr_df,
    cmap=diverging_cmap,
    xticklabels=axis_labels,
    yticklabels=axis_labels,
)

In [None]:
# Pull the two columns out as Series: X is the baseline column,
# Y the follow-up column.
X, Y = df['HbA1c.Baseline'], df['HbA1c.Followup']

In [None]:
# Import the submodule explicitly instead of relying on `scipy.stats`
# having already been loaded by an earlier cell.
import scipy.stats

# r is the Pearson correlation coefficient, p the associated p-value
r, p = scipy.stats.pearsonr(X, Y)
print(r, p)

In [None]:
# Simple linear regression of Y on X; report the slope, the correlation
# coefficient, the standard error and the p-value.
result = scipy.stats.linregress(X, Y)
for caption, stat_value in (
    ('reg-coeff = ', result.slope),
    ('corr-rvalue = ', result.rvalue),
    ('stderror = ', result.stderr),
    ('pvalue = ', result.pvalue),
):
    print(caption, stat_value)

## Looking at the Correlations using simple model of Linear Regression

In [None]:
# One predictor column and one response column; both are selected with
# lists, so X and Y are DataFrames rather than Series.
features = ['HbA1c.Baseline']
label = ['HbA1c.Followup']
X, Y = df[features], df[label]

In [None]:
import statsmodels.api as sm
from scipy import stats

https://www.statsmodels.org/stable/regression.html

https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.OLS.html

In [None]:
# Fit an ordinary least squares model of Y on X and print its summary.
# add_constant appends the constant column used for the intercept term.
X_new = sm.add_constant(X)
regcorr = sm.OLS(Y, X_new)
# Keep the fitted results under their own name: binding them to `stats`
# would shadow the `stats` module imported from scipy.
ols_results = regcorr.fit()
print(ols_results.summary())

## Correlation with an extra feature using simple Linear Regression model

In [None]:
# List every column name in the frame
list(df.columns)

In [None]:
# Add an integer-typed copy of the "Female" column under the name "Gender",
# then preview the frame.
# NOTE(review): presumably "Female" is a boolean / 0-1 flag, so Gender == 1
# would mean female — confirm against the data.
df["Gender"] = df["Female"].astype(int)
df.head()

In [None]:
# Two predictor columns and one response column; both are selected with
# lists, so X and Y are DataFrames.
features = ['HbA1c.Baseline', 'Gender']
label = ['HbA1c.Followup']
X, Y = df[features], df[label]

In [None]:
# Fit an ordinary least squares model of Y on the selected predictors and
# print its summary.
# add_constant appends the constant column used for the intercept term.
X_new = sm.add_constant(X)
regcorr = sm.OLS(Y, X_new)
# Keep the fitted results under their own name: binding them to `stats`
# would shadow the `stats` module imported from scipy.
ols_results = regcorr.fit()
print(ols_results.summary())

## Correlations with Multivariate Features using a simple Linear Regression model

In [None]:
# Preview the first five rows of the frame
df.head(n=5)

In [None]:
# Five predictor columns and one response column; both are selected with
# lists, so X and Y are DataFrames.
features = ['HbA1c.Baseline', 'Gender', 'Age', 'BMI', 'SBP']
label = ['HbA1c.Followup']
X, Y = df[features], df[label]

In [None]:
# Fit an ordinary least squares model of Y on the selected predictors and
# print its summary.
# add_constant appends the constant column used for the intercept term.
X_new = sm.add_constant(X)
regcorr = sm.OLS(Y, X_new)
# Keep the fitted results under their own name: binding them to `stats`
# would shadow the `stats` module imported from scipy.
ols_results = regcorr.fit()
print(ols_results.summary())