#  Linear Regression, ANOVA & Collinearity for the Diabetes Dataset

### In this video we will be relating HbA1c followup to all other features and discussing collinearity

In [None]:
import seaborn as sns
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats
import warnings
# Silence every warning for the rest of the session (presumably to keep the notebook
# output uncluttered); note that this also hides deprecation notices from the libraries used below.
warnings.simplefilter("ignore")

In [None]:
# Load the diabetes treatment dataset from the local CSV file into a pandas DataFrame

df = pd.read_csv(filepath_or_buffer="diabetes_treatment.csv")
df.head()  # preview the first rows

In [None]:
# Print the columns, their non-null counts and their dtypes.
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

## Creating categories and formatting the dataset

In [None]:
# Add underscore-named copies of the dotted HbA1c columns so that they can be
# referenced directly in the model formula strings used later on.
# The original dotted columns are left in place.

for underscored, dotted in (
    ('HbA1c_Followup', 'HbA1c.Followup'),
    ('HbA1c_Baseline', 'HbA1c.Baseline'),
):
    df[underscored] = df[dotted]
df.head()

In [None]:
# Add an integer-coded Gender column by casting the Female column to int
# (presumably 1 = female, 0 = male -- confirm against the dataset's coding)

df["Gender"] = df["Female"].astype(int)
df.head()

## Creating categorical variables from continuous data

### https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html
### https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html

In [None]:
# Example: qcut segments a column into quantile-based bins.
# Here the integers 0..11 are split into 3 bins and the members of each bin are counted.

example_bins = pd.qcut(range(12), q=3)
pd.Series(example_bins).value_counts()

In [None]:
# Split BMI into 3 quantile-based categories and count the rows that fall in each

bmi_terciles = pd.qcut(df['BMI'], q=3)
bmi_terciles.value_counts()

In [None]:
# Create BMI categories from 3 quantile-based bins; precision=0 controls the precision
# used for the bin labels. This column is overwritten by the pd.cut cells that follow.

df['BMI_Category'] = pd.qcut(df['BMI'], q=3, precision = 0)
df.head()

In [None]:
# Re-create BMI_Category with pd.cut instead: bins=3 asks for 3 equal-width bins
# spanning the observed BMI range (rather than quantile-based bins as with qcut).

df['BMI_Category'] = pd.cut(df['BMI'], bins=3)
df.head()

In [None]:
# Use explicit bin edges for consistency with the R video.
# With pd.cut's defaults these edges give the intervals (0, 30], (30, 35] and (35, 40].
# NOTE(review): BMI values outside these edges become NaN in BMI_Category --
# confirm that no BMI in the data exceeds 40.

cat_bins =[0.0, 30.0, 35.0, 40.0]
df['BMI_Category'] = pd.cut(df['BMI'], bins=cat_bins)
df.head()

## ANOVA test on categorical BMI and other features

### There are many ways to do ANOVA testing in Python - here is one way similar to R

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats

In [None]:
# Regress the follow-up HbA1c on baseline HbA1c, Age, Gender, BMI category and SBP.
# In this summary of follow-up against all other features we see that the p-values for the
# BMI categories are greater than 0.05, which implies no significance.
# They therefore do not significantly predict the HbA1c follow-up level when controlling for
# (adjusting for) the other input features.
# In this case "prediction" means that these categories do not explain the variance in the
# HbA1c follow-up level.
# Note - the listed BMI categories do not include the reference category (the 0-30 range).
#
# The fitted model is bound to `full_model` rather than `stats`, so that it does not
# shadow the scipy `stats` module imported earlier in the notebook.

full_model = ols('HbA1c_Followup ~ HbA1c_Baseline + Age + Gender + BMI_Category + SBP', data=df).fit()
print(full_model.summary())

In [None]:
# In the ANOVA table the individual BMI categories are summarized in a single BMI_Category row.
# Once again we see that the p-value for BMI_Category is much greater than 0.05 and hence
# not significant.
#
# The fitted model is bound to `anova_model` rather than `stats`, so that it does not
# shadow the scipy `stats` module imported earlier in the notebook.

anova_model = ols('HbA1c_Followup ~ HbA1c_Baseline + Age + Gender + BMI_Category + SBP', data=df).fit()
print(sm.stats.anova_lm(anova_model, typ=2))  # typ=2 requests a Type II ANOVA table

## Collinearity

### If you put 2 correlated variables into a model, they will compete to explain variation in the model output

In [None]:
# Previously we saw that when we combined all the other features with BMI, including
# HbA1c_Baseline, the BMI did not significantly predict the output or explain the variance
# in the output.
# However, let's see what happens when we use BMI by itself as a predictor of the
# HbA1c follow-up level.
# Here we see that the regression coefficient is sizable and the p-value is very significant,
# implying that BMI is of significant importance in predicting the HbA1c follow-up.
#
# The fitted model is bound to `bmi_only_model` rather than `stats`, so that it does not
# shadow the scipy `stats` module imported earlier in the notebook.

bmi_only_model = ols('HbA1c_Followup ~ BMI', data=df).fit()
print(bmi_only_model.summary())

In [None]:
# But what happens to BMI when we control (adjust) for HbA1c_Baseline?
#
# The fitted model is bound to `adjusted_model` rather than `stats`, so that it does not
# shadow the scipy `stats` module imported earlier in the notebook.

adjusted_model = ols('HbA1c_Followup ~ HbA1c_Baseline + BMI', data=df).fit()
print(adjusted_model.summary())

### Conclusion - while BMI by itself is significant in predicting HbA1c_Followup (p-value ~ 0),
### once we control for where the patient started, i.e. the baseline, it is not significantly related to HbA1c_Followup.
### Effectively the two input features HbA1c_Baseline and BMI are collinear.

In [None]:
# Type II ANOVA table for the baseline + BMI model.
# The fitted model is bound to `adjusted_model` rather than `stats`, so that it does not
# shadow the scipy `stats` module imported earlier in the notebook.
adjusted_model = ols('HbA1c_Followup ~ HbA1c_Baseline + BMI', data=df).fit()
print(sm.stats.anova_lm(adjusted_model, typ=2))

## Linear Regression using scikit-learn

In [None]:
import pandas as pd

# Reload the raw CSV for the scikit-learn section. This rebinds df to a fresh DataFrame,
# so the columns derived in the cells above are not present on it.
df = pd.read_csv("diabetes_treatment.csv")
df.head()

In [None]:
# Re-create the integer-coded Gender column on the reloaded DataFrame
df["Gender"] = df["Female"].astype(int)
df.head()

In [None]:
# List the available column names (list() replaces the identity comprehension)
list(df.columns)

In [None]:
# Predictor column(s) for the regression: baseline HbA1c only
features = ['HbA1c.Baseline']
# Alternative: uncomment to fit on several predictors instead
#features = ['HbA1c.Baseline', 'BMI', 'Age', 'SBP', 'Gender']

In [None]:
# Column to predict; kept as a one-element list so that df[target] is a DataFrame, not a Series
target = ['HbA1c.Followup']

In [None]:
# Feature matrix: the predictor column(s) listed in `features`
x = df[features]

In [None]:
# Target values: the column listed in `target`
y = df[target]

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Hold out part of the data for testing (scikit-learn's default split sizes are used).
# A fixed random_state makes the shuffle, and therefore every score and plot below,
# reproducible from one run of the notebook to the next.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [None]:
# Fit an ordinary least-squares linear regression on the training split
linrgr = LinearRegression()
linrgr.fit(x_train, y_train)

In [None]:
# Coefficient of determination (R^2) of the fitted model on the held-out test split
linrgr.score(x_test, y_test)

In [None]:
# Predict the target for the held-out test inputs
y_pred = linrgr.predict(x_test)

# Collect each metric with its label, then print them in order:
#   - the regression coefficients of the fitted model
#   - R^2, the coefficient of determination: how much of the target variance the features explain
#   - MSE, the mean squared error between the model predictions and the ground-truth data
metric_report = (
    ('Regression Coeffs:\n', linrgr.coef_),
    ('Coeff of Determination - R^2:\n', r2_score(y_test, y_pred)),
    ('MSE: \n', mean_squared_error(y_test, y_pred)),
)
for label, value in metric_report:
    print(label, value)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Scatter the observed baseline vs. follow-up HbA1c for every row of the dataset.
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.scatterplot(x=df['HbA1c.Baseline'], y=df['HbA1c.Followup']);

# Overlay the model's predictions for the test split as the fitted regression line.
# NOTE: this overlay assumes the model was fit on the single feature 'HbA1c.Baseline'.
plt.plot(x_test, y_pred, color='green', linewidth=2)