In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
plt.style.use("ggplot")

# Linear Regression

We will perform our analysis on the diabetes dataset. Ten baseline variables (age, sex, body mass index, average blood pressure, and six blood serum measurements) were obtained for each of n = 442 diabetes patients, along with the response of interest, a quantitative measure of disease progression one year after baseline.

**PS: all of the variables are normalised.**

In [None]:
# Load the required packages
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error

# Read the diabetes data from the local CSV (the notebook text states that the
# variables are all normalised) and discard the "Unnamed: 0" column in the
# same step -- presumably a row index written out when the CSV was saved;
# verify against the file.
diabetes = pd.read_csv("diabetes.csv", sep=",").drop("Unnamed: 0", axis=1)

# Exploratory Data Analysis

"EDA is an attitude, a flexibility, and a reliance on display, NOT a bundle of techniques." 

A good EDA gives us more insight for feature selection and feature engineering,
and thus leads to a more accurate model.

In [None]:
# Examine the covariates


In [None]:
# Peek at the first five rows (five is head()'s default)
diabetes.head()

In [None]:
# Short summary: descriptive statistics for the columns of the frame
diabetes.describe()

In [None]:
# Understand how age is distributed by plotting a histogram


In [None]:
# Understand how BMI is distributed, and also find its mean


In [None]:
# Okay, let's see whether men have a higher or lower BMI than women


In [None]:
# Do that for blood pressure


In [None]:
# Okay, now we ask: is this difference significant?
# Conduct a hypothesis test to find out!


Since the p-value is very small, there is strong evidence that the two samples do not share the same mean.

# Modeling 

From the above EDA, we can see that gender is an important factor to include in the model. We should also include interaction terms between gender and other variables.

In [None]:
# Add gender interaction terms: one "SEXx<covariate>" product column each
# for BMI and BP, written into `diabetes` in place.
for covariate in ("BMI", "BP"):
    diabetes["SEXx" + covariate] = diabetes["SEX"] * diabetes[covariate]

In [None]:
# Check that the new interaction columns show up (first five rows)
diabetes.head()

In [None]:
# Train/test split
from sklearn.model_selection import train_test_split

TEST_FRACTION = 0.33  # share of the rows held out for testing
SPLIT_SEED = 42       # fixed seed passed as random_state

# Separate the response from the covariates. pop() returns the "Y" column and
# removes it from `diabetes` in place, so `diabetes` (and X, which is the very
# same object) holds only predictors afterwards. Re-running this cell without
# reloading the data raises KeyError because "Y" is already gone.
y = diabetes.pop("Y")
X = diabetes

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

In [None]:
#Now we can fit our model with extra interaction variables
#Import packages
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

#Create the Model object
# TODO(exercise): bind the estimator to the name `regr` -- the coefficient
# cell further down reads `regr.coef_`.

#fit the model using the training sets
# TODO(exercise): fit on X_train / y_train from the split above.

#Make predictions using the test set
# TODO(exercise): predict on X_test (the age-only model later stores its
# predictions as `y_pred_2`; presumably this one would be `y_pred`).


# Model evaluation

Because of the simple structure of linear model, there are many ways we can interpret the model. 

In [None]:
# Look at the fitted coefficients, shown as a single row with one column per
# feature. The labels come from X_train (the training frame the model is meant
# to be fit on) rather than from the global `diabetes` frame, so they stay
# aligned with `regr.coef_` even if `diabetes` is modified after the split.
pd.DataFrame(regr.coef_, columns=["Coefficient"], index=X_train.columns).T

In [None]:
#Print the mean squared error


# Comparison with a simpler model

We now predict the outcome only using age as an input

In [None]:
# Baseline for comparison: a model that uses AGE as its only input
# Import packages
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# Create the model object
regr_2 = linear_model.LinearRegression()

# Fit the model using the training set.
# scikit-learn expects a 2-D feature matrix. Selecting with a list of column
# names (X_train[["AGE"]]) yields a one-column DataFrame; the earlier
# Series.reshape(-1, 1) call no longer exists in current pandas and raises
# AttributeError.
regr_2.fit(X_train[["AGE"]], y_train)

# Make predictions using the test set
y_pred_2 = regr_2.predict(X_test[["AGE"]])

In [None]:
# Print the root mean squared error (RMSE) of the age-only predictions
# against the test responses
rmse_2 = np.sqrt(mean_squared_error(y_test, y_pred_2))
print("RMSE: %.2f" % rmse_2)

In [None]:
# Plot the age-only fit against the held-out observations.
# Uses the explicit fig/ax interface and labels the figure so it can be read
# on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.plot(X_test["AGE"], y_pred_2, lw=3, color="black", label="Age-only fit")
ax.scatter(X_test["AGE"], y_test, label="Test observations")
ax.set_xlabel("AGE")
ax.set_ylabel("Y")
ax.set_title("Age-only linear regression on the test set")
ax.legend()
plt.show()