# Basic imports for linear regression
##### import numpy as np
##### import pandas as pd
##### from sklearn import datasets
##### from sklearn.preprocessing import MinMaxScaler  # for normalization
##### from sklearn.preprocessing import StandardScaler  # for standardization
##### from sklearn.model_selection import train_test_split
##### from sklearn.linear_model import LinearRegression
##### from sklearn.metrics import mean_squared_error
##### from sklearn.metrics import r2_score

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# import diabetes dataset

In [2]:
# Load scikit-learn's bundled diabetes dataset, then pull out the feature
# matrix (x) and the target vector (y).
pdiabetes = datasets.load_diabetes()
x, y = pdiabetes.data, pdiabetes.target

# convert to dataframe

In [3]:
# Wrap the feature matrix in a DataFrame labelled with the dataset's
# feature names.
df_x = pd.DataFrame(x, columns=pdiabetes.feature_names)

In [4]:
# Wrap the target vector in a single-column DataFrame named "label".
df_y = pd.DataFrame(y, columns=["label"])

#display data

In [5]:
df_x.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [6]:
df_y.head()

Unnamed: 0,label
0,151.0
1,75.0
2,141.0
3,206.0
4,135.0


#preprocessing
##### Preprocessing rescales the feature values so that they lie in a common range.
### Two types: 1. Normalization (min-max scaling) 2. Standardization

## Normalization: $X_{scaled} = \dfrac{X - X_{min}}{X_{max} - X_{min}}$
## Standardization: $X_{std} = \dfrac{X - \mu}{\sigma}$

In [7]:
# Min-max scale every feature column and keep the column names.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split below, so the test rows influence the scaling (data
# leakage). Fitting on the training rows only would avoid this.
Scaler = MinMaxScaler()
x_scaler = pd.DataFrame(
    Scaler.fit_transform(x),
    columns=pdiabetes.feature_names,
)

#display

In [8]:
x_scaler.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.666667,1.0,0.582645,0.549296,0.294118,0.256972,0.207792,0.282087,0.562217,0.439394
1,0.483333,0.0,0.14876,0.352113,0.421569,0.306773,0.623377,0.141044,0.222437,0.166667
2,0.883333,1.0,0.516529,0.43662,0.289216,0.258964,0.246753,0.282087,0.496578,0.409091
3,0.083333,0.0,0.301653,0.309859,0.495098,0.447211,0.233766,0.423131,0.572923,0.469697
4,0.516667,0.0,0.206612,0.549296,0.465686,0.417331,0.38961,0.282087,0.362385,0.333333


# training/testing data split
### test_size is the fraction of the data held out for testing (0.3 means 30% test and 70% train)
### random_state=42: 42 is the seed for the random number generator that shuffles the rows before they are split. A fixed seed makes the split reproducible; changing it selects different rows for the training and testing sets, so the fitted model and its accuracy will change.

In [9]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_scaler,
    df_y,
    test_size=0.3,
    random_state=42,
)

#display

In [10]:
x_train.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
225,0.633333,1.0,0.471074,0.661972,0.308824,0.229084,0.116883,0.423131,0.728211,0.515152
412,0.833333,0.0,0.673554,0.71831,0.504902,0.409363,0.415584,0.282087,0.50946,0.818182
118,0.233333,1.0,0.305785,0.56338,0.534314,0.49502,0.220779,0.423131,0.56492,0.712121
114,0.6,0.0,0.768595,0.71831,0.5,0.262948,0.272727,0.370945,0.867703,0.590909
364,0.5,1.0,0.322314,0.380282,0.416667,0.383466,0.220779,0.423131,0.54263,0.863636


In [11]:
y_train.head()

Unnamed: 0,label
225,208.0
412,261.0
118,179.0
114,258.0
364,262.0


In [12]:
x_test.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
287,0.7,0.0,0.322314,0.394366,0.897059,0.765936,0.428571,0.423131,0.610446,0.484848
211,0.916667,0.0,0.487603,0.549296,0.362745,0.314741,0.363636,0.141044,0.398856,0.424242
72,0.783333,1.0,0.330579,0.408451,0.818627,0.522908,0.558442,0.282087,0.810909,0.439394
321,0.933333,0.0,0.545455,0.784085,0.647059,0.484064,0.090909,0.832158,0.865422,0.727273
73,0.55,1.0,0.268595,0.450704,0.588235,0.536853,0.337662,0.423131,0.465759,0.469697


In [13]:
y_test.head()

Unnamed: 0,label
287,219.0
211,70.0
72,202.0
321,230.0
73,111.0


#performing the linear regression

In [14]:
# NOTE: despite its name, `classifier` holds a regressor (LinearRegression).
classifier = LinearRegression() # unfitted estimator with default parameters
# Fit on the training split; `model` is whatever fit() returns.
model=classifier.fit(x_train, y_train)

#viewing the model

In [15]:
model.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}

#print coefficient

In [16]:
print(model.coef_)

[[   6.37597963  -24.94631999  142.49164186   94.94103001 -253.17501262
   159.32689804   34.34557573   75.35836358  171.20845644   11.31152349]]


#prediction

In [17]:
# Predict the target for every held-out test row with the fitted model.
y_predict = model.predict(x_test)

#display predict

In [18]:
pd.DataFrame(y_predict).head()

Unnamed: 0,0
0,138.4697
1,181.100523
2,125.344009
3,292.759773
4,123.883053


#determining mse

In [19]:
# Mean squared error between the true test labels and the predictions.
mse = mean_squared_error(y_test, y_predict)

#display mse

In [20]:
print(mse)

2821.750981001311


#determining rmse

In [21]:
# RMSE is the square root of the MSE. It is computed with np.sqrt instead of
# mean_squared_error(..., squared=False) because the `squared` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
rmse = float(np.sqrt(mean_squared_error(y_test, y_predict)))

#display

In [22]:
rmse

53.120156070942706

#determining R2 score

In [23]:
# Coefficient of determination (R^2) of the predictions on the test set.
r2 = r2_score(y_test, y_predict)

#display r2

In [24]:
print(r2)

0.4772897164322617
