In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Fetch the diabetes dataset bundled with scikit-learn.
diabetes = load_diabetes()

# Assemble the feature matrix and the regression target into one DataFrame,
# with the target stored under the column name "target".
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names).assign(
    target=diabetes.target
)

In [3]:
# Preview the assembled frame: the feature columns followed by `target`.
df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930,220.0


In [4]:
# (rows, columns) of the assembled frame, target column included.
df.shape

(442, 11)

In [5]:
# Per-column dtypes, to confirm everything is numeric before modelling.
df.dtypes

age       float64
sex       float64
bmi       float64
bp        float64
s1        float64
s2        float64
s3        float64
s4        float64
s5        float64
s6        float64
target    float64
dtype: object

In [6]:
# Count missing values in each column (isna is the same check as isnull).
df.isna().sum()

age       0
sex       0
bmi       0
bp        0
s1        0
s2        0
s3        0
s4        0
s5        0
s6        0
target    0
dtype: int64

In [8]:
# Feature matrix: every column of df except the regression target.
x = df.drop(columns="target")

In [9]:
# Display the feature matrix (df without the `target` column).
x

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [11]:
# Regression target as a Series; bracket indexing avoids any clash with
# DataFrame attribute names.
y = df["target"]

In [12]:
# Display the target Series taken from df's `target` column.
y

0      151.0
1       75.0
2      141.0
3      206.0
4      135.0
       ...  
437    178.0
438    104.0
439    132.0
440    220.0
441     57.0
Name: target, Length: 442, dtype: float64

In [13]:
from sklearn.model_selection import train_test_split

In [36]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=40
)
# Backward-compatible alias: the held-out target was originally bound to the
# misspelled name `y_text`, and later cells still reference that name.
y_text = y_test

In [37]:
# Display the training features produced by the split.
X_train

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
23,0.045341,0.050680,0.060618,0.031065,0.028702,-0.047347,-0.054446,0.071210,0.133597,0.135612
54,-0.049105,-0.044642,0.025051,0.008101,0.020446,0.017788,0.052322,-0.039493,-0.041176,0.007207
396,-0.085430,0.050680,-0.030996,-0.022885,-0.063487,-0.054236,0.019187,-0.039493,-0.096435,-0.034215
58,0.041708,-0.044642,-0.064408,0.035644,0.012191,-0.057994,0.181179,-0.076395,-0.000612,-0.050783
169,-0.001882,-0.044642,-0.026684,0.049415,0.058973,-0.016032,-0.047082,0.071210,0.133597,0.019633
...,...,...,...,...,...,...,...,...,...,...
306,0.009016,0.050680,-0.001895,0.021872,-0.038720,-0.024800,-0.006584,-0.039493,-0.039809,-0.013504
165,-0.041840,-0.044642,-0.066563,-0.046985,-0.037344,-0.043276,0.048640,-0.039493,-0.056153,-0.013504
7,0.063504,0.050680,-0.001895,0.066629,0.090620,0.108914,0.022869,0.017703,-0.035816,0.003064
219,-0.089063,-0.044642,-0.041774,-0.019442,-0.066239,-0.074277,0.008142,-0.039493,0.001148,-0.030072


In [38]:
# Display the held-out features produced by the split.
X_test

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
192,0.056239,0.050680,-0.030996,0.008101,0.019070,0.021233,0.033914,-0.039493,-0.029526,-0.059067
337,0.019913,0.050680,-0.012673,0.070072,-0.011201,0.007141,-0.039719,0.034309,0.005386,0.003064
56,-0.041840,-0.044642,0.041218,-0.026328,-0.031840,-0.030437,-0.036038,0.002943,0.033654,-0.017646
9,-0.070900,-0.044642,0.039062,-0.033213,-0.012577,-0.034508,-0.024993,-0.002592,0.067737,-0.013504
232,0.012648,0.050680,0.000261,-0.011420,0.039710,0.057245,-0.039719,0.056081,0.024055,0.032059
...,...,...,...,...,...,...,...,...,...,...
137,0.005383,-0.044642,0.049840,0.097615,-0.015328,-0.016345,-0.006584,-0.002592,0.017036,-0.013504
172,0.041708,0.050680,0.071397,0.008101,0.038334,0.015909,-0.017629,0.034309,0.073407,0.085907
163,0.016281,0.050680,0.072474,0.076958,-0.008449,0.005575,-0.006584,-0.002592,-0.023647,0.061054
384,0.034443,0.050680,-0.029918,0.004658,0.093372,0.086994,0.033914,-0.002592,0.024055,-0.038357


In [39]:
# Display the training targets; the index labels are the original df row labels.
y_train

23     245.0
54     182.0
396     43.0
58     170.0
169    152.0
       ...  
306     44.0
165     59.0
7       63.0
219    185.0
326    131.0
Name: target, Length: 331, dtype: float64

In [40]:
# Display the held-out targets (bound to the name `y_text` by the split cell).
y_text

192     91.0
337     91.0
56      52.0
9      310.0
232    259.0
       ...  
137    280.0
172    295.0
163    131.0
384     69.0
240    275.0
Name: target, Length: 111, dtype: float64

In [19]:
from sklearn.preprocessing import StandardScaler

In [41]:
# Standardizer with default settings; it is fitted on the training features in a later cell.
scaler = StandardScaler()

In [43]:
# Learn the scaling statistics from the training features only, then apply
# them to those same features.
scaler.fit(X_train)
x_train_scaler = scaler.transform(X_train)

In [44]:
# Display the scaled training features (a NumPy array rather than a DataFrame).
x_train_scaler

array([[ 0.9057857 ,  1.06559086,  1.35108273, ...,  1.52678703,
         2.73541049,  2.82924795],
       [-1.08481485, -0.93844649,  0.58555328, ..., -0.80388095,
        -0.83523399,  0.19151279],
       [-1.85043045,  1.06559086, -0.62073554, ..., -0.80388095,
        -1.96417818, -0.65936953],
       ...,
       [ 1.2885935 ,  1.06559086,  0.00560673, ...,  0.40029751,
        -0.72572902,  0.10642456],
       [-1.92699201, -0.93844649, -0.85271416, ..., -0.80388095,
         0.02944562, -0.57428129],
       [ 0.9057857 ,  1.06559086, -0.13358044, ...,  0.7498977 ,
         0.6175693 ,  0.61695394]])

In [46]:
# Apply the already-fitted scaler to the held-out features. Only transform()
# is called here, so the test rows do not influence the scaling statistics.
x_test_scaler = scaler.transform(X=X_test)

In [47]:
from sklearn.linear_model import LinearRegression

In [49]:
# Ordinary least-squares regressor with default settings.
lr = LinearRegression()

In [50]:
# Fit the regressor on the scaled training features and their targets.
lr.fit(X=x_train_scaler, y=y_train)

In [51]:
# Fitted coefficients, one per input column; they apply to the scaled
# features because the model was fitted on x_train_scaler.
lr.coef_

array([  0.38122888, -11.03600039,  25.2298113 ,  14.99980192,
       -43.43648391,  24.42996039,   5.73891846,   7.84295545,
        39.95621892,   1.42541662])

In [None]:
# Model form (note only, not executable code): y = m*x + c.
# For the fitted regressor this is
#   prediction = lr.intercept_ + sum(lr.coef_[i] * x_scaled[i])
# Kept as a comment so that Run All neither raises NameError on `mx`
# nor rebinds the target variable `y`.

In [52]:
# Fitted intercept term of the linear model.
lr.intercept_

148.54682779456192

In [59]:
# Predict targets for the scaled held-out features.
y_pred = lr.predict(X=x_test_scaler)

In [60]:
# Display the predictions for the held-out rows.
y_pred

array([ 90.04737635, 173.10584051, 210.88512912, 216.13741144,
       153.97658435, 222.93707265, 150.0060872 ,  86.55650062,
       175.60147091, 141.09141771, 168.27397839, 119.11298201,
        54.37002238, 166.67731183,  71.6621661 , 185.07672221,
       153.55302163, 144.04023749,  96.79754726, 145.56346059,
        57.35368382, 257.80934907, 138.41530507, 138.13259871,
        69.1380572 , 189.90509515, 141.51355721, 113.62013065,
       177.52564455,  70.90237965, 163.74822038, 147.43114904,
       125.35582175, 161.39541123,  93.2406498 ,  57.87627589,
       116.01696335,  69.91083967, 204.21749814, 147.32542574,
       154.50250249,  72.45537663, 166.96944557, 149.9599814 ,
       194.57856663, 147.89136257,  94.40601832, 172.07136146,
        93.65827943, 257.57007183, 217.05571316, 168.04540888,
       231.39208913, 211.99090135, 130.77731655, 194.79465132,
       119.25858005, 132.8886766 , 118.77829158, 165.52488986,
       174.53639912,  52.25737489, 189.67254626, 168.98

In [63]:
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

In [64]:
# Mean squared error on the held-out set (true targets first, predictions second).
mean_squared_error(y_true=y_text, y_pred=y_pred)

4005.2432354503258

In [65]:
# Coefficient of determination (R^2) on the held-out set.
r2_score(y_true=y_text, y_pred=y_pred)

0.3944095025451173

In [66]:
# R^2 on the training data, for comparison with the held-out R^2 to gauge
# how well the fit generalizes.
lr.score(X=x_train_scaler, y=y_train)

0.5580273038679142

In [69]:
# Pairwise Pearson correlation between all columns, including the target.
df.corr(method="pearson")

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
age,1.0,0.173737,0.185085,0.335428,0.260061,0.219243,-0.075181,0.203841,0.270774,0.301731,0.187889
sex,0.173737,1.0,0.088161,0.24101,0.035277,0.142637,-0.37909,0.332115,0.149916,0.208133,0.043062
bmi,0.185085,0.088161,1.0,0.395411,0.249777,0.26117,-0.366811,0.413807,0.446157,0.38868,0.58645
bp,0.335428,0.24101,0.395411,1.0,0.242464,0.185548,-0.178762,0.25765,0.39348,0.39043,0.441482
s1,0.260061,0.035277,0.249777,0.242464,1.0,0.896663,0.051519,0.542207,0.515503,0.325717,0.212022
s2,0.219243,0.142637,0.26117,0.185548,0.896663,1.0,-0.196455,0.659817,0.318357,0.2906,0.174054
s3,-0.075181,-0.37909,-0.366811,-0.178762,0.051519,-0.196455,1.0,-0.738493,-0.398577,-0.273697,-0.394789
s4,0.203841,0.332115,0.413807,0.25765,0.542207,0.659817,-0.738493,1.0,0.617859,0.417212,0.430453
s5,0.270774,0.149916,0.446157,0.39348,0.515503,0.318357,-0.398577,0.617859,1.0,0.464669,0.565883
s6,0.301731,0.208133,0.38868,0.39043,0.325717,0.2906,-0.273697,0.417212,0.464669,1.0,0.382483
