In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Lasso
import numpy as np

# 1 Importing the load_diabetes data 

In [2]:
processed_diabetes_data = load_diabetes()
print (processed_diabetes_data)

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286377, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04687948,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452837, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00421986,  0.00306441]]), 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
  

# 2 Splitting the processed data intp train and test sets

In [3]:
X = processed_diabetes_data['data']
y = processed_diabetes_data['target']
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=295)

# 3 To compute train and test R^2 with Lasso score

In [4]:
lasso = Lasso().fit(X_train,y_train)
lasso.score(X_train,y_train)

0.38620259533864876

In [5]:
lasso.score(X_test,y_test)

0.3135534507150195

Getting the features and number of features

In [6]:
features = dict(zip(processed_diabetes_data['feature_names'], lasso.coef_ != 0))

In [7]:
features

{'age': False,
 'sex': False,
 'bmi': True,
 'bp': False,
 's1': False,
 's2': False,
 's3': False,
 's4': False,
 's5': True,
 's6': False}

In [8]:
np.sum(lasso.coef_ != 0)

2

The model uses 2 features for which coef_ is not equal to zero, the features are
1. bmi
2. s5

# 4 Importing the original diabetes data set

In [9]:
X_raw = np.genfromtxt("Desktop\MSC\CS5920J-Machine learning\Assignment 2\diabetes.data",delimiter="\t",usecols=np.arange(10),skip_header=1)
y_raw = np.genfromtxt("Desktop\MSC\CS5920J-Machine learning\Assignment 2\diabetes.data",delimiter="\t",usecols = 10, dtype=int,skip_header=1)

In [10]:
X_raw

array([[59.    ,  2.    , 32.1   , ...,  4.    ,  4.8598, 87.    ],
       [48.    ,  1.    , 21.6   , ...,  3.    ,  3.8918, 69.    ],
       [72.    ,  2.    , 30.5   , ...,  4.    ,  4.6728, 85.    ],
       ...,
       [60.    ,  2.    , 24.9   , ...,  3.77  ,  4.1271, 95.    ],
       [36.    ,  1.    , 30.    , ...,  4.79  ,  5.1299, 85.    ],
       [36.    ,  1.    , 19.6   , ...,  3.    ,  4.5951, 92.    ]])

In [11]:
y_raw

array([151,  75, 141, 206, 135,  97, 138,  63, 110, 310, 101,  69, 179,
       185, 118, 171, 166, 144,  97, 168,  68,  49,  68, 245, 184, 202,
       137,  85, 131, 283, 129,  59, 341,  87,  65, 102, 265, 276, 252,
        90, 100,  55,  61,  92, 259,  53, 190, 142,  75, 142, 155, 225,
        59, 104, 182, 128,  52,  37, 170, 170,  61, 144,  52, 128,  71,
       163, 150,  97, 160, 178,  48, 270, 202, 111,  85,  42, 170, 200,
       252, 113, 143,  51,  52, 210,  65, 141,  55, 134,  42, 111,  98,
       164,  48,  96,  90, 162, 150, 279,  92,  83, 128, 102, 302, 198,
        95,  53, 134, 144, 232,  81, 104,  59, 246, 297, 258, 229, 275,
       281, 179, 200, 200, 173, 180,  84, 121, 161,  99, 109, 115, 268,
       274, 158, 107,  83, 103, 272,  85, 280, 336, 281, 118, 317, 235,
        60, 174, 259, 178, 128,  96, 126, 288,  88, 292,  71, 197, 186,
        25,  84,  96, 195,  53, 217, 172, 131, 214,  59,  70, 220, 268,
       152,  47,  74, 295, 101, 151, 127, 237, 225,  81, 151, 10

# 5 Splitting the raw diabetes data

In [12]:
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X_raw,y_raw,random_state=295)

To compute train and test R^2 for original diabetes data with Lasso score

In [13]:
lasso_raw = Lasso().fit(X_train_raw,y_train_raw)
lasso_raw.score(X_train_raw,y_train_raw)

0.5365825063419298

In [14]:
lasso_raw.score(X_test_raw,y_test_raw)

0.41738603558907017

# 6 Getting the features and number of features for raw diabetes data

In [15]:
np.sum(lasso_raw.coef_ != 0)

10

In [16]:
features_raw = dict(zip(processed_diabetes_data['feature_names'], lasso_raw.coef_ != 0))

In [17]:
features_raw

{'age': True,
 'sex': True,
 'bmi': True,
 'bp': True,
 's1': True,
 's2': True,
 's3': True,
 's4': True,
 's5': True,
 's6': True}

The model on the raw diabetes dataset uses 10 features for which coef_ is not equal to zero, they are listed below:

1. age
2. sex
3. bmi
4. bp
5. s1
6. s2
7. s3
8. s4
9. s5
10. s6

# To summarize first section

The training R^2 = 0.5365825063419298 and test R^2 = 0.41738603558907017 of the raw diabetes data is higher than the training R^2 =0.38620259533864876 and test R^2 = 0.3135534507150195 of the load_diabetes data because it the raw diabetes data uses more features than the load_diabetes.

# 7. Using StandardScaler to process the raw data

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
scaler = StandardScaler()
scaler.fit(X_train_raw)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [20]:
processed_X_train = scaler.transform(X_train_raw)
processed_X_test = scaler.transform(X_test_raw)

In [21]:
processed_lasso = Lasso().fit(processed_X_train,y_train_raw)
processed_lasso.score(processed_X_train,y_train_raw)

0.5385916123719636

In [22]:
processed_lasso.score(processed_X_test,y_test_raw)

0.4206927175381302

# 8. Getting the features and number of features for Scaled diabetes data

In [23]:
np.sum(processed_lasso.coef_ != 0)

8

In [24]:
features_raw = dict(zip(processed_diabetes_data['feature_names'], processed_lasso.coef_ != 0))

In [25]:
features_raw

{'age': False,
 'sex': True,
 'bmi': True,
 'bp': True,
 's1': True,
 's2': False,
 's3': True,
 's4': True,
 's5': True,
 's6': True}

The model on the Scaled diabetes dataset uses 8 features for which coef_ is not equal to zero, they are listed below:

1. sex
2. bmi
3. bp
4. s1
5. s3
6. s4
7. s5
8. s6

# in summary

The training R^2 = 0.5385916123719636 and test R^2 = 0.4206927175381302 of the scaled diabetes data is closer to (6) the raw diabetes data because the data at (3) uses normalizer and standard caler was used in these, also comparing the number of features used for the coefficient of the scaled data which is 8, it was always going to be closer to the raw data with a value of 10.

# 9 Varying the parameter alpha in the Lasso and plotting the test R^2 vs the number of features used


In [26]:
def lasso(alphas):
    
    lasso = lasso = Lasso(alphas)
    lasso.fit(processed_X_train, y_train_raw)
    scores = lasso.score(processed_X_test,y_test_raw)
    cof= np.sum(lasso.coef_ !=0)
    return scores, cof

In [27]:
alpha1=lasso([0.1])
alpha1

(0.42192757352944504, 10)

In [28]:
alpha2=lasso([1])
alpha2

(0.4206927175381302, 8)

In [29]:
alpha3=lasso([5])
alpha3

(0.40418984269451175, 6)

In [30]:
alpha4=lasso([3])
alpha4

(0.4145201979520702, 7)

In [31]:
alpha5=lasso([10])
alpha5

(0.3877008841826438, 5)

plotting the graph

In [32]:
varying_alphas =[alpha1[0],alpha2[0],alpha3[0],alpha4[0],alpha5[0]]

In [33]:
coefficients= [alpha1[1],alpha2[1],alpha3[1],alpha4[1],alpha5[1]]

In [34]:
import matplotlib.pyplot as plt
plt.plot(coefficients,varying_alphas,"v--",label="R Squared")
plt.xlabel("Number of Features")
plt.ylabel("Test R^2")
plt.legend()

<matplotlib.legend.Legend at 0x2c5389992e8>

I prefer the point (8, 0.42) correspoding to alpha = 1 for the lasso, it seems like the point where the graph finally got some form of stability or pattern

# 10 cross-validation on the training set.

Performing 10 fold cross validation 

In [35]:
from sklearn.model_selection import GridSearchCV
chosen_alphas = {'alpha': [0.1, 1, 5, 3, 10]}
grid_search = GridSearchCV(Lasso(), chosen_alphas, cv=10)
grid_search.fit(processed_X_train, y_train_raw)
grid_search.score(processed_X_test, y_test_raw)

0.4145201979520702

In [36]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'alpha': 3}
0.47586662663995877


In [37]:
lasso_crossval = Lasso(alpha=3).fit(processed_X_train,y_train_raw)
lasso_crossval.score(processed_X_train,y_train_raw)

0.5284326612822037

In [38]:
lasso_crossval.score(processed_X_test,y_test_raw)

0.4145201979520702

In [39]:
np.sum(lasso_crossval.coef_!=0)

7

# In Summary


Using for 10 fold cross validation, the best alpha is 3
When the alpha is 3, the Training R^2 = 0.5284326612822037 and the Test R^2 = 0.4145201979520702
The number of features used in the model is 7(seven)

# 11. Inductive Conformal Predictor

In [40]:
X_train_conformal, X_calibration_set, y_train_conformal, y_calibration_set = train_test_split(X_raw,y_raw,random_state=295,test_size=99)

In [41]:
X_train_conformal

array([[ 60.    ,   2.    ,  32.1   , ...,   4.    ,   4.4773,  94.    ],
       [ 58.    ,   1.    ,  22.8   , ...,   4.    ,   4.9836, 115.    ],
       [ 55.    ,   1.    ,  32.9   , ...,   4.    ,   4.4308,  89.    ],
       ...,
       [ 63.    ,   2.    ,  25.5   , ...,   5.    ,   5.9506,  87.    ],
       [ 49.    ,   1.    ,  20.3   , ...,   3.    ,   4.6052,  93.    ],
       [ 41.    ,   1.    ,  20.2   , ...,   3.    ,   4.2485,  89.    ]])

b) standard scaling

In [42]:
scalerconformal = StandardScaler()
scalerconformal.fit(X_train_conformal)
X_train_conformal_scaled = scalerconformal.transform(X_train_conformal)
X_test_scaled = scalerconformal.transform(X_test_raw)
X_calibration_set_scaled = scalerconformal.transform(X_calibration_set)

In [43]:
X_test_scaled

array([[ 0.86888764,  1.05701653, -0.31764873, ..., -0.22535482,
        -0.99643223,  0.31358668],
       [ 0.03788306, -0.946059  , -0.07275934, ..., -0.82179194,
        -0.18701702, -0.30255212],
       [ 0.56670415, -0.946059  , -0.13954736, ..., -0.04719827,
         0.93064499,  0.31358668],
       ...,
       [ 1.4732546 ,  1.05701653, -0.3399114 , ..., -0.04719827,
        -1.49410007, -0.39057195],
       [-1.47303435,  1.05701653, -1.8315104 , ..., -0.04719827,
        -0.27161471, -1.1827504 ],
       [ 0.1889748 , -0.946059  , -0.25086072, ...,  0.72739539,
         0.5024056 , -0.12651246]])

c) nonconformity measure = |truelabel - prediction|

In [44]:
lassoconformal = Lasso().fit(X_train_conformal_scaled,y_train_conformal)
true_label = lassoconformal.predict(X_train_conformal)
prediction = lassoconformal.predict(X_calibration_set_scaled)
lassoconformal.score(X_train_conformal_scaled,y_train_conformal)

0.5327268369010514

In [45]:
prediction

array([123.80910691, 138.94485414, 148.77809504, 158.74104021,
       126.56981827,  93.64400885, 273.61303532, 147.20690302,
       225.62765374,  69.23914972, 221.86209744,  78.68569344,
       200.41014118, 162.38583124, 241.40280993, 149.22993367,
       231.07425415, 259.30149226, 155.26582006, 157.27901635,
       172.13185638, 156.1570721 , 190.41429134, 123.01181416,
       136.93976095,  55.59063669,  87.39596123,  54.01913144,
       200.74681371, 122.78606572, 179.75827297, 150.21738711,
       154.14026758, 130.33310824, 100.54465166, 233.15526901,
       142.18384533, 103.09627561,  56.3835797 , 184.72113337,
       143.52245262, 106.14122452, 227.71886805, 174.68712927,
       176.903333  , 133.37277408, 150.22095319, 225.26290451,
       217.77015476,  51.85542431, 172.48691297,  67.91786764,
       119.241221  , 147.68105166, 160.23579568, 185.27184799,
        99.06109478, 188.10264602,  75.98843763, 232.73952735,
       121.78571647, 136.8247002 ,  91.44738574, 112.26

In [46]:
m=99
interval5  = (1 - 0.05) *(m +1)

In [47]:
interval20=(1-0.2) *(m +1)

In [48]:
alphas = abs(np.subtract(y_calibration_set ,prediction))
alphas = sorted(alphas)

In [49]:
alphas


[0.1570721028676587,
 0.2629045124132574,
 0.5906366894597284,
 0.6130353214851425,
 1.0551140776330783,
 1.6856934397863768,
 2.76085027843979,
 3.5854567671755007,
 3.855424313598249,
 4.430181732765476,
 4.9257458505906015,
 8.1908930916656,
 9.383579696052777,
 10.134314682419799,
 11.681051660060916,
 11.869778574832196,
 12.061094781916253,
 12.414291335351578,
 12.544651663540435,
 12.824700204362273,
 12.828937351664962,
 14.666891764001377,
 15.096275607797779,
 15.137902561366388,
 15.22993367012441,
 15.917867640686467,
 16.14122451855542,
 16.501296871964428,
 17.512953215377337,
 17.627653735523893,
 17.980868557296276,
 19.609171328895712,
 20.109164139003497,
 21.2139342762554,
 23.21937234889026,
 24.265820064268212,
 24.271847987635965,
 24.402809929485215,
 25.84473099313834,
 26.447385742576685,
 28.518393757468345,
 28.52245261740393,
 30.728057778675833,
 30.859732417549992,
 31.039293161902492,
 34.41348855634584,
 38.644008853181134,
 39.26047265314307,
 39.83295

In [50]:
conformity_at_5 = alphas[int(interval5) -1 ]

In [51]:
conformity_at_20 = alphas[int(interval20) -1 ]

In [52]:
conformity_at_5

106.19951230035454

In [53]:
conformity_at_20

82.24172702903181

In [54]:
length_prediction_interval_5 = (conformity_at_5 * 2)
length_prediction_interval_20 = (conformity_at_20 * 2)

In [55]:
print ("Length of prediction interval for 5%:",length_prediction_interval_5)
print ("Length of prediction interval for 20%:",length_prediction_interval_20)

Length of prediction interval for 5%: 212.39902460070908
Length of prediction interval for 20%: 164.48345405806361


# test error rates at significance levels 5% and 20% 

In [56]:
p_5 = true_label + conformity_at_5
n_5 = true_label - conformity_at_5

In [57]:
p_20 = true_label + conformity_at_20
n_20 = true_label - conformity_at_20

In [58]:
p_5

array([1144.68474663,  948.33592179, 1535.56684111,  740.91296406,
        668.02274382, 1286.83604504, 1446.94778214, 1169.99867932,
        759.22142292,  780.65683514,  748.2365304 , 1346.13281496,
       1151.41373057,  754.72384949, 1571.46816646, 1251.26139798,
        832.53349192, 1155.35974055,  917.12169519, 1141.92545187,
       1720.1138165 ,  784.61557314, 1432.54897423, 1690.13474316,
        533.23043495, 1450.46328889,  681.90791531, 1616.50021301,
        785.37486226,  974.48528213, 1048.46050424, 1533.95915648,
        686.53707622,  872.68730827, 1101.56199662,  665.61762933,
       1121.16298139,  395.76753025, 1684.51955261,  747.18882804,
        753.62516936,  995.03352256,  873.70046899, 1683.00732577,
        920.70020443,  207.47912719,  775.55253022, 1133.47680583,
       1246.09222797, 1993.1138173 , 1103.95661858,  968.91809494,
        659.44447626,  907.56142307, 1161.54590849,  925.70132414,
       1255.49486035, 1315.51651399, 1277.58873298, 1257.87121

In [59]:
ter5 = []
for i in range(1,len(y_test_raw)):
    if n_5[i]<=y_test_raw[i] and p_5[i] >= y_test_raw[i]:
        ter5.append(True)
    else:
        ter5.append(False)

In [65]:
ter20 = []
for i in range(1,len(y_test_raw)):
    if n_20[i]<=y_test_raw[i] and p_20[i] >= y_test_raw[i]:
        ter20.append(True)
    else:
        ter20.append(False)

In [66]:
print ("Test error rate at 5% interval:", 1 - np.mean(ter5))
print ("Test error rate at 20% interval:", 1 - np.mean(ter20))

Test error rate at 5% interval: 0.990909090909091
Test error rate at 20% interval: 0.990909090909091
