<a href="https://colab.research.google.com/github/Amri1003/Machine_Learning_Projects/blob/main/PracticingRidgeANDElascticNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To resolve overfitting we need to use either Lasso or Ridge if it is known which one to choose based on the requirement or use Elastic Net if not sure

In [3]:
# import the library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
# To avoid warnings
import warnings
warnings.filterwarnings('ignore', category = DeprecationWarning)
from warnings import filterwarnings
filterwarnings('ignore')

In [7]:
# import the dataset
from sklearn.datasets import load_diabetes
df = load_diabetes()
df.keys()

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])

In [15]:

# Reload the diabetes dataset to get the original Bunch object
diabetes_bunch = load_diabetes()

# change bunch load_diabetes into panda dataframe
df = pd.DataFrame(diabetes_bunch.data, columns = diabetes_bunch.feature_names)
# Add the target variable to the DataFrame
df['target'] = diabetes_bunch.target
df.columns

Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6',
       'target'],
      dtype='object')

In [10]:
df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
dtypes: float64(10)
memory usage: 34.7 KB


In [12]:
# look for missing value
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
bp,0
s1,0
s2,0
s3,0
s4,0
s5,0
s6,0


In [13]:
df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-2.511817e-19,1.23079e-17,-2.245564e-16,-4.79757e-17,-1.3814990000000001e-17,3.9184340000000004e-17,-5.777179e-18,-9.04254e-18,9.293722000000001e-17,1.130318e-17
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260971,-0.1377672
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665608,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324559,-0.03317903
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947171,-0.001077698
75%,0.03807591,0.05068012,0.03124802,0.03564379,0.02835801,0.02984439,0.0293115,0.03430886,0.03243232,0.02791705
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137,0.198788,0.1811791,0.1852344,0.1335973,0.1356118


In [17]:
# Define independent variables (X) and dependent variable (y)
# target is chosen as y variable
X = df.drop('target', axis=1)
y = df['target']

print("Independent variables (X) shape:", X.shape)
print("Dependent variable (y) shape:", y.shape)

Independent variables (X) shape: (442, 10)
Dependent variable (y) shape: (442,)


In [18]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (353, 10)
X_test shape: (89, 10)
y_train shape: (353,)
y_test shape: (89,)


In [19]:
# apply standard scalar
from sklearn.preprocessing import StandardScaler

std=StandardScaler()

X_train=std.fit_transform(X_train)  # z_score=val-mean/std

X_test=std.transform(X_test)  # now just transform

In [20]:
X_train

array([[ 1.49836523,  1.06136988,  0.21990201, ...,  0.71103773,
         0.54748197, -0.06144896],
       [-0.22885822,  1.06136988, -0.41936607, ...,  1.4842858 ,
        -0.01975653,  0.36723647],
       [ 0.08518241, -0.94217861,  1.01898711, ..., -0.06221033,
         0.3312366 , -0.31866022],
       ...,
       [ 0.63475351, -0.94217861, -0.46502808, ..., -0.83545839,
        -0.25375196, -0.06144896],
       [-0.30736838, -0.94217861, -0.53352109, ..., -0.06221033,
        -0.83072436, -0.83308273],
       [-2.03459183, -0.94217861,  0.56236706, ..., -0.83545839,
        -0.13312789, -0.06144896]])

In [21]:
# apply linear regression
from sklearn.linear_model import LinearRegression

lin_reg=LinearRegression()

lin_reg.fit(X_train,y_train)

print('Training completed')

Training completed


In [23]:
y_pred=lin_reg.predict(X_test)

y_pred

from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.45260276297191926

In [24]:
# evaluate model

from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

print('mse',mean_squared_error(y_test,y_pred))

print('mae',mean_absolute_error(y_test,y_pred))

print('r2-score test data',r2_score(y_test,y_pred))

mse 2900.1936284934823
mae 42.79409467959994
r2-score test data 0.45260276297191926


In [25]:
#calculate r2 score on training data

y_pred_train=lin_reg.predict(X_train)

print('r2-score train data',r2_score(y_train,y_pred_train))

# very poor data coming so LR is not a good choice
# so non linear pattern is present.

r2-score train data 0.5279193863361498


In [30]:
# Lets use Ridge
from sklearn.linear_model import Ridge

ridge_model=Ridge(alpha=5,max_iter=10000)

ridge_model.fit(X_train,y_train)

In [31]:
y_pred_ridge=ridge_model.predict(X_test)

y_pred_train_ridge=ridge_model.predict(X_train)

In [32]:






print('r2-score test data ridge',r2_score(y_test,y_pred_ridge))

print('r2-score train data ridge',r2_score(y_train,y_pred_train_ridge))






r2-score test data ridge 0.4561445748045444
r2-score train data ridge 0.5259758580235809


In [29]:
# check coeff, all values are present , not shrinked to zero

ridge_model.coef_

array([  1.80734179, -11.44818951,  25.73269892,  16.73429974,
       -34.67195409,  17.05307485,   3.36991411,  11.76426044,
        31.3783838 ,   2.45813922])

In [34]:
# To calculate optimal value of alpha
from sklearn.linear_model import RidgeCV

RidgeCV_model=RidgeCV(alphas=np.random.randint(1,1000,100),cv=10)

RidgeCV_model.fit(X_train,y_train)

print(RidgeCV_model.alpha_)










53


In [35]:
# Lets run again all above commands of Ridge model with alpha =53
# Lets use Ridge
from sklearn.linear_model import Ridge

ridge_model=Ridge(alpha=53,max_iter=10000)

ridge_model.fit(X_train,y_train)

In [37]:
y_pred_ridge=ridge_model.predict(X_test)

y_pred_train_ridge=ridge_model.predict(X_train)

In [38]:






print('r2-score test data ridge',r2_score(y_test,y_pred_ridge))

print('r2-score train data ridge',r2_score(y_train,y_pred_train_ridge))






r2-score test data ridge 0.4610992199531223
r2-score train data ridge 0.5194172316553419


In [39]:
# check coeff, all values are present , not shrinked to zero

ridge_model.coef_

array([ 2.02725013, -9.48114579, 23.51109918, 15.02805746, -4.52427019,
       -4.26594694, -9.11738956,  7.05019642, 18.2347285 ,  4.02182487])

In [40]:
# Using Elastic net
# elastic net
from sklearn.linear_model import ElasticNet
elastic_net=ElasticNet()

elastic_net.fit(X_train,y_train)




y_pred_elastic_net=elastic_net.predict(X_test)

y_pred_train_elastic_net=elastic_net.predict(X_train)












In [41]:
print('r2-score test data elastic_net',r2_score(y_test,y_pred_elastic_net))

print('r2-score train data elastic_net',r2_score(y_train,y_pred_train_elastic_net))






r2-score test data elastic_net 0.45477123774939965
r2-score train data elastic_net 0.49330296919620475


In [None]:
# We need to choose another model(Non Linear ones) for accuracy perspective but here we could successfully
# minimise the gap between test and train r2 score.