In [50]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline


# Ridge Regression
**from sklearn.linear_model import Ridge**

In [51]:
concrete_compressive_strength = fetch_ucirepo(id=165) 
X = concrete_compressive_strength.data.features 
y = concrete_compressive_strength.data.targets 
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=24, test_size=0.3)


# With Linear Regression()

In [52]:
lr = LinearRegression()
lr.fit(X, y)
lr.intercept_,lr.coef_

(array([-23.33121358]),
 array([[ 0.11980433,  0.10386581,  0.08793432, -0.14991842,  0.2922246 ,
          0.01808621,  0.02019035,  0.11422207]]))

In [53]:
# R2 Score for Linear Regression()
y_pred = lr.predict(X_test)
r2_score(y_test, y_pred)

0.5884767547970231

# With Ridge Regression()

In [54]:
ridge = Ridge()
ridge.fit(X, y)
ridge.intercept_,lr.coef_

(array([-23.32957301]),
 array([[ 0.11980433,  0.10386581,  0.08793432, -0.14991842,  0.2922246 ,
          0.01808621,  0.02019035,  0.11422207]]))

In [55]:
# R2 Score with Ridge Regression()
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
r2_score(y_test, y_pred)

0.5771749099675626

# With Polinominal Transformation

In [56]:
poly = PolynomialFeatures(degree=3,include_bias=False).set_output(transform='pandas')
X_poly_trn = poly.fit_transform(X_train)
X_Poly_test = poly.transform(X_test)
ridge.fit(X_poly_trn,y_train)
y_pred = ridge.predict(X_Poly_test)
r2_score(y_test,y_pred)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


0.8695065063191985

In [57]:
df_coef = pd.DataFrame({'Col_names': list(X_poly_trn.columns),'Coef': list(ridge.coef_[0])})
print(df_coef.shape)
df_coef[df_coef['Coef'] > 0.0001]

(164, 2)


Unnamed: 0,Col_names,Coef
1,Blast Furnace Slag,0.636144
5,Coarse Aggregate,2.612272
6,Fine Aggregate,0.909094
7,Age,0.123618
8,Cement^2,0.010445
9,Cement Blast Furnace Slag,0.031224
10,Cement Fly Ash,0.008666
14,Cement Fine Aggregate,0.023399
16,Blast Furnace Slag^2,0.004927
17,Blast Furnace Slag Fly Ash,0.015132


In [58]:
print(df_coef[df_coef['Coef'] > 0.0001].shape)

(35, 2)


# Considering Different Values of alpha

In [59]:
ridge = Ridge(alpha = 0.22)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
r2_score(y_test, y_pred)

0.5771751967997472

**Ridge :-**
1. Parameters : b0, b1, b2, ......bp
2. Hyper parameter : alpha : Externally set

**Hyper-parameter Tuning**
1. Process of Searching best Hyper-Parameters

In [60]:
alphas = [0.01, 0.1, 0.3, 0.6, 1, 1.5, 2, 4, 10]
scores = []
for a in alphas:
    ridge = Ridge(alpha = a)
    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    r2_score(y_test, y_pred)
    scores.append(r2_score(y_test,y_pred))
print(scores)

print("\nMax Score Is :        ",np.max(scores))
print("\nIndex of Maximum is : ",np.argmax(scores))

i_max = np.argmax(scores)
print("Best alpha is:",alphas[i_max])
print("Best score is:",scores[i_max])

[0.5771752740273375, 0.5771752409296147, 0.5771751673801079, 0.5771750570584018, 0.5771749099675626, 0.5771747261116779, 0.5771745422643093, 0.5771738069600134, 0.5771716018651836]

Max Score Is :         0.5771752740273375

Index of Maximum is :  0
Best alpha is: 0.01
Best score is: 0.5771752740273375


In [61]:
# Creating Many alpha values quickly 
# np.arange(0.01, 30, 20)
# np.linspace(0.0001,10,20)

In [62]:
alphas = np.arange(0.01, 30, 20)
scores = []
for a in alphas:
    ridge = Ridge(alpha = a)
    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    r2_score(y_test, y_pred)
    scores.append(r2_score(y_test,y_pred))
print(scores)

print("\nMax Score Is :        ",np.max(scores))
print("\nIndex of Maximum is : ",np.argmax(scores))

i_max = np.argmax(scores)
print("Best alpha is:",alphas[i_max])
print("Best score is:",scores[i_max])

[0.5771752740273375, 0.5771679257657525]

Max Score Is :         0.5771752740273375

Index of Maximum is :  0
Best alpha is: 0.01
Best score is: 0.5771752740273375


In [63]:
alphas = np.linspace(0.0001,10,20)
scores = []
for a in alphas:
    ridge = Ridge(alpha = a)
    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    r2_score(y_test, y_pred)
    scores.append(r2_score(y_test,y_pred))
print(scores)

print("\nMax Score Is :        ",np.max(scores))
print("\nIndex of Maximum is : ",np.argmax(scores))

i_max = np.argmax(scores)
print("Best alpha is:",alphas[i_max])
print("Best score is:",scores[i_max])

[0.5771752776681036, 0.5771750841198391, 0.5771748905810096, 0.5771746970516156, 0.577174503531658, 0.5771743100211375, 0.5771741165200551, 0.5771739230284114, 0.5771737295462072, 0.5771735360734433, 0.5771733426101202, 0.5771731491562393, 0.5771729557118006, 0.577172762276805, 0.5771725688512538, 0.577172375435147, 0.577172182028486, 0.5771719886312712, 0.5771717952435036, 0.5771716018651836]

Max Score Is :         0.5771752776681036

Index of Maximum is :  0
Best alpha is: 0.0001
Best score is: 0.5771752776681036


# Lasso Regression 
**from sklearn.linear_model import lasso**

In [64]:
lasso = Lasso()
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
r2_score(y_test, y_pred)

0.576333587787164

# Hyper-Parameter Optimization

In [65]:
alphas = np.linspace(0.0001,10,20)
scores = []
for a in alphas:
    lasso = Lasso(alpha = a)
    lasso.fit(X_train, y_train)
    y_pred = lasso.predict(X_test)
    r2_score(y_test, y_pred)
    scores.append(r2_score(y_test,y_pred))
print(scores)

print("\nMax Score Is :        ",np.max(scores))
print("\nIndex of Maximum is : ",np.argmax(scores))

i_max = np.argmax(scores)
print("Best alpha is:",alphas[i_max])
print("Best score is:",scores[i_max])

[0.5771752162488388, 0.5767887675592975, 0.5762767463931009, 0.5756396636831074, 0.574877359379247, 0.5739906651543902, 0.572979464363941, 0.5721009707960271, 0.5722614778993542, 0.5724099561568741, 0.572544735498197, 0.5726666359648799, 0.5727764086956115, 0.5728731096945463, 0.5729565781568045, 0.5730274603207464, 0.573085805075227, 0.5731315172304163, 0.5731560475699367, 0.5731618042353835]

Max Score Is :         0.5771752162488388

Index of Maximum is :  0
Best alpha is: 0.0001
Best score is: 0.5771752162488388


# Elastic Net Regression
**from sklearn.linear_model import ElasticNet**

**0 < R <= 1**
- where,
- R = L1_Ratio
- When L1_ratio = 1, it is equivalent to Lasso

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=24, test_size=0.3)
en = ElasticNet()
en.fit(X_train, y_train)
y_pred = en.predict(X_test)
r2_score(y_test, y_pred)

0.5766806310401154

In [67]:
alphas = np.linspace(0.0001, 10, 20)
l1 = np.linspace(0.0001, 1, 10)
scores = []
for a in alphas:
    for i in l1:
        en = ElasticNet(alpha = a, l1_ratio=i)
        en.fit(X_train, y_train)
        y_pred = en.predict(X_test)
        r2_score(y_test, y_pred)
        scores.append([a,i,r2_score(y_test,y_pred)])
df_scores = pd.DataFrame(scores, columns=['Alpha','L1 Ratio', 'Score'])
df_scores.sort_values('Score',ascending = False)
best_a = df_scores['Alpha'].iloc[0]
best_sc = df_scores['Score'].iloc[0]
best_l1 = df_scores['L1 Ratio'].iloc[0]
print("Best Alpha : ",best_a)
print("Best Score : ",best_sc)
print("Best L1 Ratio : ",best_l1)

Best Alpha :  0.0001
Best Score :  0.5771752511863831
Best L1 Ratio :  0.0001


# Searching Best Hyper-parameter for *Housing Dataset.csv*

In [68]:
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

In [69]:
housing = pd.read_csv('Housing.csv')
housing.head()

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
0,42000.0,5850,3,1,2,yes,no,yes,no,no,1,no
1,38500.0,4000,2,1,1,yes,no,no,no,no,0,no
2,49500.0,3060,3,1,1,yes,no,no,no,no,0,no
3,60500.0,6650,3,1,2,yes,yes,no,no,no,0,no
4,61000.0,6360,2,1,1,yes,no,no,no,no,0,no


In [70]:
X = housing[['driveway']] 
y = housing['price']

In [71]:
str_col = (housing.columns[housing.dtypes == 'object'])

In [72]:
num_col = (housing.columns[housing.dtypes != 'object'])

In [73]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output=False,drop='first').set_output(transform='pandas') 

In [74]:
ct = make_column_transformer(('passthrough',num_col),(ohe,str_col),verbose_feature_names_out=False).set_output(transform='pandas')
ct.fit_transform(housing)

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,garagepl,driveway_yes,recroom_yes,fullbase_yes,gashw_yes,airco_yes,prefarea_yes
0,42000.0,5850,3,1,2,1,1.0,0.0,1.0,0.0,0.0,0.0
1,38500.0,4000,2,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
2,49500.0,3060,3,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
3,60500.0,6650,3,1,2,0,1.0,1.0,0.0,0.0,0.0,0.0
4,61000.0,6360,2,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
541,91500.0,4800,3,2,4,0,1.0,1.0,0.0,0.0,1.0,0.0
542,94000.0,6000,3,2,4,0,1.0,0.0,0.0,0.0,1.0,0.0
543,103000.0,6000,3,2,4,1,1.0,1.0,0.0,0.0,1.0,0.0
544,105000.0,6000,3,2,2,1,1.0,1.0,0.0,0.0,1.0,0.0


In [75]:
ct = make_column_transformer(('passthrough',make_column_selector(dtype_exclude=object)),
                             (ohe,make_column_selector(dtype_include=object)),
                             verbose_feature_names_out=False).set_output(transform='pandas')
X= housing.drop('price',axis=1)
y = housing['price']
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=24,test_size=0.3)

In [84]:
X_ohe_trn = ct.fit_transform(X_train)
X_ohe_tst = ct.transform(X_test)
lr.fit(X_ohe_trn,y_train)
y_pred = lr.predict(X_ohe_tst)
r2_score(y_test,y_pred)


0.6246856191453717

In [82]:
from sklearn.pipeline import Pipeline
lr = LinearRegression()
pipe = Pipeline([('trfn',ct),('LR',lr)])
pipe.fit(X_train,y_train)

In [83]:
y_pred = pipe.predict(X_test)
r2_score(y_test,y_pred)

0.6246856191453717

In [89]:
X_ohe_trn = ct.fit_transform(X_train)
X_ohe_tst = ct.transform(X_test)
alphas = np.linspace(0.0001, 10, 20)
l1 = np.linspace(0.0001, 1, 10)
scores = []
for a in alphas:
    for i in l1:
        en = ElasticNet(alpha = a, l1_ratio=i)
        en.fit(X_ohe_trn, y_train)
        y_pred = en.predict(X_ohe_tst)
        r2_score(y_test, y_pred)
        scores.append([a,i,r2_score(y_test,y_pred)])
df_scores = pd.DataFrame(scores, columns=['Alpha','L1 Ratio', 'Score'])
df_scores.sort_values('Score',ascending = False, inplace=True)
best_a = df_scores['Alpha'].iloc[0]
best_sc = df_scores['Score'].iloc[0]
best_l1 = df_scores['L1 Ratio'].iloc[0]
print("Best Alpha : ",best_a)
print("Best Score : ",best_sc)
print("Best L1 Ratio : ",best_l1)

Best Alpha :  0.0001
Best Score :  0.6246856181760903
Best L1 Ratio :  1.0
