# Regularization

### Two Types
1. Ridge (L2): Penalties Applied on Square of Coeff
2. Lasso (L1): Penalties Applied on Absolute Value of Coeff

In [26]:
from warnings import filterwarnings

filterwarnings("ignore")

# Step 1 - Data Ingestion

In [27]:
import pandas as pd

df = pd.read_csv("Cars93.csv", na_values=["", "NA"], keep_default_na=False)
df.head()

Unnamed: 0,id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,2,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,3,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90
3,4,Audi,100,Midsize,30.8,37.7,44.6,19,26,,...,6,193,106,70,37,31.0,17.0,3405,non-USA,Audi 100
4,5,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,...,4,186,109,69,39,27.0,13.0,3640,non-USA,BMW 535i


# Step 2 - Basic Data Quality Checks

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94 entries, 0 to 93
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  94 non-null     int64  
 1   Manufacturer        94 non-null     object 
 2   Model               94 non-null     object 
 3   Type                94 non-null     object 
 4   Min.Price           94 non-null     float64
 5   Price               94 non-null     float64
 6   Max.Price           94 non-null     float64
 7   MPG.city            94 non-null     int64  
 8   MPG.highway         94 non-null     int64  
 9   AirBags             90 non-null     object 
 10  DriveTrain          94 non-null     object 
 11  Cylinders           94 non-null     object 
 12  EngineSize          94 non-null     float64
 13  Horsepower          94 non-null     int64  
 14  RPM                 94 non-null     int64  
 15  Rev.per.mile        94 non-null     int64  
 16  Man.trans.

In [29]:
m = df.isna().sum()
m[m > 0]

AirBags            4
Rear.seat.room     2
Luggage.room      11
dtype: int64

In [30]:
df.duplicated().sum()

np.int64(1)

In [31]:
df = df.drop_duplicates(keep="first").reset_index(drop=True)
df.head()

Unnamed: 0,id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,2,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,3,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90
3,4,Audi,100,Midsize,30.8,37.7,44.6,19,26,,...,6,193,106,70,37,31.0,17.0,3405,non-USA,Audi 100
4,5,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,...,4,186,109,69,39,27.0,13.0,3640,non-USA,BMW 535i


In [32]:
df.shape

(93, 28)

# Step 3 - Separate X & Y(Weight)
1. ID is Called Statiscally Insignificant

In [33]:
X = df.drop(columns=["id", "Weight"])
Y = df[["Weight"]]

In [34]:
X.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25,31,,Front,...,13.2,5,177,102,68,37,26.5,11.0,non-USA,Acura Integra
1,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,Front,...,18.0,5,195,115,71,38,30.0,15.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,Front,...,16.9,5,180,102,67,37,28.0,14.0,non-USA,Audi 90
3,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,21.1,6,193,106,70,37,31.0,17.0,non-USA,Audi 100
4,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,Rear,...,21.1,4,186,109,69,39,27.0,13.0,non-USA,BMW 535i


In [35]:
Y.head()

Unnamed: 0,Weight
0,2705
1,3560
2,3375
3,3405
4,3640


# Step 4 - Preprocessing

In [36]:
cat = list(X.columns[X.dtypes == "object"])
con = list(X.columns[X.dtypes != "object"])

In [37]:
cat

['Manufacturer',
 'Model',
 'Type',
 'AirBags',
 'DriveTrain',
 'Cylinders',
 'Man.trans.avail',
 'Origin',
 'Make']

In [38]:
con

['Min.Price',
 'Price',
 'Max.Price',
 'MPG.city',
 'MPG.highway',
 'EngineSize',
 'Horsepower',
 'RPM',
 'Rev.per.mile',
 'Fuel.tank.capacity',
 'Passengers',
 'Length',
 'Wheelbase',
 'Width',
 'Turn.circle',
 'Rear.seat.room',
 'Luggage.room']

In [39]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [44]:
num_pipe = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

In [45]:
# Instead of strategy="most_frequent", constant can also be used
# to fill missing values with a specific value
# fill_value="Not Available" is used to fill missing values in categorical columns
# handle_unknown="ignore" ensures that any unknown categories in the test set are ignored
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="Not Available"),
    OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False),
)

In [46]:
pre = ColumnTransformer(
    [
        ("num", num_pipe, con),
        ("cat", cat_pipe, cat),
    ]
).set_output(transform="pandas")

In [47]:
X_pre = pre.fit_transform(X)
X_pre.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
0,-0.485787,-0.37572,-0.282465,0.471312,0.360925,-0.841022,-0.073484,1.717489,1.12953,-1.062184,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.388017,1.497844,1.531409,-0.781032,-0.770514,0.515869,1.078322,0.369586,0.005661,0.409445,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.008658,0.998227,0.948052,-0.423219,-0.581941,0.128186,0.540813,0.369586,-0.105713,0.072197,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.571949,1.893374,2.069191,-0.602126,-0.581941,0.128186,0.540813,0.369586,0.410659,1.359872,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.755752,1.091905,1.303535,-0.065407,0.172352,0.806631,1.231897,0.706562,0.430909,1.359872,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Step 5 - Train & Test

In [48]:
from sklearn.model_selection import train_test_split

In [50]:
xtrain, xtest, ytrain, ytest = train_test_split(
    X_pre, Y, test_size=0.2, random_state=21
)

In [51]:
xtrain.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
57,1.365026,1.28967,1.185041,-0.423219,-0.016221,-0.356418,-0.265452,-0.304365,0.18791,-0.663618,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,-1.003095,-0.979424,-0.911397,0.1135,0.172352,-0.841022,-0.323043,2.054464,0.157535,-1.062184,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62,0.606307,0.685966,0.729294,-0.781032,-0.959087,0.322027,1.116716,1.212025,-0.247462,0.716035,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29,0.043016,-0.021825,-0.063707,-0.423219,-0.204794,0.806631,1.347077,0.87505,-0.71321,0.409445,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
51,1.985795,1.726835,1.449374,-0.781032,-0.581941,1.872759,1.27029,-1.146804,-0.996708,1.022624,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
ytrain.head()

Unnamed: 0,Weight
57,2920
31,2530
62,3730
29,3490
51,4055


In [53]:
xtest.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
23,-1.003095,-0.85452,-0.701753,0.1135,-0.016221,-0.453339,-0.975733,-0.809828,0.532158,-0.816912,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86,0.203957,0.332071,0.4285,-0.781032,-1.336233,-0.259498,-0.111878,-0.472853,0.370159,0.961306,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91,0.537333,0.332071,0.145937,-0.244313,-0.204794,-0.356418,-0.572601,0.201098,-0.237337,-0.265051,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
21,1.422504,1.039862,0.692834,-0.423219,-0.581941,0.612789,0.060893,-0.809828,-1.108083,-0.203734,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.100495,-0.073868,-0.209546,-0.959938,-0.581941,2.260442,0.502419,-1.820755,-1.988953,1.942392,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
ytest.head()

Unnamed: 0,Weight
23,2670
86,3785
91,2985
21,3570
17,3910


In [55]:
xtrain.shape

(74, 249)

In [56]:
xtest.shape

(19, 249)

# Step 6 - Model Building

### Linear Regression

In [63]:
from sklearn.linear_model import LinearRegression

model0 = LinearRegression()
model0.fit(xtrain, ytrain)

In [64]:
model0.intercept_

array([3072.93772431])

In [65]:
model0.coef_

array([[-2.91066653e-01,  5.99642182e-01,  1.57944459e+00,
        -3.03193174e+01, -7.07636219e+01,  6.83960907e+01,
         8.61040961e+01, -2.04104346e+01, -4.17210736e-01,
         3.76033822e+01,  4.55652561e+01,  5.97938685e+01,
         1.57266464e+02,  7.43315641e+01, -6.64670974e-01,
        -2.53326735e+01,  1.07374611e+01,  6.95499265e+01,
         7.69669720e+01, -5.10818137e+01, -4.83614086e+01,
        -2.10230559e+01,  4.26325641e-14,  1.90827777e+01,
         6.74660951e+00, -4.08549899e+01, -1.46701805e+01,
        -1.72916699e+00,  1.76812310e+01, -7.79987281e+00,
        -1.03543808e+01,  2.55295298e+01,  3.41440949e+01,
        -1.96704101e+01, -5.27698339e+01, -2.10828726e+00,
         7.25321291e+00,  4.45149371e+01, -2.94143440e+01,
         4.10479178e+01,  9.07067796e+00, -6.18126254e+01,
        -7.13103584e+00,  4.01484360e+01, -1.77635684e-14,
        -1.74598500e+01,  3.08993101e+01, -4.27430874e+00,
        -3.25463800e+01,  2.13162821e-14, -2.02234539e+0

In [66]:
model0.score(xtrain, ytrain)

1.0

In [67]:
model0.score(xtest, ytest)

0.9328654662875597

### Ridge

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression is a type of linear regression that includes a regularization term
# to prevent overfitting by penalizing large coefficients.
model1 = Ridge(alpha=1.0)
model1.fit(xtrain, ytrain)

In [60]:
model1.intercept_

array([3068.06476076])

In [61]:
model1.coef_

array([  3.37799994,   2.21911405,   1.22086315, -31.38867953,
       -69.36955122,  63.49453014,  80.00262866, -19.66069141,
        -0.61029152,  46.26809641,  48.87374915,  57.45632053,
       150.50247361,  71.63104126,   5.64024632, -21.76619613,
         7.739072  ,  52.29535072,  58.86311264, -42.69089793,
       -36.83262649, -18.4381718 ,   0.        ,  14.39943604,
         5.86249165, -34.35354152, -15.87350641,  -2.05458254,
        16.48385065,  -9.82585879,  -6.67696088,  16.94963697,
        25.47402488, -17.13158425, -39.84041339,   2.41210104,
         5.58403572,  42.73866299, -22.8446092 ,  25.51355326,
         6.59536064, -46.99282262,  -7.37971781,  31.68489314,
         0.        , -13.42555403,  26.62784808,  -1.90611283,
       -23.14741099,   0.        , -16.69300239,   4.4085833 ,
        58.86311264,   6.66544378,  -1.90611283,  52.29535072,
       -46.99282262,  13.2332682 ,  28.87318929, -38.92881977,
        13.8010025 , -18.18576218,  11.80284945,   0.  

In [62]:
model1.score(xtrain, ytrain)

0.9976911764521468

In [59]:
model1.score(xtest, ytest)

0.9341398598570653

# Hyper-Parameter Tuning for Ridge

In [68]:
import numpy as np

# Dictionary to store the scores for different alpha values
alphas = {"alpha": np.arange(start=0.1, stop=60, step=0.1)}
print(alphas)

{'alpha': array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,  1.1,
        1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ,  2.1,  2.2,
        2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9,  3. ,  3.1,  3.2,  3.3,
        3.4,  3.5,  3.6,  3.7,  3.8,  3.9,  4. ,  4.1,  4.2,  4.3,  4.4,
        4.5,  4.6,  4.7,  4.8,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,  5.5,
        5.6,  5.7,  5.8,  5.9,  6. ,  6.1,  6.2,  6.3,  6.4,  6.5,  6.6,
        6.7,  6.8,  6.9,  7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,  7.7,
        7.8,  7.9,  8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,  8.8,
        8.9,  9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,  9.9,
       10. , 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11. ,
       11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12. , 12.1,
       12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13. , 13.1, 13.2,
       13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14. , 14.1, 14.2, 14.3,
       14.4, 14.5, 14.6, 14.7, 14.8, 14.9

In [70]:
from sklearn.model_selection import GridSearchCV

# GridSearchCV is used to perform hyperparameter tuning by exhaustively searching
# through a specified parameter grid to find the best combination of parameters.

base_ridge = Ridge()
gscv_ridge = GridSearchCV(
    estimator=base_ridge,
    param_grid=alphas,
    scoring="r2",
    cv=5,  # 5-fold cross-validation
    n_jobs=-1,  # Use all available CPU cores for parallel processing
)
gscv_ridge.fit(xtrain, ytrain)

In [71]:
# The best alpha value found by GridSearchCV
gscv_ridge.best_params_

{'alpha': np.float64(8.2)}

In [72]:
gscv_ridge.best_score_
# The best estimator found by GridSearchCV

np.float64(0.9315295464100322)

In [76]:
best_ridge = gscv_ridge.best_estimator_
best_ridge

In [75]:
best_ridge.score(xtrain, ytrain)

0.9800757265881885

In [77]:
best_ridge.score(xtest, ytest)

0.9344477856797779

### Lasso

In [83]:
from sklearn.linear_model import Lasso

# Lasso regression is a type of linear regression that includes a regularization term
# that can shrink some coefficients to zero, effectively performing variable selection.
model2 = Lasso(alpha=0.6)
model2.fit(xtrain, ytrain)

In [84]:
model2.score(xtrain, ytrain)

0.9968363246498101

In [85]:
model2.score(xtest, ytest)

0.911159495802797

In [86]:
# Direct Cross-Validation
from sklearn.model_selection import cross_val_score

# CrossValScore is used to evaluate the performance of a model using cross-validation.

cv_scores = cross_val_score(model2, xtrain, ytrain, cv=5, scoring="r2", n_jobs=-1)
cv_scores

array([0.91574625, 0.90840844, 0.95691878, 0.87972868, 0.90528953])

In [87]:
cv_scores.mean()

np.float64(0.9132183366657621)

In [88]:
alphas2 = {"alpha": np.arange(start=1, stop=100, step=1)}
# GridSearchCV for Lasso Regression
print(alphas2)

{'alpha': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
       86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])}


In [89]:
base_lasso = Lasso()
gscv_lasso = GridSearchCV(
    estimator=base_lasso,
    param_grid=alphas2,
    scoring="r2",
    cv=5,  # 5-fold cross-validation
    n_jobs=-1,  # Use all available CPU cores for parallel processing
)
gscv_lasso.fit(xtrain, ytrain)
# The best alpha value found by GridSearchCV for Lasso Regression

In [91]:
gscv_lasso.best_params_

{'alpha': np.int64(9)}

In [92]:
gscv_lasso.best_score_

np.float64(0.9451010082898754)

In [93]:
best_lasso = gscv_lasso.best_estimator_
best_lasso

In [94]:
best_lasso.score(xtrain, ytrain)

0.9624012890992683

In [95]:
best_lasso.score(xtest, ytest)

0.9018128956346282

### On Comparing both models, Ridge is marginally better

# Step 7 - Model Evaluation

In [96]:
best_ridge

In [97]:
best_ridge.score(xtrain, ytrain)

0.9800757265881885

In [98]:
best_ridge.score(xtest, ytest)

0.9344477856797779

In [99]:
from sklearn.metrics import (
    mean_absolute_error,
    r2_score,
    root_mean_squared_error,
    mean_absolute_percentage_error,
)

In [100]:
def evaluate_model(model, x, y):
    ypred = model.predict(x)

    mae = mean_absolute_error(y, ypred)
    rmse = root_mean_squared_error(y, ypred)
    r2 = r2_score(y, ypred)
    mape = mean_absolute_percentage_error(y, ypred)

    # Print the evaluation metrics
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAPE {mape:.2%}")
    print(f"R2: {r2:.2%}")

### For Ridge

In [101]:
print("Evaluation on Training Set:")
evaluate_model(best_ridge, xtrain, ytrain)

Evaluation on Training Set:
MAE: 62.45
RMSE: 81.87
MAPE 2.09%
R2: 98.01%


In [102]:
print("Evaluation on Testing Set:")
evaluate_model(best_ridge, xtest, ytest)

Evaluation on Testing Set:
MAE: 111.04
RMSE: 154.19
MAPE 3.50%
R2: 93.44%


In [105]:
print("Evaluation on Training Set:")
evaluate_model(best_lasso, xtrain, ytrain)

Evaluation on Training Set:
MAE: 86.21
RMSE: 112.46
MAPE 2.88%
R2: 96.24%


### For Lasso

In [106]:
print("Evaluation on Testing Set:")
evaluate_model(best_lasso, xtest, ytest)

Evaluation on Testing Set:
MAE: 140.03
RMSE: 188.71
MAPE 4.35%
R2: 90.18%


### When Evaluating both best_ridge and best_lasso models, it was clearly visible that ridge is the better model, in this case

# Step 8 - Out of Sample Prediction

In [107]:
xnew = pd.read_csv("sample.csv", na_values=["", "NA"], keep_default_na=False)
xnew.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,15.0,6,190,106,65,37,31.0,17.0,non-USA,Audi 100
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,15.2,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird
2,Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,...,16.5,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina
3,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,20.0,2,169,96,69,37,,,non-USA,Mazda RX-7
4,Volkswagen,Fox,Small,8.7,9.1,9.5,25,33,,Front,...,12.4,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox


In [108]:
pre

In [109]:
xnew_pre = pre.transform(xnew)
xnew_pre

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
0,1.571949,1.893374,2.069191,-0.602126,-0.581941,0.128186,0.540813,0.369586,0.410659,-0.510323,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.888138,-0.875337,-0.829362,0.1135,0.360925,-0.647181,-0.649388,-0.135877,0.673908,-0.449005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.428309,-0.37572,-0.318925,-0.244313,-0.016221,-0.453339,-0.649388,-0.135877,0.532158,-0.050439,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.767376,1.352122,0.966282,-0.959938,-0.770514,-1.325626,2.134145,2.054464,-0.014589,1.022624,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.968608,-1.083511,-1.130155,0.471312,0.738071,-0.841022,-1.206095,0.369586,0.441034,-1.307455,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [110]:
predictions = best_ridge.predict(xnew_pre)
predictions

array([3294.89387326, 2684.77028805, 3144.5076699 , 2960.73605695,
       2260.79087162])

In [111]:
xnew["Weight_pred"] = predictions.round(2)
xnew

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make,Weight_pred
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,6,190,106,65,37,31.0,17.0,non-USA,Audi 100,3294.89
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird,2684.77
2,Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,...,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina,3144.51
3,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,2,169,96,69,37,,,non-USA,Mazda RX-7,2960.74
4,Volkswagen,Fox,Small,8.7,9.1,9.5,25,33,,Front,...,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox,2260.79


In [113]:
xnew.to_csv("RidgeResults.csv", index=False)

# Step 9 - Saving the Preprocessor and Model

In [114]:
import joblib

In [115]:
pre

In [116]:
best_ridge

In [117]:
joblib.dump(pre, "pre.joblib")

['pre.joblib']

In [118]:
joblib.dump(best_ridge, "Ridge.joblib")

['Ridge.joblib']

### Loading the Preprocessor and Model from Files

In [120]:
p = joblib.load("pre.joblib")
m = joblib.load("Ridge.joblib")

In [121]:
p

In [122]:
m