# Automated Preprocessing with Pipelines

In [1]:
from warnings import filterwarnings

filterwarnings("ignore")

# Step 1 - Data Ingestion

In [4]:
import pandas as pd

df = pd.read_csv("Cars93.csv", na_values=["", "NA"], keep_default_na=False)
df.head()

Unnamed: 0,id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,2,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,3,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90
3,4,Audi,100,Midsize,30.8,37.7,44.6,19,26,,...,6,193,106,70,37,31.0,17.0,3405,non-USA,Audi 100
4,5,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,...,4,186,109,69,39,27.0,13.0,3640,non-USA,BMW 535i


### Target Feature - Weight

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94 entries, 0 to 93
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  94 non-null     int64  
 1   Manufacturer        94 non-null     object 
 2   Model               94 non-null     object 
 3   Type                94 non-null     object 
 4   Min.Price           94 non-null     float64
 5   Price               94 non-null     float64
 6   Max.Price           94 non-null     float64
 7   MPG.city            94 non-null     int64  
 8   MPG.highway         94 non-null     int64  
 9   AirBags             90 non-null     object 
 10  DriveTrain          94 non-null     object 
 11  Cylinders           94 non-null     object 
 12  EngineSize          94 non-null     float64
 13  Horsepower          94 non-null     int64  
 14  RPM                 94 non-null     int64  
 15  Rev.per.mile        94 non-null     int64  
 16  Man.trans.

In [8]:
m = df.isna().sum()

In [9]:
m[m > 0]

AirBags            4
Rear.seat.room     2
Luggage.room      11
dtype: int64

In [10]:
df.duplicated().sum()

np.int64(1)

In [11]:
df.drop_duplicates(keep="first").reset_index(drop=True)

Unnamed: 0,id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,2,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,3,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90
3,4,Audi,100,Midsize,30.8,37.7,44.6,19,26,,...,6,193,106,70,37,31.0,17.0,3405,non-USA,Audi 100
4,5,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,...,4,186,109,69,39,27.0,13.0,3640,non-USA,BMW 535i
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,89,Volkswagen,Eurovan,Van,16.6,19.7,22.7,17,21,,...,7,187,115,72,38,34.0,,3960,non-USA,Volkswagen Eurovan
89,90,Volkswagen,Passat,Compact,17.6,20.0,22.4,21,30,,...,5,180,103,67,35,31.5,14.0,2985,non-USA,Volkswagen Passat
90,91,Volkswagen,Corrado,Sporty,22.9,23.3,23.7,18,25,,...,4,159,97,66,36,26.0,15.0,2810,non-USA,Volkswagen Corrado
91,92,Volvo,240,Compact,21.8,22.7,23.5,21,28,Driver only,...,5,190,104,67,37,29.5,14.0,2985,non-USA,Volvo 240


In [12]:
df.shape

(94, 28)

# Step 3 - Separate X & Y

In [13]:
X = df.drop(columns=["id", "Weight"])
Y = df[["Weight"]]
X.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25,31,,Front,...,13.2,5,177,102,68,37,26.5,11.0,non-USA,Acura Integra
1,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,Front,...,18.0,5,195,115,71,38,30.0,15.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,Front,...,16.9,5,180,102,67,37,28.0,14.0,non-USA,Audi 90
3,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,21.1,6,193,106,70,37,31.0,17.0,non-USA,Audi 100
4,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,Rear,...,21.1,4,186,109,69,39,27.0,13.0,non-USA,BMW 535i


In [14]:
Y.head()

Unnamed: 0,Weight
0,2705
1,3560
2,3375
3,3405
4,3640


# Step 4 - Apply Preprocessing

In [15]:
X.columns

Index(['Manufacturer', 'Model', 'Type', 'Min.Price', 'Price', 'Max.Price',
       'MPG.city', 'MPG.highway', 'AirBags', 'DriveTrain', 'Cylinders',
       'EngineSize', 'Horsepower', 'RPM', 'Rev.per.mile', 'Man.trans.avail',
       'Fuel.tank.capacity', 'Passengers', 'Length', 'Wheelbase', 'Width',
       'Turn.circle', 'Rear.seat.room', 'Luggage.room', 'Origin', 'Make'],
      dtype='object')

In [16]:
cat = list(X.columns[X.dtypes == "object"])

In [17]:
cat

['Manufacturer',
 'Model',
 'Type',
 'AirBags',
 'DriveTrain',
 'Cylinders',
 'Man.trans.avail',
 'Origin',
 'Make']

In [19]:
con = list(X.columns[X.dtypes != "object"])
con

['Min.Price',
 'Price',
 'Max.Price',
 'MPG.city',
 'MPG.highway',
 'EngineSize',
 'Horsepower',
 'RPM',
 'Rev.per.mile',
 'Fuel.tank.capacity',
 'Passengers',
 'Length',
 'Wheelbase',
 'Width',
 'Turn.circle',
 'Rear.seat.room',
 'Luggage.room']

### Pipeline Creation

In [27]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# make_pipeline is used to create a pipeline of transformations and estimators
# SimpleImputer is used to handle missing values in the dataset
# StandardScaler is used to standardize the features by removing the mean and scaling to unit variance
# OneHotEncoder is used to convert categorical variables into a format that can be provided to ML algorithms

In [28]:
# Create a preprocessing pipeline for categorical features, aka, numerical features
num_pipe = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

In [None]:
# Create a preprocessing pipeline for categorical features
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first"),
)

In [None]:
# Combine the numerical and categorical pipelines into a single ColumnTransformer
# numerical pipeline will strictly apply to continuous features
# categorical pipeline will strictly apply to categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipe, con),
        ("cat", cat_pipe, cat),
    ]
).set_output(transform="pandas")
# Fit the preprocessor to the data and transform the features
# set_output(transform="pandas") ensures the output is a DataFrame

In [34]:
X_pre = preprocessor.fit_transform(X)
X_pre.head()
# The preprocessor is a ColumnTransformer that applies different transformations to different columns
# fit_transform is used to fit the preprocessor to the data and transform the features

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
0,-0.482591,-0.373436,-0.28084,0.468246,0.358751,-0.833407,-0.07309,1.682139,1.115958,-1.050235,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.3989,1.508761,1.541987,-0.789329,-0.777967,0.525656,1.084859,0.347784,-0.006346,0.420485,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.017985,1.006842,0.955751,-0.430022,-0.588514,0.137352,0.544483,0.347784,-0.117566,0.083445,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.583587,1.906114,2.082423,-0.609675,-0.588514,0.137352,0.544483,0.347784,0.398088,1.370325,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.764041,1.100952,1.312989,-0.070715,0.169298,0.816883,1.239252,0.681373,0.418309,1.370325,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Step 5 - Train Test Split

In [36]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets

In [37]:
xtrain, xtest, ytrain, ytest = train_test_split(
    X_pre, Y, test_size=0.2, random_state=42
)  # random_state is set for reproducibility

In [39]:
xtrain.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
65,-0.043961,-0.038823,-0.033521,-0.968982,-1.156873,0.331504,0.1392,-0.819777,-0.552332,1.033285,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,-0.274819,-0.331609,-0.354119,-0.789329,-1.156873,1.108111,0.505884,-0.819777,-1.310646,1.033285,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
68,-0.332534,-0.331609,-0.317479,0.108939,0.358751,-0.445103,-0.652065,-0.152599,0.458753,-0.039115,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78,-0.909678,-0.875355,-0.821276,1.007206,1.684922,-0.736331,-1.134544,-0.486188,-0.390558,-1.172795,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30,-1.175165,-1.262251,-1.279272,1.546167,0.737657,-1.318786,-1.559126,-0.486188,1.641722,-2.030715,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
ytrain.head()

Unnamed: 0,Weight
65,4100
15,3715
68,2890
78,2495
30,1845


In [41]:
xtest.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
40,-0.009333,0.034373,0.076398,0.288592,0.358751,-0.348027,0.312893,0.848167,1.045182,-0.222955,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22,-1.059736,-1.074032,-1.031954,1.18686,0.737657,-1.124635,-0.99945,1.181756,1.914715,-1.050235,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55,-0.055504,-0.038823,-0.015202,-0.789329,-0.96742,0.331504,0.216397,-0.486188,-0.198452,0.910725,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72,-1.025107,-1.094945,-1.096074,1.546167,2.253281,-1.027559,-1.346835,0.514578,1.601279,-1.050235,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,-0.482591,-0.373436,-0.28084,0.468246,0.358751,-0.833407,-0.07309,1.682139,1.115958,-1.050235,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
ytest.head()

Unnamed: 0,Weight
40,2865
22,2270
55,3735
72,2350
0,2705


# Step 6 - Train the model

In [43]:
from sklearn.linear_model import LinearRegression

# Create a Linear Regression model

In [44]:
model = LinearRegression()

In [45]:
model.fit(xtrain, ytrain)  # Fit the model to the training data
# The fit method trains the model using the training data

# Step 7 - Model Evaluation

In [46]:
model.score(xtest, ytest)  # Evaluate the model on the test data
# The score method returns the coefficient of determination R^2 of the prediction

0.9224328409227927

In [48]:
model.score(xtrain, ytrain)  # Evaluate the model on the training data
# The score method returns the coefficient of determination R^2 of the prediction on training data

1.0

In [49]:
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    root_mean_squared_error,
    mean_absolute_percentage_error,
)

In [53]:
def evaluate_model(model, x, y):
    # This function evaluates the model's performance using various metrics
    # It calculates Mean Absolute Error (MAE), Mean Squared Error (MSE), Root Mean Squared Error (RMSE), and R-squared (R2)
    ypred = model.predict(x)

    mae = mean_absolute_error(y, ypred)
    mse = mean_squared_error(y, ypred)
    rmse = root_mean_squared_error(y, ypred)
    r2 = r2_score(y, ypred)
    mape = mean_absolute_percentage_error(y, ypred)

    # Print the evaluation metrics
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"R-squared (R2): {r2:.2%}")
    print(f"Mean Absolute Percentage Error (MAPE): {mape:.2%}")

In [54]:
print("Evaluation on Training Set:")
evaluate_model(model, xtrain, ytrain)

Evaluation on Training Set:
Mean Absolute Error (MAE): 0.00
Mean Squared Error (MSE): 0.00
Root Mean Squared Error (RMSE): 0.00
R-squared (R2): 100.00%
Mean Absolute Percentage Error (MAPE): 0.00%


In [55]:
print("\nEvaluation on Testing Set:")
evaluate_model(model, xtest, ytest)


Evaluation on Testing Set:
Mean Absolute Error (MAE): 118.05
Mean Squared Error (MSE): 25612.32
Root Mean Squared Error (RMSE): 160.04
R-squared (R2): 92.24%
Mean Absolute Percentage Error (MAPE): 4.03%


In [None]:
ypred_test = model.predict(xtest)
ypred_test[0:5]
# [0:5] is used to display the first 5 predictions

array([[2893.36952617],
       [2336.08293672],
       [3762.83872326],
       [2120.11264062],
       [2705.        ]])

In [57]:
ytest.head()

Unnamed: 0,Weight
40,2865
22,2270
55,3735
72,2350
0,2705


### From above results, R2 Score in test is greater than 0.8, thus it is a good model

# Step 8 - Out of Sample Prediction

In [58]:
xnew = pd.read_csv("sample.csv", na_values=["", "NA"], keep_default_na=False)

In [59]:
xnew.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,15.0,6,190,106,65,37,31.0,17.0,non-USA,Audi 100
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,15.2,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird
2,Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,...,16.5,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina
3,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,20.0,2,169,96,69,37,,,non-USA,Mazda RX-7
4,Volkswagen,Fox,Small,8.7,9.1,9.5,25,33,,Front,...,12.4,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox


In [60]:
m1 = xnew.isna().sum()
m1[m1 > 0]
# Check for missing values in the new data

AirBags           1
Rear.seat.room    1
Luggage.room      1
dtype: int64

In [None]:
# Checking the preprocessor
preprocessor

In [None]:
# fit is done only once, so we can use the preprocessor to transform the new data
xnew_pre = preprocessor.transform(xnew)
xnew_pre.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
0,1.583587,1.906114,2.082423,-0.609675,-0.588514,0.137352,0.544483,0.347784,0.398088,-0.498715,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.886592,-0.875355,-0.830436,0.108939,0.358751,-0.639255,-0.652065,-0.152599,0.66097,-0.437435,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.424877,-0.373436,-0.317479,-0.250368,-0.020155,-0.445103,-0.652065,-0.152599,0.519418,-0.039115,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.779816,1.362368,0.974071,-0.968982,-0.777967,-1.318786,2.146313,2.015728,-0.026568,1.033285,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.967393,-1.084488,-1.132713,0.468246,0.737657,-0.833407,-1.211741,0.347784,0.42842,-1.295355,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [64]:
predictions = model.predict(xnew_pre)
predictions

array([[3262.06852985],
       [2575.        ],
       [3195.        ],
       [2895.        ],
       [2240.        ]])

### Save above in dataframe

In [65]:
xnew["Weight_pred"] = predictions
xnew = xnew.round(2)

In [66]:
xnew.to_csv("Results.csv", index=False)

# Step 9 - Save the Preprocessor and Model

In [67]:
preprocessor

In [68]:
model

In [69]:
!uv add joblib

[2mResolved [1m53 packages[0m [2min 18ms[0m[0m
[2mAudited [1m47 packages[0m [2min 0.72ms[0m[0m


In [70]:
import joblib

# Save the preprocessor and model using joblib

In [71]:
joblib.dump(preprocessor, "preprocessor.joblib")
joblib.dump(model, "model.joblib")

['model.joblib']

### Loading the Preprocessor and Model from Files

In [72]:
p = joblib.load("preprocessor.joblib")
m = joblib.load("model.joblib")
# Load the preprocessor and model from the saved files

In [73]:
p

In [74]:
m

In [75]:
m.score(xtest, ytest)  # Evaluate the loaded model on the test data

0.9224328409227927