# Imputing Car_Age using Support Vector Regression

In [83]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## Explore the Dataset

This section loads and inspects dataset 3 and checks for columns with missing values that could be imputed using support vector regression (SVR). Because SVR predicts continuous numerical variables, the target column has to be numeric. Of the columns with missing values, 'INCOME', 'CAR_AGE', 'HOME_VAL', 'YOJ', and 'AGE' are all numeric; for this method 'CAR_AGE' was chosen as the candidate for SVR imputation.

In [35]:
# Read dataset 3 into a DataFrame and preview its first five rows
df = pd.read_csv('dataset3.csv')
df.head(5)

Unnamed: 0,ID,KIDSDRIV,BIRTH,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,...,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG,URBANICITY
0,63581743,0,16MAR39,60.0,0,11.0,"$67,349",No,$0,z_No,...,Minivan,yes,"$4,461",2,No,3,$0,18.0,0,Highly Urban/ Urban
1,132761049,0,21JAN56,43.0,0,11.0,"$91,449",No,"$257,252",z_No,...,Minivan,yes,$0,0,No,0,$0,1.0,0,Highly Urban/ Urban
2,921317019,0,18NOV51,48.0,0,11.0,"$52,881",No,$0,z_No,...,Van,yes,$0,0,No,2,$0,10.0,0,Highly Urban/ Urban
3,727598473,0,05MAR64,35.0,1,10.0,"$16,039",No,"$124,191",Yes,...,z_SUV,no,"$38,690",2,No,3,$0,10.0,0,Highly Urban/ Urban
4,450221861,0,05JUN48,51.0,0,14.0,,No,"$306,251",Yes,...,Minivan,yes,$0,0,No,0,$0,6.0,0,Highly Urban/ Urban


In [37]:
# Number of missing (NaN) entries in each column
df.isna().sum()

ID              0
KIDSDRIV        0
BIRTH           0
AGE             7
HOMEKIDS        0
YOJ           548
INCOME        570
PARENT1         0
HOME_VAL      575
MSTATUS         0
GENDER          0
EDUCATION       0
OCCUPATION    665
TRAVTIME        0
CAR_USE         0
BLUEBOOK        0
TIF             0
CAR_TYPE        0
RED_CAR         0
OLDCLAIM        0
CLM_FREQ        0
REVOKED         0
MVR_PTS         0
CLM_AMT         0
CAR_AGE       639
CLAIM_FLAG      0
URBANICITY      0
dtype: int64

In [39]:
# Count how many 'CAR_AGE' entries are missing (NaN) and report the total
car_age_is_missing = df['CAR_AGE'].isna()
missing_values = car_age_is_missing.sum()
print(f"Missing values in 'CAR_AGE': {missing_values}")

Missing values in 'CAR_AGE': 639


## Clean the Dataset

Before training the model, the dataset is prepared by converting the currency-formatted text columns (for example "$67,349") to numeric values, as the SVR model works with numeric data. The feature columns are then split into numerical and categorical groups.

In [88]:
# Function to clean and convert columns to numeric
def clean_numeric_column(df, column):
    """Strip '$' and ',' from ``df[column]`` and store the result back as float.

    The frame is modified in place and nothing is returned. Missing entries
    stay missing (NaN) after the conversion.
    """
    without_symbols = df[column].replace(r'[\$,]', '', regex=True)
    df[column] = without_symbols.astype(float)

# Columns whose '$' and ',' characters are stripped before casting to float
columns_to_convert = [
    'INCOME',
    'HOME_VAL',
    'BLUEBOOK',
    'OLDCLAIM',
    'CLM_AMT',
]

# Clean every listed column in place on df
for column in columns_to_convert:
    clean_numeric_column(df, column)

# Show the resulting dtypes to confirm the conversion worked
print(df.dtypes)

ID              int64
KIDSDRIV        int64
BIRTH          object
AGE           float64
HOMEKIDS        int64
YOJ           float64
INCOME        float64
PARENT1        object
HOME_VAL      float64
MSTATUS        object
GENDER         object
EDUCATION      object
OCCUPATION     object
TRAVTIME        int64
CAR_USE        object
BLUEBOOK      float64
TIF             int64
CAR_TYPE       object
RED_CAR        object
OLDCLAIM      float64
CLM_FREQ        int64
REVOKED        object
MVR_PTS         int64
CLM_AMT       float64
CAR_AGE       float64
CLAIM_FLAG      int64
URBANICITY     object
dtype: object


In [48]:
# Feature columns, grouped by the kind of preprocessing they receive
numerical_columns = [
    'KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'TRAVTIME', 'TIF',
    'MVR_PTS', 'INCOME', 'HOME_VAL', 'BLUEBOOK', 'CLM_AMT',
]
categorical_columns = [
    'PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE',
    'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION',
]
# NOTE(review): ID, BIRTH, OLDCLAIM, CLM_FREQ and CLAIM_FLAG appear in neither
# list, so a ColumnTransformer built from these lists with its default
# remainder='drop' will not pass them to the model. Confirm that leaving out
# OLDCLAIM and CLM_FREQ is intentional.

# Echo both lists so the grouping can be checked by eye
for label, names in (
    ("Numerical columns:", numerical_columns),
    ("Categorical columns:", categorical_columns),
):
    print(label, names)

Numerical columns: ['KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'TRAVTIME', 'TIF', 'MVR_PTS', 'INCOME', 'HOME_VAL', 'BLUEBOOK', 'CLM_AMT']
Categorical columns: ['PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION']


## Train the Dataset

Prepare the dataset for modelling by separating the rows with missing 'CAR_AGE' from the rows where it is present, and splitting the complete rows into 'X_train' and 'y_train'. Then define the preprocessing steps for numerical and categorical data, and create and fit a pipeline on the training data.

In [51]:
# Separate rows with and without missing 'CAR_AGE'.
# The frame is loaded and cleaned under the name `df`; the name `dataset3` used
# here previously is never defined in this notebook, so the cell raised a
# NameError on a fresh kernel (Restart & Run All).
car_age_known = df['CAR_AGE'].notna()
df_train = df[car_age_known].copy()   # rows used to fit the model
df_test = df[~car_age_known].copy()   # rows whose 'CAR_AGE' will be imputed

# Separate features and target variable for the rows where 'CAR_AGE' is not missing
X_train = df_train.drop(columns=['CAR_AGE'])
y_train = df_train['CAR_AGE']

In [53]:
# Numerical features: fill gaps with the column median, then standardize
numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fitting are ignored at transform time
categorical_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own sub-pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_steps, numerical_columns),
        ('cat', categorical_steps, categorical_columns),
    ]
)

In [55]:
# RBF-kernel support vector regressor that predicts 'CAR_AGE'
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)

# Chain preprocessing and the regressor so both are fitted and applied together
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('svr', svr_model)])

# Fit on the rows whose 'CAR_AGE' is known
pipeline.fit(X_train, y_train)

## Impute the Missing Data

Using the trained pipeline, the missing 'CAR_AGE' values are predicted and written back into the original dataset 3.

In [69]:
# Features for the rows whose 'CAR_AGE' is missing: every column except the target
X_test = df_test.drop('CAR_AGE', axis=1)

In [71]:
# Predict 'CAR_AGE' for the rows where it is missing, using the fitted pipeline
# (preprocessing + SVR). One prediction is produced per row of X_test.
predicted_car_age = pipeline.predict(X_test)

In [73]:
# Write the predictions back into the original DataFrame using the row labels
# of df_test. Addressing the rows by label (rather than recomputing the
# isna() mask on the column being overwritten) keeps each prediction aligned
# with the row it was made for, and makes the cell safe to re-run: once the
# column is filled the mask would be empty and the assignment would fail.
df.loc[df_test.index, 'CAR_AGE'] = predicted_car_age

In [75]:
# Save the imputed dataset to a CSV file; index=False leaves the row index out of the file
df.to_csv('SVRimputed_carage_dataset3.csv', index=False)

## New Dataset

As the remaining missing-value counts below show, 'CAR_AGE' no longer has any missing values, meaning it has been successfully imputed with SVR.

In [78]:
# Per-column count of values that are still missing after the imputation
missing_values_after = df.isna().sum()

# Report only the columns that still have gaps
still_missing = missing_values_after.loc[missing_values_after.gt(0)]
print("Missing values after imputation:")
print(still_missing)

Missing values after imputation:
AGE             7
YOJ           548
INCOME        570
HOME_VAL      575
OCCUPATION    665
dtype: int64


## Accuracy Test

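Run accuracy tests to see how accurately SVR predicts 'CAR_AGE'. Because the true values of the imputed rows are unknown, the model is evaluated on the rows where 'CAR_AGE' is known.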

In [92]:
# In-sample check: score the fitted pipeline on the same rows it was trained on.
# NOTE(review): these are training-set metrics, not a held-out estimate; replace
# y_true / y_pred with a real test set if one becomes available.
y_true = df_train['CAR_AGE']
y_pred = pipeline.predict(X_train)

# Compute the three regression metrics first...
mae = mean_absolute_error(y_true, y_pred)   # average absolute error
mse = mean_squared_error(y_true, y_pred)    # average squared error
r2 = r2_score(y_true, y_pred)               # share of variance explained

# ...then report them
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'R-squared (R²): {r2}')


Mean Absolute Error (MAE): 3.0431850637074747
Mean Squared Error (MSE): 15.086744623070308
R-squared (R²): 0.5379472214369567


The MAE is 3.043, meaning that on average the predictions are about 3.04 years away from the actual values.  
The MSE is 15.087, meaning that the average squared difference between the predicted and actual 'CAR_AGE' is 15.087 years², so some predictions may be significantly off.  
The R² is about 53.8%, meaning the model captures a little more than half of the variance; the model has done a **moderate** job but can still be improved through fine-tuning or feature engineering. Note that these metrics were computed on the same rows the model was trained on, so they are likely optimistic compared with performance on unseen data.