# Clean Dataset 3 'CAR_AGE' using KNN

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## EXPLORE THE DATASET

In [2]:
# Load the dataset
df = pd.read_csv('dataset3.csv')
df.head()

Unnamed: 0,ID,KIDSDRIV,BIRTH,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,...,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG,URBANICITY
0,63581743,0,16MAR39,60.0,0,11.0,"$67,349",No,$0,z_No,...,Minivan,yes,"$4,461",2,No,3,$0,18.0,0,Highly Urban/ Urban
1,132761049,0,21JAN56,43.0,0,11.0,"$91,449",No,"$257,252",z_No,...,Minivan,yes,$0,0,No,0,$0,1.0,0,Highly Urban/ Urban
2,921317019,0,18NOV51,48.0,0,11.0,"$52,881",No,$0,z_No,...,Van,yes,$0,0,No,2,$0,10.0,0,Highly Urban/ Urban
3,727598473,0,05MAR64,35.0,1,10.0,"$16,039",No,"$124,191",Yes,...,z_SUV,no,"$38,690",2,No,3,$0,10.0,0,Highly Urban/ Urban
4,450221861,0,05JUN48,51.0,0,14.0,,No,"$306,251",Yes,...,Minivan,yes,$0,0,No,0,$0,6.0,0,Highly Urban/ Urban


In [5]:
# Check for missing values in the 'CAR_AGE' column
missing_values = df['CAR_AGE'].isna().sum()
print(f"Missing values in 'CAR_AGE': {missing_values}")

Missing values in 'CAR_AGE': 639


## CLEAN THE DATASET

In [7]:
# Function to clean and convert columns to numeric
def clean_numeric_column(df, column):
    df[column] = df[column].replace(r'[\$,]', '', regex=True).astype(float)

# Columns to convert
columns_to_convert = ['INCOME', 'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM', 'CLM_AMT']

for column in columns_to_convert:
    clean_numeric_column(df, column)

# After cleaning, confirm that data types are correct
print(df.dtypes)

ID              int64
KIDSDRIV        int64
BIRTH          object
AGE           float64
HOMEKIDS        int64
YOJ           float64
INCOME        float64
PARENT1        object
HOME_VAL      float64
MSTATUS        object
GENDER         object
EDUCATION      object
OCCUPATION     object
TRAVTIME        int64
CAR_USE        object
BLUEBOOK      float64
TIF             int64
CAR_TYPE       object
RED_CAR        object
OLDCLAIM      float64
CLM_FREQ        int64
REVOKED        object
MVR_PTS         int64
CLM_AMT       float64
CAR_AGE       float64
CLAIM_FLAG      int64
URBANICITY     object
dtype: object


In [13]:
# Split into numerical and categorical columns
numerical_columns = ['KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'TRAVTIME', 'TIF', 'MVR_PTS', 'INCOME', 'HOME_VAL', 'BLUEBOOK', 'CLM_AMT', 'CAR_AGE']
categorical_columns = ['PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION']

# Ensure the lists are correctly defined
print("Numerical columns:", numerical_columns)
print("Categorical columns:", categorical_columns)

Numerical columns: ['KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'TRAVTIME', 'TIF', 'MVR_PTS', 'INCOME', 'HOME_VAL', 'BLUEBOOK', 'CLM_AMT', 'CAR_AGE']
Categorical columns: ['PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION']


## CHECK FOR CORRELATIONS IN THE DATASET

In [15]:
correlation_matrix = df[numerical_columns].corr()

# Print correlation values for CAR_AGE
print(correlation_matrix['CAR_AGE'].sort_values(ascending=False))

CAR_AGE     1.000000
INCOME      0.417301
HOME_VAL    0.223198
BLUEBOOK    0.194366
AGE         0.178592
YOJ         0.064685
TIF         0.007897
TRAVTIME   -0.033290
MVR_PTS    -0.033898
KIDSDRIV   -0.059629
CLM_AMT    -0.066060
HOMEKIDS   -0.159008
Name: CAR_AGE, dtype: float64


In [17]:
# Group by categorical columns and print mean CAR_AGE
for col in categorical_columns:
    print(df.groupby(col)['CAR_AGE'].mean())

PARENT1
No     8.444458
Yes    7.317928
Name: CAR_AGE, dtype: float64
MSTATUS
Yes     8.134228
z_No    8.545431
Name: CAR_AGE, dtype: float64
GENDER
M      8.456444
z_F    8.162795
Name: CAR_AGE, dtype: float64
EDUCATION
<High School      3.433777
Bachelors         8.832829
Masters          14.103236
PhD              13.851852
z_High School     4.496224
Name: CAR_AGE, dtype: float64
CAR_USE
Commercial    7.810894
Private       8.584909
Name: CAR_AGE, dtype: float64
CAR_TYPE
Minivan         8.682213
Panel Truck    10.143939
Pickup          7.639640
Sports Car      7.693915
Van             8.980324
z_SUV           7.832903
Name: CAR_AGE, dtype: float64
RED_CAR
no     8.237229
yes    8.448066
Name: CAR_AGE, dtype: float64
REVOKED
No     8.341158
Yes    7.990725
Name: CAR_AGE, dtype: float64
URBANICITY
Highly Urban/ Urban      8.781003
z_Highly Rural/ Rural    6.383350
Name: CAR_AGE, dtype: float64
OCCUPATION
Clerical          4.809019
Doctor           13.996633
Home Maker        7.577121


## PREPARE THE DATASET

In [19]:
#Select the relevant numerical and categorical features for imputation
features_for_imputation = ['YOJ', 'INCOME', 'HOME_VAL', 'BLUEBOOK', 'AGE', 'PARENT1', 
                           'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE', 'CAR_TYPE', 
                           'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION', 'CAR_AGE']

# Copy the relevant columns into a new DataFrame
df_impute = df[features_for_imputation].copy()

In [21]:
# Encode categorical variables
label_encoders = {}
for col in ['PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE', 
            'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION']:
    le = LabelEncoder()
    df_impute[col] = le.fit_transform(df_impute[col].astype(str))
    label_encoders[col] = le

## TRAIN THE DATASET

In [23]:
# Initialize the KNN imputer
knn_imputer = KNNImputer(n_neighbors=5)

In [25]:
# Apply KNN imputation to the data
df_imputed = knn_imputer.fit_transform(df_impute)

In [26]:
# Replace the original CAR_AGE column in the original DataFrame with the imputed values
df['CAR_AGE'] = df_imputed[:, 0]

In [29]:
# Check if missing values are imputed
print(df['CAR_AGE'].isnull().sum())

0


In [49]:
# Save the imputed dataset to a CSV file
df.to_csv('KNNimputed_CARAGE_dataset3.csv', index=False)

## ACCURACY

In [31]:
# Create a copy of the original data and introduce random missing values
df_test = df.copy()

# Define the fraction of data to randomly remove for testing (e.g., 20%)
missing_fraction = 0.2
np.random.seed(42)  # For reproducibility

# Randomly select 20% of the non-missing CAR_AGE values to be set as NaN for testing
non_null_indices = df_test[df_test['CAR_AGE'].notnull()].index
test_indices = np.random.choice(non_null_indices, size=int(missing_fraction * len(non_null_indices)), replace=False)
original_values = df_test.loc[test_indices, 'CAR_AGE']  # Store the original values for comparison
df_test.loc[test_indices, 'CAR_AGE'] = np.nan  # Introduce missing values

In [33]:
# Impute the missing values using KNN (reusing the previous code)
# Make sure you use the same feature selection and encoding as before
df_impute_test = df_test[features_for_imputation].copy()

# Apply the label encoding for categorical features
for col in ['PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE', 
            'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION']:
    le = label_encoders[col]
    df_impute_test[col] = le.transform(df_impute_test[col].astype(str))

# Apply KNN imputer to the test data
df_imputed_test = knn_imputer.fit_transform(df_impute_test)

In [34]:
# Compare the imputed values with the original values
# Get the imputed CAR_AGE values
imputed_values = df_imputed_test[test_indices, 0]

In [37]:
# Calculate the mean squared error (or any other metric) between original and imputed values
mse = mean_squared_error(original_values, imputed_values)
print(f'Mean Squared Error for CAR_AGE imputation: {mse}')

# Mean Absolute Error (MAE)
mae = mean_absolute_error(original_values, imputed_values)
print(f'Mean Absolute Error for CAR_AGE imputation: {mae}')

# R-squared (R²)
r2 = r2_score(original_values, imputed_values)
print(f'R-squared (R²): {r2}')

Mean Squared Error for CAR_AGE imputation: 0.0024660194174757274
Mean Absolute Error for CAR_AGE imputation: 0.002038834951456311
R-squared (R²): 0.9998452503032308


MSE of 0.002 suggests that, on average, the value is relatively low meaning the imputation predictions are close to the true values, with small errors.                  
MAE of  0.002 suggests that, on average, the predictions deviate from the actual values by about 0.115 units. This is a low error, signifying good predictive performance.                    
R² of 0.999 indicates that about 99.98% of the variance in the actual INCOME values is explained by the imputed values. This suggests a highly accurate imputation model.