In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Load the dataset
df = pd.read_csv('dataset3.csv')
df.head()

Unnamed: 0,ID,KIDSDRIV,BIRTH,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,...,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG,URBANICITY
0,63581743,0,16MAR39,60.0,0,11.0,"$67,349",No,$0,z_No,...,Minivan,yes,"$4,461",2,No,3,$0,18.0,0,Highly Urban/ Urban
1,132761049,0,21JAN56,43.0,0,11.0,"$91,449",No,"$257,252",z_No,...,Minivan,yes,$0,0,No,0,$0,1.0,0,Highly Urban/ Urban
2,921317019,0,18NOV51,48.0,0,11.0,"$52,881",No,$0,z_No,...,Van,yes,$0,0,No,2,$0,10.0,0,Highly Urban/ Urban
3,727598473,0,05MAR64,35.0,1,10.0,"$16,039",No,"$124,191",Yes,...,z_SUV,no,"$38,690",2,No,3,$0,10.0,0,Highly Urban/ Urban
4,450221861,0,05JUN48,51.0,0,14.0,,No,"$306,251",Yes,...,Minivan,yes,$0,0,No,0,$0,6.0,0,Highly Urban/ Urban


In [7]:
# Check for missing values in the 'INCOME' column
missing_values = df['INCOME'].isna().sum()
print(f"Missing values in 'INCOME': {missing_values}")

Missing values in 'INCOME': 570


In [9]:
# Function to clean and convert columns to numeric
def clean_numeric_column(df, column):
    df[column] = df[column].replace(r'[\$,]', '', regex=True).astype(float)

# Columns to convert
columns_to_convert = ['INCOME', 'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM', 'CLM_AMT']

for column in columns_to_convert:
    clean_numeric_column(df, column)

# After cleaning, confirm that data types are correct
print(df.dtypes)

ID              int64
KIDSDRIV        int64
BIRTH          object
AGE           float64
HOMEKIDS        int64
YOJ           float64
INCOME        float64
PARENT1        object
HOME_VAL      float64
MSTATUS        object
GENDER         object
EDUCATION      object
OCCUPATION     object
TRAVTIME        int64
CAR_USE        object
BLUEBOOK      float64
TIF             int64
CAR_TYPE       object
RED_CAR        object
OLDCLAIM      float64
CLM_FREQ        int64
REVOKED        object
MVR_PTS         int64
CLM_AMT       float64
CAR_AGE       float64
CLAIM_FLAG      int64
URBANICITY     object
dtype: object


In [11]:
# Split into numerical and categorical columns
numerical_columns = ['KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'TRAVTIME', 'TIF', 'MVR_PTS', 'INCOME', 'HOME_VAL', 'BLUEBOOK', 'CLM_AMT']
categorical_columns = ['PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION']

# Ensure the lists are correctly defined
print("Numerical columns:", numerical_columns)
print("Categorical columns:", categorical_columns)

Numerical columns: ['KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'TRAVTIME', 'TIF', 'MVR_PTS', 'INCOME', 'HOME_VAL', 'BLUEBOOK', 'CLM_AMT']
Categorical columns: ['PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION']


In [13]:
correlation_matrix = df[numerical_columns].corr()

# Print correlation values for INCOME
print(correlation_matrix['INCOME'].sort_values(ascending=False))

INCOME      1.000000
HOME_VAL    0.578943
BLUEBOOK    0.429095
YOJ         0.293899
AGE         0.186202
TIF         0.002361
KIDSDRIV   -0.043217
TRAVTIME   -0.051853
CLM_AMT    -0.057686
MVR_PTS    -0.063495
HOMEKIDS   -0.160230
Name: INCOME, dtype: float64


In [15]:
print(df.groupby('PARENT1')['INCOME'].mean())

print(df.groupby('MSTATUS')['INCOME'].mean())

print(df.groupby('GENDER')['INCOME'].mean())

print(df.groupby('EDUCATION')['INCOME'].mean())

print(df.groupby('CAR_USE')['INCOME'].mean())

print(df.groupby('CAR_TYPE')['INCOME'].mean())

print(df.groupby('RED_CAR')['INCOME'].mean())

print(df.groupby('REVOKED')['INCOME'].mean())

print(df.groupby('URBANICITY')['INCOME'].mean())

print(df.groupby('OCCUPATION')['INCOME'].mean())

PARENT1
No     62809.576091
Yes    53363.789804
Name: INCOME, dtype: float64
MSTATUS
Yes     60185.556471
z_No    63640.134409
Name: INCOME, dtype: float64
GENDER
M      67409.138605
z_F    56547.509560
Name: INCOME, dtype: float64
EDUCATION
<High School      25642.945186
Bachelors         65355.808751
Masters           86000.752894
PhD              126402.259091
z_High School     38464.268363
Name: INCOME, dtype: float64
CAR_USE
Commercial    67138.787608
Private       58328.369003
Name: INCOME, dtype: float64
CAR_TYPE
Minivan        64754.473498
Panel Truck    96891.474785
Pickup         55532.099941
Sports Car     47683.716216
Van            80928.987254
z_SUV          51286.572112
Name: INCOME, dtype: float64
RED_CAR
no     59557.535327
yes    66532.093561
Name: INCOME, dtype: float64
REVOKED
No     61979.447048
Yes    58664.596154
Name: INCOME, dtype: float64
URBANICITY
Highly Urban/ Urban      66337.788812
z_Highly Rural/ Rural    42405.929309
Name: INCOME, dtype: float64
OCCUPAT

In [19]:
#Select the relevant numerical and categorical features for imputation
features_for_imputation = ['YOJ', 'INCOME', 'HOME_VAL', 'BLUEBOOK', 'AGE', 'PARENT1', 
                           'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE', 'CAR_TYPE', 
                           'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION']

# Copy the relevant columns into a new DataFrame
df_impute = df[features_for_imputation].copy()

In [21]:
# Encode categorical variables
label_encoders = {}
for col in ['PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE', 
            'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION']:
    le = LabelEncoder()
    df_impute[col] = le.fit_transform(df_impute[col].astype(str))
    label_encoders[col] = le

In [23]:
# Initialize the KNN imputer
knn_imputer = KNNImputer(n_neighbors=5)

In [25]:
# Apply KNN imputation to the data
df_imputed = knn_imputer.fit_transform(df_impute)

In [31]:
# Replace the original INCOME column in the original DataFrame with the imputed values
df['INCOME'] = df_imputed[:, 0]

In [45]:
# Replace the original missing column in the original DataFrame with the imputed values
df['YOJ'] = df_imputed[:, 0]
df['INCOME'] = df_imputed[:, 0]
df['HOME_VAL'] = df_imputed[:, 0]
df['AGE'] = df_imputed[:, 0]

In [33]:
# Check if missing values are imputed
print(df['INCOME'].isnull().sum())

0


In [49]:
# Save the imputed dataset to a CSV file
df.to_csv('KNNimputed_INCOME_dataset3.csv', index=False)

In [35]:
# Create a copy of the original data and introduce random missing values
df_test = df.copy()

# Define the fraction of data to randomly remove for testing (e.g., 20%)
missing_fraction = 0.2
np.random.seed(42)  # For reproducibility

# Randomly select 20% of the non-missing INCOME values to be set as NaN for testing
non_null_indices = df_test[df_test['INCOME'].notnull()].index
test_indices = np.random.choice(non_null_indices, size=int(missing_fraction * len(non_null_indices)), replace=False)
original_values = df_test.loc[test_indices, 'INCOME']  # Store the original values for comparison
df_test.loc[test_indices, 'INCOME'] = np.nan  # Introduce missing values

In [37]:
# Impute the missing values using KNN (reusing the previous code)
# Make sure you use the same feature selection and encoding as before
df_impute_test = df_test[features_for_imputation].copy()

# Apply the label encoding for categorical features
for col in ['PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE', 
            'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION']:
    le = label_encoders[col]
    df_impute_test[col] = le.transform(df_impute_test[col].astype(str))

# Apply KNN imputer to the test data
df_imputed_test = knn_imputer.fit_transform(df_impute_test)

In [38]:
# Compare the imputed values with the original values
# Get the imputed INCOME values
imputed_values = df_imputed_test[test_indices, 0]

In [41]:
# Calculate the mean squared error (or any other metric) between original and imputed values
mse = mean_squared_error(original_values, imputed_values)
print(f'Mean Squared Error for INCOME imputation: {mse}')

# Mean Absolute Error (MAE)
mae = mean_absolute_error(original_values, imputed_values)
print(f'Mean Absolute Error for INCOME imputation: {mae}')

# R-squared (R²)
r2 = r2_score(original_values, imputed_values)
print(f'R-squared (R²): {r2}')

Mean Squared Error for INCOME imputation: 0.48945631067961165
Mean Absolute Error for INCOME imputation: 0.11466019417475727
R-squared (R²): 0.9692846461532667


MSE of 0.489 suggests that, on average, the value is relatively low meaning the imputation predictions are close to the true values, with small errors.                  
MAE of 0.115 suggests that, on average, the predictions deviate from the actual values by about 0.115 units. This is a low error, signifying good predictive performance.                    
R² of 0.969 indicates that about 96.9% of the variance in the actual INCOME values is explained by the imputed values. This suggests a highly accurate imputation model.