# Clean Dataset 3 'HOME_VAL' Column using KNN

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## EXPLORE THE DATASET

In [2]:
# Load the dataset
# 'dataset3.csv' is a relative path, so it is resolved against the kernel's
# current working directory.
df = pd.read_csv('dataset3.csv')
# Last expression of the cell: renders a preview of the first rows.
df.head()

Unnamed: 0,ID,KIDSDRIV,BIRTH,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,...,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG,URBANICITY
0,63581743,0,16MAR39,60.0,0,11.0,"$67,349",No,$0,z_No,...,Minivan,yes,"$4,461",2,No,3,$0,18.0,0,Highly Urban/ Urban
1,132761049,0,21JAN56,43.0,0,11.0,"$91,449",No,"$257,252",z_No,...,Minivan,yes,$0,0,No,0,$0,1.0,0,Highly Urban/ Urban
2,921317019,0,18NOV51,48.0,0,11.0,"$52,881",No,$0,z_No,...,Van,yes,$0,0,No,2,$0,10.0,0,Highly Urban/ Urban
3,727598473,0,05MAR64,35.0,1,10.0,"$16,039",No,"$124,191",Yes,...,z_SUV,no,"$38,690",2,No,3,$0,10.0,0,Highly Urban/ Urban
4,450221861,0,05JUN48,51.0,0,14.0,,No,"$306,251",Yes,...,Minivan,yes,$0,0,No,0,$0,6.0,0,Highly Urban/ Urban


In [7]:
# Count the rows whose 'HOME_VAL' entry is null
home_val_is_missing = df['HOME_VAL'].isna()
missing_values = home_val_is_missing.sum()
print(f"Missing values in 'HOME_VAL': {missing_values}")

Missing values in 'HOME_VAL': 575


## CLEAN THE DATASET

In [9]:
# Helper that turns a currency-formatted column into floats
def clean_numeric_column(df, column):
    """Strip '$' and ',' characters from ``df[column]`` and cast it to float.

    The column is overwritten on the DataFrame that is passed in; nothing is
    returned. Missing entries stay missing (NaN) after the cast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; modified in place.
    column : str
        Label of the column to convert.
    """
    without_symbols = df[column].replace(to_replace=r'[\$,]', value='', regex=True)
    df[column] = without_symbols.astype(float)

# Columns to convert
# These are passed one at a time to clean_numeric_column, which strips '$'
# and ',' and casts the column to float.
columns_to_convert = ['INCOME', 'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM', 'CLM_AMT']

# clean_numeric_column overwrites each column on `df` itself, so after this
# loop the original string values are no longer available on `df`.
for column in columns_to_convert:
    clean_numeric_column(df, column)

# After cleaning, confirm that data types are correct
# (the converted columns should now be listed as float64)
print(df.dtypes)

ID              int64
KIDSDRIV        int64
BIRTH          object
AGE           float64
HOMEKIDS        int64
YOJ           float64
INCOME        float64
PARENT1        object
HOME_VAL      float64
MSTATUS        object
GENDER         object
EDUCATION      object
OCCUPATION     object
TRAVTIME        int64
CAR_USE        object
BLUEBOOK      float64
TIF             int64
CAR_TYPE       object
RED_CAR        object
OLDCLAIM      float64
CLM_FREQ        int64
REVOKED        object
MVR_PTS         int64
CLM_AMT       float64
CAR_AGE       float64
CLAIM_FLAG      int64
URBANICITY     object
dtype: object


In [11]:
# Split into numerical and categorical columns
numerical_columns = ['KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'TRAVTIME', 'TIF', 'MVR_PTS', 'INCOME', 'HOME_VAL', 'BLUEBOOK', 'CLM_AMT']
categorical_columns = ['PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION']

# Ensure the lists are correctly defined
print("Numerical columns:", numerical_columns)
print("Categorical columns:", categorical_columns)

Numerical columns: ['KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'TRAVTIME', 'TIF', 'MVR_PTS', 'INCOME', 'HOME_VAL', 'BLUEBOOK', 'CLM_AMT']
Categorical columns: ['PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION']


## CHECK FOR CORRELATIONS IN THE DATASET

In [13]:
# Pairwise correlations between the columns listed in numerical_columns
correlation_matrix = df[numerical_columns].corr()

# Print correlation values for HOME_VAL, largest first
home_val_correlations = correlation_matrix['HOME_VAL']
print(home_val_correlations.sort_values(ascending=False))

HOME_VAL    1.000000
INCOME      0.578943
YOJ         0.272198
BLUEBOOK    0.261962
AGE         0.209600
TIF         0.006768
KIDSDRIV   -0.015447
TRAVTIME   -0.035139
CLM_AMT    -0.091987
MVR_PTS    -0.092460
HOMEKIDS   -0.107483
Name: HOME_VAL, dtype: float64


In [15]:
# Mean HOME_VAL within each level of every categorical feature, printed one
# feature at a time. categorical_columns lists the same ten columns, in the
# same order, that were previously written out as ten separate print calls,
# so the output is unchanged.
for categorical_column in categorical_columns:
    print(df.groupby(categorical_column)['HOME_VAL'].mean())

PARENT1
No     167079.628061
Yes     71209.816327
Name: HOME_VAL, dtype: float64
MSTATUS
Yes     202361.132043
z_No     82679.720165
Name: HOME_VAL, dtype: float64
GENDER
M      165575.233786
z_F    145059.032061
Name: HOME_VAL, dtype: float64
EDUCATION
<High School      93695.631726
Bachelors        166469.271743
Masters          195603.522657
PhD              245208.278281
z_High School    116936.443927
Name: HOME_VAL, dtype: float64
CAR_USE
Commercial    160285.722191
Private       151196.405221
Name: HOME_VAL, dtype: float64
CAR_TYPE
Minivan        160078.843482
Panel Truck    212815.958491
Pickup         144873.093750
Sports Car     132559.549550
Van            185127.563006
z_SUV          137454.942512
Name: HOME_VAL, dtype: float64
RED_CAR
no     152008.251372
yes    160728.800713
Name: HOME_VAL, dtype: float64
REVOKED
No     156689.487516
Yes    139069.719064
Name: HOME_VAL, dtype: float64
URBANICITY
Highly Urban/ Urban      162108.249195
z_Highly Rural/ Rural    124579.491353
