In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE


## EXPLORE THE DATASET

In [3]:
# Load the dataset
df = pd.read_csv('dataset3.csv')
df.head()

Unnamed: 0,ID,KIDSDRIV,BIRTH,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,...,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG,URBANICITY
0,63581743,0,16MAR39,60.0,0,11.0,"$67,349",No,$0,z_No,...,Minivan,yes,"$4,461",2,No,3,$0,18.0,0,Highly Urban/ Urban
1,132761049,0,21JAN56,43.0,0,11.0,"$91,449",No,"$257,252",z_No,...,Minivan,yes,$0,0,No,0,$0,1.0,0,Highly Urban/ Urban
2,921317019,0,18NOV51,48.0,0,11.0,"$52,881",No,$0,z_No,...,Van,yes,$0,0,No,2,$0,10.0,0,Highly Urban/ Urban
3,727598473,0,05MAR64,35.0,1,10.0,"$16,039",No,"$124,191",Yes,...,z_SUV,no,"$38,690",2,No,3,$0,10.0,0,Highly Urban/ Urban
4,450221861,0,05JUN48,51.0,0,14.0,,No,"$306,251",Yes,...,Minivan,yes,$0,0,No,0,$0,6.0,0,Highly Urban/ Urban


In [4]:
# Check for missing values in the dataset
df.isna().sum()

ID              0
KIDSDRIV        0
BIRTH           0
AGE             7
HOMEKIDS        0
YOJ           548
INCOME        570
PARENT1         0
HOME_VAL      575
MSTATUS         0
GENDER          0
EDUCATION       0
OCCUPATION    665
TRAVTIME        0
CAR_USE         0
BLUEBOOK        0
TIF             0
CAR_TYPE        0
RED_CAR         0
OLDCLAIM        0
CLM_FREQ        0
REVOKED         0
MVR_PTS         0
CLM_AMT         0
CAR_AGE       639
CLAIM_FLAG      0
URBANICITY      0
dtype: int64

## CLEAN THE DATASET

In [6]:
# Function to clean and convert columns to numeric
def clean_numeric_column(df, column):
    df[column] = df[column].replace(r'[\$,]', '', regex=True).astype(float)

# Columns to convert
columns_to_convert = ['INCOME', 'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM', 'CLM_AMT']

for column in columns_to_convert:
    clean_numeric_column(df, column)

# After cleaning, confirm that data types are correct
print(df.dtypes)

ID              int64
KIDSDRIV        int64
BIRTH          object
AGE           float64
HOMEKIDS        int64
YOJ           float64
INCOME        float64
PARENT1        object
HOME_VAL      float64
MSTATUS        object
GENDER         object
EDUCATION      object
OCCUPATION     object
TRAVTIME        int64
CAR_USE        object
BLUEBOOK      float64
TIF             int64
CAR_TYPE       object
RED_CAR        object
OLDCLAIM      float64
CLM_FREQ        int64
REVOKED        object
MVR_PTS         int64
CLM_AMT       float64
CAR_AGE       float64
CLAIM_FLAG      int64
URBANICITY     object
dtype: object


In [7]:
# Split into numerical and categorical columns
numerical_columns = ['KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'TRAVTIME', 'TIF', 'MVR_PTS', 'INCOME', 'HOME_VAL', 'BLUEBOOK', 'CLM_AMT']
categorical_columns = ['PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION']

# Filter out missing columns
numerical_columns = [col for col in numerical_columns if col in df.columns]
categorical_columns = [col for col in categorical_columns if col in df.columns]

# Ensure the lists are correctly defined
print("Numerical columns:", numerical_columns)
print("Categorical columns:", categorical_columns)

Numerical columns: ['KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'TRAVTIME', 'TIF', 'MVR_PTS', 'INCOME', 'HOME_VAL', 'BLUEBOOK', 'CLM_AMT']
Categorical columns: ['PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION']


## PREPARE THE DATASET

In [9]:
#Select the relevant numerical and categorical features for imputation
features_for_imputation = ['YOJ', 'INCOME', 'HOME_VAL', 'BLUEBOOK', 'AGE', 'PARENT1', 
                           'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE', 'CAR_TYPE', 
                           'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION', 'CAR_AGE']

# Copy the relevant columns into a new DataFrame
df_impute = df[features_for_imputation].copy()

In [10]:
# Encode categorical variables
label_encoders = {}
for col in ['PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE', 
            'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY', 'OCCUPATION']:
    le = LabelEncoder()
    df_impute[col] = le.fit_transform(df_impute[col].astype(str))
    label_encoders[col] = le

## TRAIN KNN MODEL

In [12]:
# Initialize the KNN imputer
knn_imputer = KNNImputer(n_neighbors=5)

In [13]:
# Apply KNN imputation to the data
df_imputed = knn_imputer.fit_transform(df_impute)

## IMPUTE AGE COLUMN

In [15]:
# Replace the original AGE column in the original DataFrame with the imputed values
df['AGE'] = df_imputed[:, 0]

In [16]:
# Check if missing values are imputed
print(df['AGE'].isnull().sum())

0


## IMPUTE YOJ COLUMN

In [18]:
# Split into complete and missing data
df_complete_YOJ = df[df['YOJ'].notnull()].copy()
df_missing_YOJ = df[df['YOJ'].isnull()].copy()

# Separate the target variable from features
y_complete_YOJ = df_complete_YOJ['YOJ']
X_complete_YOJ = df_complete_YOJ.drop(columns=['YOJ'])
X_missing_YOJ = df_missing_YOJ.drop(columns=['YOJ'])

# Filter out columns not in X_complete
numerical_columns_YOJ = [col for col in numerical_columns if col in X_complete_YOJ.columns]
categorical_columns_YOJ = [col for col in categorical_columns if col in X_complete_YOJ.columns]

# Prepare the column transformer to handle categorical and numerical data
preprocessor_YOJ = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_columns_YOJ),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_columns_YOJ)
    ]
)

# Define the pipeline with RandomForestRegressor
pipeline_impute_YOJ = Pipeline(steps=[
    ('preprocessor', preprocessor_YOJ),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [19]:
# Define parameter grid for hyperparameter tuning
param_grid_YOJ = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 10, 20, 30]
}

# Perform GridSearchCV
grid_search_YOJ = GridSearchCV(pipeline_impute_YOJ, param_grid_YOJ, cv=5, scoring='neg_mean_squared_error')
grid_search_YOJ.fit(X_complete_YOJ, y_complete_YOJ)

# Use the best model to predict missing values
best_pipeline_YOJ = grid_search_YOJ.best_estimator_
y_predicted_YOJ = best_pipeline_YOJ.predict(X_missing_YOJ)

# Fill missing values
df.loc[df['YOJ'].isnull(), 'YOJ'] = y_predicted_YOJ

# Check if missing values are filled
print(f"Remaining missing values in 'YOJ': {df['YOJ'].isnull().sum()}")

Remaining missing values in 'YOJ': 0


## IMPUTE INCOME COLUMN

In [21]:
# Replace the original INCOME column in the original DataFrame with the imputed values
df['INCOME'] = df_imputed[:, 0]

In [22]:
# Check if missing values are imputed
print(df['INCOME'].isnull().sum())

0


## IMPUTE HOME_VAL COLUMN

In [24]:
# Replace the original HOME_VAL column in the original DataFrame with the imputed values
df['HOME_VAL'] = df_imputed[:, 0]

In [25]:
# Check if missing values are imputed
print(df['HOME_VAL'].isnull().sum())

0


## IMPUTE OCCUPTATION COLUMN

In [27]:
# Split the data into rows with known and unknown 'OCCUPATION'
train_data_OCCUPATION = df_impute[df['OCCUPATION'].notnull()]
test_data_OCCUPATION = df_impute[df['OCCUPATION'].isnull()]

# Ensure there are rows in X_test for prediction
if test_data_OCCUPATION.empty:
    print("No missing 'OCCUPATION' values to predict.")
else:
    # Select features for training (drop 'OCCUPATION' only from features)
    X_train_OCCUPATION = train_data_OCCUPATION.drop(['OCCUPATION'], axis=1)
    y_train_OCCUPATION = train_data_OCCUPATION['OCCUPATION']

    X_test_OCCUPATION = test_data_OCCUPATION.drop(['OCCUPATION'], axis=1)

    # Impute missing values (use 'most_frequent' for categorical, 'mean' for numerical)
    imputer_OCCUPATION = SimpleImputer(strategy='most_frequent')
    X_train_OCCUPATION = pd.DataFrame(imputer_OCCUPATION.fit_transform(X_train_OCCUPATION), columns=X_train_OCCUPATION.columns)
    X_test_OCCUPATION = pd.DataFrame(imputer_OCCUPATION.transform(X_test_OCCUPATION), columns=X_test_OCCUPATION.columns)

    # Handle Class Imbalance with SMOTE
    smote_OCCUPATION = SMOTE(random_state=42)
    X_train_resampled_OCCUPATION, y_train_resampled_OCCUPATION = smote_OCCUPATION.fit_resample(X_train_OCCUPATION, y_train_OCCUPATION)


In [28]:
# Hyperparameter tuning using RandomizedSearchCV
param_grid_OCCUPATION = {
    'n_estimators': [100, 200, 500],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Create a RandomForestClassifier with randomized search
clf_OCCUPATION = RandomForestClassifier(random_state=42)
random_search_OCCUPATION = RandomizedSearchCV(clf_OCCUPATION, param_distributions=param_grid_OCCUPATION, n_iter=10, cv=3, random_state=42, n_jobs=-1)
random_search_OCCUPATION.fit(X_train_resampled_OCCUPATION, y_train_resampled_OCCUPATION)

 # Feature Selection based on importance
selector_OCCUPATION = SelectFromModel(random_search_OCCUPATION.best_estimator_, threshold='median')
X_train_selected_OCCUPATION = selector_OCCUPATION.fit_transform(X_train_resampled_OCCUPATION, y_train_resampled_OCCUPATION)
X_test_selected_OCCUPATION = selector_OCCUPATION.transform(X_test_OCCUPATION)


In [29]:
# Train the best model on the selected features
best_clf_OCCUPATION = random_search_OCCUPATION.best_estimator_
best_clf_OCCUPATION.fit(X_train_selected_OCCUPATION, y_train_resampled_OCCUPATION)

# Predict missing 'OCCUPATION' values for the test data
predicted_occupation_encoded = best_clf_OCCUPATION.predict(X_test_selected_OCCUPATION)

# Convert the predicted encoded values back to their original categorical values
le_occupation = label_encoders['OCCUPATION']  # Retrieve the label encoder for 'OCCUPATION'
predicted_occupation = le_occupation.inverse_transform(predicted_occupation_encoded)

# Check if the number of predicted values matches the number of missing rows
missing_indices = df[df['OCCUPATION'].isnull()].index

if len(missing_indices) != len(predicted_occupation):
    raise ValueError(f"Length mismatch: {len(missing_indices)} missing values but {len(predicted_occupation)} predictions.")

# Replace the missing 'OCCUPATION' values in the original dataframe
df.loc[missing_indices, 'OCCUPATION'] = predicted_occupation

# Check if missing values are imputed
print(df['OCCUPATION'].isnull().sum())

0


## IMPUTE CAR_AGE COLUMN

In [31]:
# Replace the original CAR_AGE column in the original DataFrame with the imputed values
df['CAR_AGE'] = df_imputed[:, 0]

In [32]:
# Check if missing values are imputed
print(df['CAR_AGE'].isnull().sum())

0


## IMPUTE CLM_AMT COLUMN

In [34]:
# Split the data
df_train_CLMAMT = df[df['CLM_AMT'] > 0].copy()
df_predict_CLMAMT = df[df['CLM_AMT'] == 0].copy()

# Define features (X) and target (y) for training
X_train_CLMAMT = df_train_CLMAMT[numerical_columns + categorical_columns]  # Ensure columns used in features exist
y_train_CLMAMT = df_train_CLMAMT['CLM_AMT']

X_predict_CLMAMT = df_predict_CLMAMT[numerical_columns + categorical_columns]  # Ensure columns used in prediction exist

In [35]:
# Preprocessing
preprocessor_CLMAMT = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_columns),
        
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_columns)
    ]
)

# Step 3: Create and fit the model pipeline
model_pipeline_CLMAMT = Pipeline(steps=[
    ('preprocessor', preprocessor_CLMAMT),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Fit the model on the training set
model_pipeline_CLMAMT.fit(X_train_CLMAMT, y_train_CLMAMT)

In [36]:
# Predict CLM_AMT for rows where it was originally 0
y_predicted_CLMAMT = model_pipeline_CLMAMT.predict(X_predict_CLMAMT)

# Fill the $0 CLM_AMT values with the predicted values
df.loc[df['CLM_AMT'] == 0, 'CLM_AMT'] = y_predicted_CLMAMT

# Check that there are no more $0 values
zero_claims_after = df[df['CLM_AMT'] == 0].shape[0]
print(f"Number of $0 claim amounts after replacement: {zero_claims_after}")

Number of $0 claim amounts after replacement: 0


## MISSING COLUMNS IN THE DATASET

In [38]:
# Check for missing values in the dataset
df.isna().sum()

ID            0
KIDSDRIV      0
BIRTH         0
AGE           0
HOMEKIDS      0
YOJ           0
INCOME        0
PARENT1       0
HOME_VAL      0
MSTATUS       0
GENDER        0
EDUCATION     0
OCCUPATION    0
TRAVTIME      0
CAR_USE       0
BLUEBOOK      0
TIF           0
CAR_TYPE      0
RED_CAR       0
OLDCLAIM      0
CLM_FREQ      0
REVOKED       0
MVR_PTS       0
CLM_AMT       0
CAR_AGE       0
CLAIM_FLAG    0
URBANICITY    0
dtype: int64

## STANDARDISE THE DATASET

In [40]:
#Convert 'MALE' to 0 and 'FEMALE' to 1

df['GENDER'] = df['GENDER'].map({'M': 0, 'z_F': 1})

In [41]:
print(df['GENDER'].head(20))

0     0
1     0
2     0
3     1
4     0
5     1
6     1
7     1
8     0
9     1
10    0
11    1
12    0
13    1
14    1
15    0
16    0
17    0
18    1
19    1
Name: GENDER, dtype: int64


In [42]:
#Mapping of existing values to standardized values
education_mapping = {
    'z_High School': 'High School',
    'PhD': 'PhD',
    'Bachelors': 'Bachelor',
    '<High School': 'High School',
    'Masters': 'Masters',
}

#Replace the values in the EDUCATION column (L)
df['EDUCATION'] = df['EDUCATION'].replace(education_mapping) 
print (" Education Standarised")

 Education Standarised


In [43]:
print(df['EDUCATION'].head(20))

0             PhD
1     High School
2        Bachelor
3     High School
4     High School
5             PhD
6        Bachelor
7     High School
8     High School
9     High School
10       Bachelor
11       Bachelor
12       Bachelor
13        Masters
14        Masters
15       Bachelor
16            PhD
17    High School
18    High School
19       Bachelor
Name: EDUCATION, dtype: object


In [44]:
# Strip leading/trailing whitespace and standardize to title case
df['OCCUPATION'] = df['OCCUPATION'].str.strip().str.title()

# Replace specific values with the standardized terms
df['OCCUPATION'] = df['OCCUPATION'].replace({
    'Z_Blue Collar': 'Blue Collar',
    'Manager': 'Manager',  # Remove leading space from ' Manager'
    'Professional': 'Professional',
    'Clerical': 'Clerical',
    'Doctor': 'Doctor'
})

print("Names standardized")
print(df['OCCUPATION'].head(20))

Names standardized
0     Professional
1      Blue Collar
2          Manager
3         Clerical
4      Blue Collar
5           Doctor
6      Blue Collar
7      Blue Collar
8          Manager
9      Blue Collar
10     Blue Collar
11        Clerical
12    Professional
13          Lawyer
14    Professional
15         Manager
16         Manager
17        Clerical
18      Home Maker
19        Clerical
Name: OCCUPATION, dtype: object


## Drop Irrelevant columns

In [46]:
columns_to_drop = ['ID', 'KIDSDRIV', 'BIRTH', 'HOMEKIDS', 'YOJ', 'INCOME', 'PARENT1', 'HOME_VAL', 
                   'MSTATUS', 'GENDER', 'OCCUPATION', 'TRAVTIME', 'CAR_USE', 'BLUEBOOK', 'TIF', 
                   'CAR_TYPE', 'RED_CAR', 'OLDCLAIM', 'CLM_FREQ', 'MVR_PTS', 'CLAIM_FLAG', 'URBANICITY']

In [47]:
# Drop the columns from the dataframe
df = df.drop(columns=columns_to_drop)

In [48]:
# Verify if the columns are dropped
print(df.head())

    AGE    EDUCATION REVOKED  CLM_AMT  CAR_AGE
0  11.0          PhD      No    91.65     11.0
1  11.0  High School      No    83.07     11.0
2  11.0     Bachelor      No    87.05     11.0
3  10.0  High School      No   100.65     10.0
4  14.0  High School      No    80.45     14.0


## ENRICHING

In [86]:
# Rename the 'REVOKED' column to 'FRAUD'
df.rename(columns={'REVOKED': 'FRAUD'}, inplace=True)

In [88]:
# Convert 'yes' to 1 and 'no' to 0 in the 'FRAUD' column, accounting for case and stripping spaces
df['FRAUD'] = df['FRAUD'].str.strip().str.lower().map({'yes': 1, 'no': 0})

# Verify the changes
print(df[['FRAUD']].head())

   FRAUD
0      0
1      0
2      0
3      0
4      0


In [90]:
# Verify final columns
print(df.head())

    AGE    EDUCATION  FRAUD  CLM_AMT  CAR_AGE
0  11.0          PhD      0    91.65     11.0
1  11.0  High School      0    83.07     11.0
2  11.0     Bachelor      0    87.05     11.0
3  10.0  High School      0   100.65     10.0
4  14.0  High School      0    80.45     14.0


## FINAL DATASET

In [88]:
# Save the imputed dataset to a CSV file
df.to_csv('clean_dataset3_.csv', index=False)