# Cleaning Dataset 3 'OCCUPATION' Column with Random Forest Classifier

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer

In [3]:
# Initialize the dictionary to store label encoders
label_encoders = {}

## EXPLORE THE DATASET

In [5]:
# Load your dataset
df = pd.read_csv('dataset3.csv')

## CLEAN THE DATASET

In [7]:
# Clean 'INCOME' and other columns by removing symbols and converting to numeric
def clean_numeric_column(df, column):
    df[column] = df[column].replace(r'[$,]', '', regex=True).astype(float)

columns_to_convert = ['INCOME', 'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM', 'CLM_AMT']
for column in columns_to_convert:
    clean_numeric_column(df, column)

# Drop the 'BIRTH' column if not needed
df = df.drop('BIRTH', axis=1)

## CLEAN THE DATASET

In [9]:
# Encode categorical columns including 'OCCUPATION'
le_occupation = LabelEncoder()
df['OCCUPATION_Encoded'] = le_occupation.fit_transform(df['OCCUPATION'].astype(str))

## PREPARE THE DATASET

In [11]:
# List of categorical columns to encode
categorical_columns = ['PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'CAR_USE',
                       'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY']

# Apply LabelEncoder to each categorical column
for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column].astype(str))
    label_encoders[column] = le

In [12]:
# Split the data into rows with known and unknown 'OCCUPATION'
train_data = df[df['OCCUPATION'].notnull()]
test_data = df[df['OCCUPATION'].isnull()]

In [13]:
# Ensure there are rows in X_test for prediction
if test_data.empty:
    print("No missing 'OCCUPATION' values to predict.")
else:
    # Select features for training (drop 'OCCUPATION' and any non-relevant columns)
    X_train = train_data.drop(['OCCUPATION', 'OCCUPATION_Encoded'], axis=1)
    y_train = train_data['OCCUPATION_Encoded']

    X_test = test_data.drop(['OCCUPATION', 'OCCUPATION_Encoded'], axis=1)

In [14]:
# Impute missing values (for both X_train and X_test)
imputer = SimpleImputer(strategy='mean')  # Use mean for numerical columns
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [15]:
# Handle Class Imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [16]:
# Hyperparameter tuning using RandomizedSearchCV
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Create a RandomForestClassifier with randomized search
clf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(clf, param_distributions=param_grid, n_iter=10, cv=3, random_state=42, n_jobs=-1)
random_search.fit(X_train_resampled, y_train_resampled)

In [17]:
# Feature Selection based on importance
selector = SelectFromModel(random_search.best_estimator_, threshold='median')
X_train_selected = selector.fit_transform(X_train_resampled, y_train_resampled)
X_test_selected = selector.transform(X_test)

## TRAIN THE MODEL

In [19]:
# Train the best model on the selected features
best_clf = random_search.best_estimator_
best_clf.fit(X_train_selected, y_train_resampled)

In [20]:
# Predict missing 'OCCUPATION' values for the test data
predicted_occupation_encoded = best_clf.predict(X_test_selected)

In [21]:
# Convert the predicted encoded values back to their original categorical values
predicted_occupation = le_occupation.inverse_transform(predicted_occupation_encoded)

# Replace the missing 'OCCUPATION' values in the original dataframe
df.loc[df['OCCUPATION'].isnull(), 'OCCUPATION'] = predicted_occupation

In [22]:
# Remove the encoded column if no longer needed
df = df.drop('OCCUPATION_Encoded', axis=1)

In [23]:
# Check if missing values are imputed
print(df['OCCUPATION'].isnull().sum())

0


In [24]:
# Save the imputed dataset to a CSV file
df.to_csv('RFCimputed_occupation_dataset3.csv', index=False)

## ACCURACY

In [27]:
# Calculate the accuracy using a hold-out validation set
# Split train_data into train/validation set for accuracy check
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train_selected, y_train_resampled, test_size=0.2, random_state=42)

best_clf.fit(X_train_split, y_train_split)  # Train on part of the data
y_val_pred = best_clf.predict(X_val)  # Predict on validation set

In [29]:
# Accuracy and classification report
accuracy = accuracy_score(y_val, y_val_pred)
print("Accuracy of RandomForestClassifier on validation set:", accuracy)
print("Classification report:\n", classification_report(y_val, y_val_pred))

Accuracy of RandomForestClassifier on validation set: 0.761813712100519
Classification report:
               precision    recall  f1-score   support

           0       0.67      0.72      0.69       503
           1       0.91      1.00      0.95       462
           2       0.83      0.78      0.81       464
           3       0.78      0.95      0.86       438
           4       0.63      0.51      0.56       432
           5       0.64      0.52      0.58       456
           6       0.87      0.81      0.84       462
           8       0.71      0.80      0.75       444

    accuracy                           0.76      3661
   macro avg       0.76      0.76      0.76      3661
weighted avg       0.76      0.76      0.76      3661



In [31]:
# Final dataset with 'OCCUPATION' back in categorical form
print(df['OCCUPATION'].head(30))
print(df.isnull().sum())

0      Professional
1     z_Blue Collar
2           Manager
3          Clerical
4     z_Blue Collar
5            Doctor
6     z_Blue Collar
7     z_Blue Collar
8           Manager
9     z_Blue Collar
10    z_Blue Collar
11         Clerical
12     Professional
13           Lawyer
14     Professional
15          Manager
16          Manager
17         Clerical
18       Home Maker
19         Clerical
20          Manager
21           Lawyer
22           Lawyer
23         Clerical
24    z_Blue Collar
25    z_Blue Collar
26           Lawyer
27    z_Blue Collar
28       Home Maker
29     Professional
Name: OCCUPATION, dtype: object
ID              0
KIDSDRIV        0
AGE             7
HOMEKIDS        0
YOJ           548
INCOME        570
PARENT1         0
HOME_VAL      575
MSTATUS         0
GENDER          0
EDUCATION       0
OCCUPATION      0
TRAVTIME        0
CAR_USE         0
BLUEBOOK        0
TIF             0
CAR_TYPE        0
RED_CAR         0
OLDCLAIM        0
CLM_FREQ        0
REVOKED 