In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Registry of fitted encoders, keyed by column name.
label_encoders = dict()


In [4]:
# Read the CSV into a DataFrame (update the path to point at your copy of the file).
df = pd.read_csv(filepath_or_buffer='dataset3.csv')


In [5]:
def clean_numeric_column(df, column):
    """Convert one column of ``df`` to floats, in place.

    Every ``$`` and ``,`` character is removed from the values of
    ``df[column]`` and the result is cast to ``float``. The converted
    Series is written back onto ``df``; nothing is returned.
    """
    stripped = df[column].replace(to_replace=r'[$,]', value='', regex=True)
    df[column] = stripped.astype(float)

# Columns that are stripped of '$' and ',' and cast to float.
columns_to_convert = [
    'INCOME',
    'HOME_VAL',
    'BLUEBOOK',
    'OLDCLAIM',
    'CLM_AMT',
]
for currency_column in columns_to_convert:
    clean_numeric_column(df, currency_column)


In [6]:
# Remove the BIRTH column from the working frame.
df = df.drop(columns=['BIRTH'])


In [7]:
# Integer-encode OCCUPATION into a helper column; the fitted encoder is kept
# so the codes can be mapped back to the original labels later.
# NOTE(review): the values are cast to str first, which presumably turns a
# missing occupation into its own label instead of failing - confirm for the
# pandas version in use.
le_occupation = LabelEncoder().fit(df['OCCUPATION'].astype(str))
df['OCCUPATION_Encoded'] = le_occupation.transform(df['OCCUPATION'].astype(str))



In [8]:
# Categorical columns that are label-encoded.
categorical_columns = [
    'PARENT1', 'MSTATUS', 'GENDER',
    'EDUCATION', 'CAR_USE', 'CAR_TYPE',
    'RED_CAR', 'REVOKED', 'URBANICITY',
]


In [9]:
# Fit one encoder per categorical column, overwrite the column with its
# integer codes, and store the fitted encoder under the column name.
for column in categorical_columns:
    as_text = df[column].astype(str)
    le = LabelEncoder().fit(as_text)
    label_encoders[column] = le
    df[column] = le.transform(as_text)


In [10]:
# Rows with a known OCCUPATION form the training set; rows where it is
# missing are the ones to predict.
train_data = df.loc[df['OCCUPATION'].notna()]
test_data = df.loc[df['OCCUPATION'].isna()]



In [11]:
# Step 1: Select features for training (drop 'OCCUPATION' and its encoded copy).
# The feature/target frames are bound unconditionally: the training,
# prediction and validation cells that follow reference X_train, y_train and
# X_test whether or not any 'OCCUPATION' value is missing, so defining them
# only in an `else` branch leaves those cells with a NameError.
X_train = train_data.drop(['OCCUPATION', 'OCCUPATION_Encoded'], axis=1)
y_train = train_data['OCCUPATION_Encoded']

# Same feature columns for the rows to impute (an empty frame when nothing is missing).
X_test = test_data.drop(['OCCUPATION', 'OCCUPATION_Encoded'], axis=1)

if test_data.empty:
    print("No missing 'OCCUPATION' values to predict.")

In [12]:
    # Step 2: Fit a random forest (100 trees, fixed seed) on the training features
    clf = RandomForestClassifier(random_state=42, n_estimators=100)
    clf = clf.fit(X_train, y_train)


In [14]:
  # Step 3: Run the fitted classifier over the rows whose 'OCCUPATION' is missing
predicted_occupation_encoded = clf.predict(X=X_test)


In [16]:
# Step 4: Map the predicted integer codes back to the original category labels
predicted_occupation = le_occupation.inverse_transform(y=predicted_occupation_encoded)
    

In [18]:
# Step 5: Write the predicted labels into the rows of the original dataframe
# where 'OCCUPATION' is missing
occupation_missing = df['OCCUPATION'].isna()
df.loc[occupation_missing, 'OCCUPATION'] = predicted_occupation


In [19]:
# Remove the helper column that held the integer-encoded OCCUPATION.
df = df.drop(columns=['OCCUPATION_Encoded'])



In [20]:
 # Step 7: Estimate accuracy using a hold-out validation set
# Split the training rows 80/20 (fixed seed) into a train/validation pair.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Validate with a separate estimator built from the same parameters as `clf`.
# Refitting `clf` itself would silently replace the model that was trained on
# all training rows, so re-running the prediction cell afterwards would use a
# model fitted on only 80% of the data.
val_clf = RandomForestClassifier(**clf.get_params())
val_clf.fit(X_train_split, y_train_split)  # Train on part of the data
y_val_pred = val_clf.predict(X_val)  # Predict on validation set


In [21]:
    # Overall accuracy, then the per-class precision/recall/F1 breakdown
    accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
    print("Accuracy of RandomForestClassifier on validation set:", accuracy)
    print(
        "Classification report:\n",
        classification_report(y_true=y_val, y_pred=y_val_pred),
    )


Accuracy of RandomForestClassifier on validation set: 0.6628630705394191
Classification report:
               precision    recall  f1-score   support

           0       0.58      0.71      0.64       307
           1       0.48      0.85      0.62        47
           2       0.83      0.68      0.75       145
           3       0.63      0.90      0.74       190
           4       0.51      0.24      0.32       261
           5       0.56      0.47      0.51       296
           6       0.85      0.78      0.81       188
           8       0.75      0.82      0.78       494

    accuracy                           0.66      1928
   macro avg       0.65      0.68      0.65      1928
weighted avg       0.66      0.66      0.65      1928



In [23]:
# Final dataset: show the first 30 'OCCUPATION' values (category labels again)
# and the per-column count of remaining nulls.
print(df.loc[:, 'OCCUPATION'].head(n=30))
print(df.isna().sum())

0      Professional
1     z_Blue Collar
2           Manager
3          Clerical
4     z_Blue Collar
5            Doctor
6     z_Blue Collar
7     z_Blue Collar
8           Manager
9     z_Blue Collar
10    z_Blue Collar
11         Clerical
12     Professional
13           Lawyer
14     Professional
15          Manager
16          Manager
17         Clerical
18       Home Maker
19         Clerical
20          Manager
21           Lawyer
22           Lawyer
23         Clerical
24    z_Blue Collar
25    z_Blue Collar
26           Lawyer
27    z_Blue Collar
28       Home Maker
29     Professional
Name: OCCUPATION, dtype: object
ID              0
KIDSDRIV        0
AGE             7
HOMEKIDS        0
YOJ           548
INCOME        570
PARENT1         0
HOME_VAL      575
MSTATUS         0
GENDER          0
EDUCATION       0
OCCUPATION      0
TRAVTIME        0
CAR_USE         0
BLUEBOOK        0
TIF             0
CAR_TYPE        0
RED_CAR         0
OLDCLAIM        0
CLM_FREQ        0
REVOKED 