In [44]:
import pandas as pd

# Load the held-out test set and impute missing values so the frame can be
# passed to the model without NaNs.
test_df = pd.read_csv('data/Test.csv')
categorical_cols = ["type_school", "school_accreditation", "gender", "interest", "residence", "parent_was_in_college"]
# Every remaining column except the row identifier is treated as continuous.
continuous_cols = [col for col in test_df.columns if col not in categorical_cols and col != 'UniqueID']

# NOTE(review): the fill values below are computed from Test.csv itself;
# confirm this matches how the training data was imputed.

# Categorical features: fill gaps with the most frequent value of the column.
# The filled column is assigned back instead of calling
# `test_df[col].fillna(..., inplace=True)`: that chained in-place form acts on
# a temporary object and does not update `test_df` under pandas Copy-on-Write.
for col in categorical_cols:
    most_frequent = test_df[col].mode()[0]
    test_df[col] = test_df[col].fillna(most_frequent)

# Continuous features: fill gaps with the column mean.
for col in continuous_cols:
    mean_value = test_df[col].mean()
    test_df[col] = test_df[col].fillna(mean_value)

# Keep the identifiers so predictions can be matched back to rows later.
X_unique_id = test_df['UniqueID']

# Displayed result: per-column count of remaining missing values.
test_df.isnull().sum()

UniqueID                 0
type_school              0
school_accreditation     0
gender                   0
interest                 0
residence                0
parent_age               0
parent_salary            0
house_area               0
average_grades           0
parent_was_in_college    0
dtype: int64

In [45]:
from sklearn.preprocessing import LabelEncoder

columns_to_encode = [
    "type_school",
    "school_accreditation",
    "gender",
    "interest",
    "residence",
    "parent_was_in_college",
]
le = LabelEncoder()

# Feature matrix: the row identifier and parent_age are both removed before
# prediction (presumably the model was trained without them — TODO confirm).
X = test_df.drop(columns=["UniqueID", "parent_age"])

# Replace each categorical column with integer codes. The single encoder is
# re-fit on every column, so after the loop `le` only holds the classes of the
# last column encoded.
# NOTE(review): the codes are derived from the categories present in the test
# data; verify they match the codes used when the model was trained.
for column in columns_to_encode:
    X[column] = le.fit_transform(X[column])

# Displayed result: the columns that will be passed to the model.
X.columns

Index(['type_school', 'school_accreditation', 'gender', 'interest',
       'residence', 'parent_salary', 'house_area', 'average_grades',
       'parent_was_in_college'],
      dtype='object')

In [46]:
import joblib

# Load the persisted model from disk and run it on the prepared feature matrix.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
model_filename = 'rf_model_latest.pkl'
loaded_model = joblib.load(model_filename)

# One prediction per row of X (presumably a random-forest model, going by the
# filename — TODO confirm).
y_pred = loaded_model.predict(X)

In [47]:
# Sanity check: the number of predictions should equal the number of saved IDs.
len(y_pred), len(X_unique_id)

(200, 200)

In [48]:
# Pair each prediction with its UniqueID and write the result file.
# pd.concat(axis=1) aligns rows by index label, so the prediction frame is
# built on the same index as X_unique_id. With a fresh RangeIndex, any
# non-default index on the IDs would misalign rows and pad them with NaN
# without an error. Passing the index also makes a length mismatch between
# predictions and IDs raise a ValueError here.
y_pred_df = pd.DataFrame({'y_pred': y_pred}, index=X_unique_id.index)
result_df = pd.concat([X_unique_id, y_pred_df], axis=1)
# index=False: the output file contains only the ID and prediction columns.
result_df.to_csv('rf_hypetune.csv', index=False)