In [3]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import joblib

# Load the cleaned dataframe.
df = pd.read_csv("/content/drive/MyDrive/ML PROJECT DATASETS/sales_pred/chennai_housing_fully_imputed.csv")

# Identify features and target.
# The `SALES_PRICE` is the target variable.
# `PRT_ID` and `DATE_SALE` are not useful for a model predicting price.
features = df.drop(['SALES_PRICE', 'PRT_ID'], axis=1)
target = df['SALES_PRICE']

# Handle categorical features by one-hot encoding.
# The selected columns are all the object type columns.
categorical_features = features.select_dtypes(include=['object']).columns
features = pd.get_dummies(features, columns=categorical_features, drop_first=True)

# Split the data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Regressor model.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Get the list of feature names AFTER one-hot encoding and training.
feature_list = list(features.columns)

# Save the trained model and the feature list.
# These files should be placed in the same directory as your Flask app.
joblib.dump(feature_list, '/content/drive/MyDrive/ML PROJECT DATASETS/sales_pred/model_features.joblib')


# Make predictions on the test set.
predictions = model.predict(X_test)

# Evaluate the model's performance.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse) # Calculate RMSE manually
r2 = r2_score(y_test, predictions)

# Print the evaluation metrics.
print("Model Performance:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")

joblib.dump(model, '/content/drive/MyDrive/ML PROJECT DATASETS/sales_pred/random_forest_regressor_model.joblib')

Model Performance:
Mean Absolute Error (MAE): 505504.72
Mean Squared Error (MSE): 408899971327.70
Root Mean Squared Error (RMSE): 639452.87
R-squared (R2) Score: 0.97
