In [None]:
# import pandas as pd
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
# from sklearn.preprocessing import StandardScaler

# # Load the dataset
# df = pd.read_csv('Cleaned_KL_Housing_Dataset.csv')

# # Feature Engineering: Creating interaction terms
# df['Rooms*Bathrooms'] = df['Rooms'] * df['Bathrooms']
# df['Rooms*Sqft'] = df['Rooms'] * df['Sqft']

# # Separate features and target variable
# X = df.drop('Price', axis=1)
# y = df['Price']

# # One-hot encode categorical features
# X = pd.get_dummies(X)

# # Standardizing numerical features
# scaler = StandardScaler()
# X[['Rooms', 'Bathrooms', 'Car Parks', 'Sqft']] = scaler.fit_transform(X[['Rooms', 'Bathrooms', 'Car Parks', 'Sqft']])

# # Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define parameter grid for hyperparameter tuning
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'bootstrap': [True, False]
# }

# # Create a GridSearchCV object for hyperparameter tuning
# grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)

# # Print the best parameters
# print(f'Best Parameters: {grid_search.best_params_}')
# best_model = grid_search.best_estimator_

# # Evaluate the best model
# score = best_model.score(X_test, y_test)
# print(f'Optimized Model Score: {score}')

# # Perform cross-validation to evaluate the model
# cv_scores = cross_val_score(best_model, X, y, cv=5)
# print(f'Cross-Validation Scores: {cv_scores}')
# print(f'Average CV Score: {cv_scores.mean()}')


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read the cleaned housing table from disk
df = pd.read_csv('Cleaned_KL_Housing_Dataset.csv')

# 'Price' is the regression target; every remaining column is a predictor.
y = df['Price']

# Drop the target column, then one-hot encode the categorical features
# in a single expression (df itself is left unmodified).
X = pd.get_dummies(df.drop(columns=['Price']))

# Hold out 20% of the rows for the final evaluation; the fixed seed makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate hyperparameter values to search over
# (3 * 4 * 3 * 3 = 108 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator with a fixed seed. It is only instantiated here; the actual
# fitting happens inside the grid search below.
rf_clf = RandomForestRegressor(random_state=42)

# Grid search over param_grid with 5-fold cross-validation on the training
# split, scored by R^2, using all available cores.
grid_search = GridSearchCV(
    rf_clf,
    param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best parameters found by the search, and the estimator the search exposes
# for them (no separate "best score" is reported here).
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')
best_rf_clf = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test split
y_pred = best_rf_clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Model Score (R^2): {r2}')
print(f'Mean Squared Error: {mse}')

# Cross-validation score: 5-fold R^2 over the full X, y (not only the
# training split).
# NOTE(review): these folds include rows that were also used to pick the
# hyperparameters above, so this is a stability check rather than an
# independent estimate -- confirm that is the intent.
# n_jobs=-1 runs the folds in parallel, matching the grid search; it is
# expected to affect wall-clock time only, not the fold assignment or scores.
cv_scores = cross_val_score(best_rf_clf, X, y, cv=5, scoring='r2', n_jobs=-1)
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean Cross-Validation Score: {cv_scores.mean()}')


Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Model Score (R^2): 0.8773788092360661
Mean Squared Error: 0.15365120409952188
Cross-Validation Scores: [0.86755219 0.88573145 0.88152221 0.88322638 0.89413282]
Mean Cross-Validation Score: 0.8824330086962634
