In [179]:
import pandas as pd
import numpy as np

In [180]:
import joblib

# Load the DataFrame from the Joblib file (the path is relative to the
# kernel's working directory).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load files that come from a trusted source.
data = joblib.load('my_dataframe.joblib')


In [181]:
# Separate the regression target (SalePrice) from the predictor columns.
# `drop` returns a new frame, so `data` itself is left untouched.
y = data['SalePrice']
X = data.drop('SalePrice', axis=1)

In [182]:
# Alternative (disabled): one-hot encode the categorical columns with pd.get_dummies
#categorical_columns=data.select_dtypes(include=['object']).columns.tolist()
#X=pd.get_dummies(X, columns=categorical_columns)

In [183]:
from sklearn.preprocessing import LabelEncoder

# Columns of X stored with the `object` dtype are treated as categorical.
categorical_columns = list(X.select_dtypes(include=['object']).columns)

# One dedicated encoder per categorical column, keyed by column name, so the
# integer codes can later be mapped back to the original labels.
label_encoders = {name: LabelEncoder() for name in categorical_columns}

# Fit each encoder on its column and overwrite that column of X with the
# resulting integer codes (X is modified in place).
for col, le in label_encoders.items():
    X[col] = le.fit_transform(X[col])

In [184]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [185]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest: 50 trees, seeded so repeated runs give the same model.
model = RandomForestRegressor(random_state=42, n_estimators=50)

# Fit on the training split. Kept as the cell's last expression so the
# notebook displays the fitted estimator.
model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=50, random_state=42)

In [186]:
# Predictions of the fitted `model` for the rows of X_test.
y_pred = model.predict(X_test)

In [187]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the test predictions; left as the
# cell's last expression so the notebook displays the value.
r2_score(y_true=y_test, y_pred=y_pred)

0.9110606338626234

In [188]:
from sklearn.metrics import mean_squared_error

# Root mean squared error of the test predictions, in the same units as the
# target (SalePrice).
# The square root is taken explicitly: the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# where passing it raises a TypeError. This form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 20441.36825123633


In [189]:
# Express the RMSE as a percentage of the average sale price.
# NOTE(review): the average is taken over every row of `data`, not only the
# test split the RMSE was computed on.
mean_saleprice = data['SalePrice'].mean()
relative_rmse = 100 * (rmse / mean_saleprice)
print("Relative RMSE (%):", relative_rmse)

Relative RMSE (%): 11.527204817828874


In [190]:
# Importance score of each input column as reported by the fitted forest.
importances = model.feature_importances_

# Column positions ordered from the most to the least important feature.
indices = np.argsort(importances)[::-1]

# Pair every column name with its score, in that ranked order.
sorted_features = list(zip(X.columns[indices], importances[indices]))
print(sorted_features)

[('OverallQual', 0.5955918015418379), ('GrLivArea', 0.10306461798605387), ('Neighborhood', 0.03869450335392112), ('TotalBsmtSF', 0.03174148591232656), ('GarageCars', 0.026589769192080278), ('BsmtFinSF1', 0.02450261882572635), ('GarageArea', 0.021878648225106184), ('1stFlrSF', 0.019738206748977406), ('LotArea', 0.014839293794333576), ('BsmtQual', 0.010531415234638521), ('YearBuilt', 0.009470127024004321), ('YearRemodAdd', 0.008504343542424806), ('GarageType', 0.007906215259954113), ('OverallCond', 0.005688394239819482), ('LotFrontage', 0.0053799734914900576), ('OpenPorchSF', 0.005253653971644209), ('Fireplaces', 0.004619963512182163), ('BsmtUnfSF', 0.0045768349365411945), ('LandContour', 0.004363180301009729), ('WoodDeckSF', 0.004030363886197135), ('MoSold', 0.003936440412085286), ('2ndFlrSF', 0.0039228277826284024), ('KitchenQual', 0.003360054547380838), ('MSSubClass', 0.003143477773517986), ('Exterior1st', 0.0030560588707552487), ('MasVnrArea', 0.002983299020455788), ('TotRmsAbvGrd', 

In [191]:
# Keep the 30 highest-ranked (name, importance) pairs.
top_features = sorted_features[:30]

# Strip the scores, leaving only the column names in ranked order.
top_feature_names = [name for name, _ in top_features]
print(top_feature_names)

['OverallQual', 'GrLivArea', 'Neighborhood', 'TotalBsmtSF', 'GarageCars', 'BsmtFinSF1', 'GarageArea', '1stFlrSF', 'LotArea', 'BsmtQual', 'YearBuilt', 'YearRemodAdd', 'GarageType', 'OverallCond', 'LotFrontage', 'OpenPorchSF', 'Fireplaces', 'BsmtUnfSF', 'LandContour', 'WoodDeckSF', 'MoSold', '2ndFlrSF', 'KitchenQual', 'MSSubClass', 'Exterior1st', 'MasVnrArea', 'TotRmsAbvGrd', 'GarageFinish', 'CentralAir', 'BedroomAbvGr']


In [192]:
# Restrict the feature matrix to the columns listed in top_feature_names.
X_imprt=X[top_feature_names]

In [193]:
# Re-split using only the selected features. The seed and test size match the
# first split (random_state=42), so exactly the same rows end up in the test
# set. With a different seed, rows whose targets were used to rank the
# features would leak into the test set and inflate the evaluation.
# NOTE(review): hyper-parameters tuned on a differently seeded split should be
# re-tuned on this one.
X_train, X_test, y_train, y_test = train_test_split(X_imprt, y, test_size = 0.25, random_state = 42)

### Hyperparameter tuning for random forest

In [194]:
# Disabled grid search over RandomForestRegressor hyper-parameters. The code is
# wrapped in a string literal, so nothing below is executed; the cell only
# echoes the string.
# NOTE(review): presumably the source of the "best parameters" hard-coded in
# the following cell -- confirm before relying on them.
""" from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [10, 20, 40,80,100],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestRegressor(random_state=0)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best RMSE from Grid Search: ", -grid_search.best_score_)

"""

' from sklearn.model_selection import GridSearchCV\n\nparam_grid = {\n    \'n_estimators\': [10, 20, 40,80,100],\n    \'max_depth\': [None, 10, 20, 30],\n    \'min_samples_split\': [2, 5, 10],\n    \'min_samples_leaf\': [1, 2, 4],\n    \'bootstrap\': [True, False]\n}\n\nrf = RandomForestRegressor(random_state=0)\ngrid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring=\'neg_root_mean_squared_error\')\ngrid_search.fit(X_train, y_train)\n\nprint("Best parameters found: ", grid_search.best_params_)\nprint("Best RMSE from Grid Search: ", -grid_search.best_score_)\n\n'

In [195]:
#  new model with the best parameters
best_rf_model = RandomForestRegressor(
    bootstrap=True,
    max_depth=20,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100,
    random_state=42
)

# Fit the model to the training data
best_rf_model.fit(X_train, y_train)

RandomForestRegressor(max_depth=20, random_state=42)