In [1]:
import pandas as pd
import numpy as np

In [2]:
import joblib
# Load the DataFrame from the Joblib file
data = joblib.load('my_dataframe.joblib')

In [3]:
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

In [4]:
categorical_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_columns = X.select_dtypes(include=[np.number]).columns.tolist()

In [5]:
from sklearn.preprocessing import LabelEncoder
label_encoders = {}
# Apply Label Encoding to each categorical column
for col in categorical_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [7]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state = 42)
model.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

In [8]:
y_pred = model.predict(X_test)

In [9]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.8461682723813957

In [10]:
from sklearn.metrics import mean_squared_error
# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("Root Mean Squared Error (RMSE):", rmse)

#Calculate relative rmse
mean_saleprice = data['SalePrice'].mean()
relative_rmse = (rmse / mean_saleprice) * 100
print("Relative RMSE (%):", relative_rmse)

Root Mean Squared Error (RMSE): 27718.53697423017
Relative RMSE (%): 15.612136362946039


In [11]:
from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation (cv=5), scoring based on negative mean squared error
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')

# Calculate the average score
mean_mse = np.mean(scores)
mean_rmse = np.sqrt(-mean_mse)  # Convert to RMSE (Root Mean Squared Error)

relative_rmse = (mean_rmse / mean_saleprice) * 100

print(f"Mean RMSE: {mean_rmse}")
print("Relative RMSE (%):", relative_rmse)

Mean RMSE: 27940.834129181287
Relative RMSE (%): 15.737342592243722


In [12]:
importances = model.feature_importances_

indices = np.argsort(importances)[::-1]
sorted_features = [(X.columns[i], importances[i]) for i in indices]
print(sorted_features)

[('OverallQual_GrLivArea', 0.6676117226540305), ('TotalLivingSF', 0.051238913589094354), ('BsmtQual', 0.03214474497204485), ('OverallQual', 0.025105878672040595), ('GarageCars', 0.024594697637698636), ('2ndFlrSF', 0.023885766384633713), ('HouseAge', 0.01659437351472556), ('YearBuilt', 0.013982765782331522), ('LotArea', 0.013466756945616283), ('YearRemodAdd', 0.011014494674343001), ('GarageType', 0.007537688921578132), ('GrLivArea', 0.007295220262441883), ('BsmtUnfSF', 0.0072679680783207315), ('Neighborhood', 0.00716864236741735), ('LotFrontage', 0.006765703317509314), ('BsmtFinType1', 0.005748174922177605), ('BsmtExposure', 0.0052566814521663036), ('MasVnrArea', 0.00517336153688833), ('SaleCondition', 0.004739254848185802), ('OpenPorchSF', 0.004400134998356248), ('OverallCond', 0.0040757370971265805), ('KitchenQual', 0.0037997295352702413), ('ExterQual', 0.003593990059765223), ('MoSold', 0.0031885479240127816), ('WoodDeckSF', 0.003002299549793977), ('CentralAir', 0.0029677589441560715)

In [13]:
feature_names = [feature[0] for feature in sorted_features]
# Separate the top features into categorical and numerical
top_numerical_features = [feature for feature in feature_names if feature in numerical_columns]
top_categorical_features = [feature for feature in feature_names if feature in categorical_columns]
print("\nTop Numerical Features:")
print(top_numerical_features[0:10])

print("\nTop Categorical Features:")
print(top_categorical_features[0:10])


Top Numerical Features:
['OverallQual_GrLivArea', 'TotalLivingSF', 'BsmtQual', 'OverallQual', 'GarageCars', '2ndFlrSF', 'HouseAge', 'YearBuilt', 'LotArea', 'YearRemodAdd']

Top Categorical Features:
['GarageType', 'Neighborhood', 'SaleCondition', 'LandContour', 'Exterior1st', 'MSZoning', 'Exterior2nd', 'GarageFinish', 'MasVnrType', 'LotShape']


In [21]:
# Get the top 20 most important features
top_features = sorted_features[0:20]

# Extract the feature names
top_feature_names = [feature[0] for feature in top_features]

# Print the results
print("Top 20 Features (all):")
print(top_feature_names)


Top 20 Features (all):
['OverallQual_GrLivArea', 'TotalLivingSF', 'tot_qual', 'YearBuilt', 'BsmtQual', 'GarageCars', 'YearRemodAdd', 'LotArea', 'OverallQual', 'BsmtUnfSF', 'GrLivArea', 'FireplaceQu', '2ndFlrSF', 'OverallCond', 'BsmtFinType1', 'MSZoning_RM', 'LotFrontage', 'Neighborhood_L2', 'WoodDeckSF', 'BsmtExposure', 'MoSold', 'OpenPorchSF', 'LandContour_Bnk', 'BsmtFullBath', 'MSSubClass', 'Neighborhood_L5', 'GarageType_Attchd', 'YrSold', 'MasVnrArea', 'MSZoning_RL', 'CentralAir', 'GarageFinish_Unf', 'LotShape_Reg', 'LandContour_HLS', 'HeatingQC', 'BedroomAbvGr', 'SaleCondition_Abnorml', 'LandContour_Lvl', 'GarageQual', 'HalfBath']


In [22]:
X_imprt=X[top_feature_names]

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_imprt, y, test_size = 0.25, random_state = 0)

### Hyper parameter tuning for random forest

In [212]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50,100,150,200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf= RandomForestRegressor(random_state=0)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best RMSE from Grid Search: ", -grid_search.best_score_)

Best parameters found:  {'bootstrap': False, 'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best RMSE from Grid Search:  21037.733309407435


In [24]:
#  new model with the best parameters
best_rf_model = RandomForestRegressor(
    bootstrap=False,
    max_depth=20,
    max_features='log2',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=200,
    random_state=42
)

# Fit the model to the training data
best_rf_model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=False, max_depth=20, max_features='log2',
                      n_estimators=200, random_state=42)

In [25]:
y_pred = best_rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

Root Mean Squared Error: 13639.217450189537
R-squared: 0.9576336649429962
