In [2]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

In [3]:
# Read the dataset from 'sample_data.csv' (path is relative to the notebook's
# working directory) into a DataFrame and preview the first rows.
df = pd.read_csv('sample_data.csv')
df.head()

Unnamed: 0,House_Type,Size,Bedrooms,Bathrooms,Floor,Furnished,For_rent,Region,City,Price
0,0,200,3,3,1,0,Yes,55,1,12000
1,0,130,3,2,2,1,Yes,55,1,8500
2,0,130,2,1,3,0,Yes,55,1,5000
3,0,200,3,1,0,0,Yes,54,1,8500
4,0,160,3,2,0,0,Yes,55,1,7000


In [4]:
# Derived feature: element-wise sum of the bedroom and bathroom counts.
df['no_of_rooms'] = df['Bedrooms'].add(df['Bathrooms'])
df.head()

Unnamed: 0,House_Type,Size,Bedrooms,Bathrooms,Floor,Furnished,For_rent,Region,City,Price,no_of_rooms
0,0,200,3,3,1,0,Yes,55,1,12000,6
1,0,130,3,2,2,1,Yes,55,1,8500,5
2,0,130,2,1,3,0,Yes,55,1,5000,3
3,0,200,3,1,0,0,Yes,54,1,8500,4
4,0,160,3,2,0,0,Yes,55,1,7000,5


In [5]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6782 entries, 0 to 6781
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   House_Type   6782 non-null   int64 
 1   Size         6782 non-null   int64 
 2   Bedrooms     6782 non-null   int64 
 3   Bathrooms    6782 non-null   int64 
 4   Floor        6782 non-null   int64 
 5   Furnished    6782 non-null   int64 
 6   For_rent     6782 non-null   object
 7   Region       6782 non-null   int64 
 8   City         6782 non-null   int64 
 9   Price        6782 non-null   int64 
 10  no_of_rooms  6782 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 583.0+ KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,House_Type,Size,Bedrooms,Bathrooms,Floor,Furnished,Region,City,Price,no_of_rooms
count,6782.0,6782.0,6782.0,6782.0,6782.0,6782.0,6782.0,6782.0,6782.0,6782.0
mean,0.388234,161.253465,2.67399,2.015777,2.937187,0.388676,52.032586,1.026246,13831.794456,4.689767
std,1.411541,67.913674,0.73107,0.836361,2.813613,0.487485,15.026407,0.557202,15892.072568,1.389453
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2000.0,2.0
25%,0.0,115.0,2.0,1.0,1.0,0.0,43.0,1.0,6500.0,4.0
50%,0.0,150.0,3.0,2.0,2.0,0.0,55.0,1.0,10000.0,5.0
75%,0.0,200.0,3.0,3.0,4.0,1.0,62.0,1.0,17000.0,6.0
max,7.0,400.0,5.0,4.0,12.0,1.0,78.0,2.0,300000.0,9.0


In [7]:
# Target vector: the 'Price' column.
y = df['Price']
# Feature matrix: everything except the target and 'For_rent'
# ('For_rent' is the only object-dtype column according to df.info()).
X = df.drop(columns=['Price', 'For_rent'])

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# LinearRegression

In [9]:
# Fit a LinearRegression (default settings) on the training split.
# `model` is reused by the later save / predict cells.
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [16]:
# Persist the fitted LinearRegression (`model`) to 'model3.pkl' with joblib.
# Reload later with joblib.load('model3.pkl'); joblib files are pickles, so
# only load them from trusted sources.
joblib.dump(model, 'model3.pkl')

['model3.pkl']

In [17]:
# Predict on the held-out test features with the fitted `model`.
y_pred = model.predict(X_test)

In [18]:
# Evaluate the linear model's predictions on the held-out test set.
# RMSE is derived from MSE explicitly: the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# and this form matches how the other model cells compute RMSE.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

# Calculate R-squared (R2) score
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2) Score: {r2}")

Root Mean Squared Error (RMSE): 15617.660354775175
R-squared (R2) Score: 0.12736498525918372


# GradientBoostingRegressor (baseline, fixed hyperparameters)

In [12]:
# Build and fit a gradient-boosting baseline in one step (fit returns the estimator).
gb_regressor = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out test set.
y_pred = gb_regressor.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # RMSE is the square root of MSE

print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2) Score: {r2}")

Root Mean Squared Error (RMSE): 15012.013796250443
R-squared (R2) Score: 0.1937335335269348


## XGBRegressor


In [13]:
# Build and fit the XGBoost regressor in one step (fit returns the estimator).
xgb_regressor = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out test set.
y_pred = xgb_regressor.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # RMSE is the square root of MSE

print(f"XGBoost Regression - Root Mean Squared Error (RMSE): {rmse}")
print(f"XGBoost Regression - R-squared (R2) Score: {r2}")

XGBoost Regression - Root Mean Squared Error (RMSE): 13999.144862634157
XGBoost Regression - R-squared (R2) Score: 0.29886168126816626


## RandomForestRegressor

In [14]:
# Build and fit the random forest in one step (fit returns the estimator).
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out test set.
y_pred = rf_regressor.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # RMSE is the square root of MSE

print(f"Random Forest Regression - Root Mean Squared Error (RMSE): {rmse}")
print(f"Random Forest Regression - R-squared (R2) Score: {r2}")

Random Forest Regression - Root Mean Squared Error (RMSE): 14926.798082532536
Random Forest Regression - R-squared (R2) Score: 0.20286109853563805


In [15]:
# Persist the fitted RandomForestRegressor (`rf_regressor`) with joblib.
# NOTE(review): despite the '.h5' extension this file is a joblib pickle, not
# an HDF5 file; the name is kept so anything already reading 'model2.h5'
# keeps working. Reload with joblib.load('model2.h5') from trusted sources only.
joblib.dump(rf_regressor, 'model2.h5')

['model2.h5']

## GradientBoostingRegressor (hyperparameter tuning with GridSearchCV)

In [38]:
# Base estimator whose hyperparameters are searched.
gb_regressor = GradientBoostingRegressor(random_state=42)

# Candidate values for each hyperparameter (3 x 3 x 3 = 27 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[3, 4, 5],
)

# Exhaustive 5-fold cross-validated search, scored by negative MSE,
# using all available cores.
grid_search = GridSearchCV(
    estimator=gb_regressor,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# The best estimator found by the search.
best_gb_regressor = grid_search.best_estimator_

# Score the tuned model on the held-out test set.
y_pred = best_gb_regressor.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # RMSE is the square root of MSE

print(f"Gradient Boosting Regression - Root Mean Squared Error (RMSE): {rmse}")
print(f"Gradient Boosting Regression - R-squared (R2) Score: {r2}")
print(f"Best Hyperparameters: {grid_search.best_params_}")

Gradient Boosting Regression - Root Mean Squared Error (RMSE): 14207.80505557657
Gradient Boosting Regression - R-squared (R2) Score: 0.27780468502148115
Best Hyperparameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200}
