In [2]:

import pandas as pd

# Load the dataset
housing_data = pd.read_csv("Housing.csv")

# Print the schema summary (dtypes and non-null counts per column).
# DataFrame.info() writes to stdout and returns None, so it is called as a
# plain statement; putting it in the cell's final expression would display a
# stray `None` next to the preview.
housing_data.info()

# Keep head() as the cell's last expression so the first rows are rendered
# as a rich table rather than as the plain-text repr of a tuple element.
housing_data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


(      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
 0  13300000  7420         4          2        3      yes        no       no   
 1  12250000  8960         4          4        4      yes        no       no   
 2  12250000  9960         3          2        2      yes        no      yes   
 3  12215000  7500         4          2        2      yes        no      yes   
 4  11410000  7420         4          1        2      yes       yes      yes   
 
   hotwaterheating airconditioning  parking prefarea furnishingstatus  
 0              no             yes        2      yes        furnished  
 1              no             yes        3       no        furnished  
 2              no              no        2      yes   semi-furnished  
 3              no             yes        3      yes        furnished  
 4              no             yes        2       no        furnished  ,
 None)

In [4]:

# Count the missing values in each column once and keep the result in
# `missing_values` so it can be both displayed and reused.
missing_values = housing_data.isnull().sum()

# No imputation is applied in this notebook. If any count below is non-zero,
# fill the gaps before modelling; note that a blanket
# `housing_data.fillna(housing_data.mean())` is not suitable here because the
# frame also contains text (object) columns that have no mean.
missing_values


price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of the target variable (price)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(housing_data['price'], kde=True, ax=ax)
ax.set_title('Distribution of Housing Prices')
plt.show()

# Correlation heatmap
# Correlation is only defined for numeric data. At this point in a
# top-to-bottom run the yes/no and furnishing columns are still text, and
# pandas >= 2.0 raises a ValueError if corr() is asked to process them, so
# restrict the frame to its numeric columns first.
numeric_columns = housing_data.select_dtypes(include='number')
corr_matrix = numeric_columns.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


In [6]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Preprocessing: Convert categorical variables to numeric using LabelEncoder.
# The fitted encoder for each column is kept in `label_encoders` so the
# integer codes can be mapped back to the original labels later.
label_encoders = {}
categorical_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 
                       'airconditioning', 'prefarea', 'furnishingstatus']

# Encode into a copy instead of overwriting `housing_data`. Every encoder is
# fitted on the raw text column, so re-running this cell gives the same
# result and the encoders keep the original labels; overwriting in place
# would make a second run fit the encoders on already-encoded integers.
housing_encoded = housing_data.copy()
for col in categorical_columns:
    le = LabelEncoder()
    housing_encoded[col] = le.fit_transform(housing_data[col])
    label_encoders[col] = le

# Define the features (X) and target (y)
X = housing_encoded.drop('price', axis=1)
y = housing_encoded['price']

# Split the data into training and test sets (80% train, 20% test).
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Build a Random Forest regressor with a fixed seed and fit it on the
# training split. fit() returns the fitted estimator itself, so construction
# and training are chained into one statement.
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict prices for the held-out test features.
y_pred = rf_model.predict(X_test)


In [None]:

# Score the test-set predictions against the true prices.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

# RMSE is the square root of the MSE, which puts the error back in the same
# units as the target.
rmse = mse ** 0.5

# Show both headline metrics as the cell's output.
(
    f"Mean Absolute Error: {mae}",
    f"Root Mean Squared Error: {rmse}",
)
