In [58]:
# Q1.

In [59]:
import pandas as pd
import zipfile
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
import matplotlib.pyplot as plt

In [60]:
zip_file_path = 'Bengaluru_House_Data.zip'

In [61]:
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    with zip_ref.open('Bengaluru_House_Data.csv') as csv_file:
        df = pd.read_csv(csv_file)

In [62]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [63]:
df.shape

(13320, 9)

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [65]:
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [66]:
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [67]:
df.duplicated().sum()

529

In [68]:
df.drop_duplicates(inplace=True)

In [69]:
# Since the features 'Society', 'Availability', 'Area_Type', 'Location' are not crucial for our analysis and does not contribute significantly to the prediction of the house prices, I will consider dropping the entire features.

In [70]:
df.drop(['society'], axis = 1, inplace=True)
df.drop(['area_type'], axis = 1, inplace=True)
df.drop(['availability'], axis = 1, inplace=True)
df.drop(['location'], axis = 1, inplace=True)

In [77]:
mean_bath = round(df['bath'].mean() *2) / 2
mean_balcony = round(df['balcony'].mean() * 2) / 2
df['bath'].fillna(mean_bath, inplace=True)
df['balcony'].fillna(mean_balcony, inplace=True)

In [78]:
df['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', nan, '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [79]:
df['num_bedrooms'] = df['size'].str.extract('(\d+)').astype(float)

In [85]:
mean_num_bedrooms = round(df['num_bedrooms'].mean() * 2) / 2
df['num_bedrooms'].fillna(mean_num_bedrooms, inplace=True)

In [86]:
df.drop('size', axis=1, inplace=True)

In [87]:
df.isnull().sum()

total_sqft      0
bath            0
balcony         0
price           0
num_bedrooms    0
dtype: int64

In [88]:
df.head()

Unnamed: 0,total_sqft,bath,balcony,price,num_bedrooms
0,1056,2.0,1.0,39.07,2.0
1,2600,5.0,3.0,120.0,4.0
2,1440,2.0,3.0,62.0,3.0
3,1521,3.0,1.0,95.0,3.0
4,1200,2.0,1.0,51.0,2.0


In [91]:
df['total_sqft'] = df['total_sqft'].str.extract('([\d.]+)').astype(float)

#df['total_sqft_adjusted'] = df['total_sqft'].apply(lambda x: (float(x.split('-')[0]) + float(x.split('-')[1]))/2 if '-' in x else float(x))

In [99]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12791 entries, 0 to 13318
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   total_sqft    12791 non-null  float64
 1   bath          12791 non-null  float64
 2   balcony       12791 non-null  float64
 3   price         12791 non-null  float64
 4   num_bedrooms  12791 non-null  float64
dtypes: float64(5)
memory usage: 599.6 KB


In [101]:
# Split the data into independent and dependent features.

X = df[['total_sqft', 'balcony', 'num_bedrooms', 'bath']]
y = df['price']

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [105]:
scaler = StandardScaler()

In [106]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [152]:
# Train SVM regression model
svm_model = SVR(kernel='rbf')

In [153]:
svm_model.fit(X_train_scaled, y_train)

In [154]:
y_pred = svm_model.predict(X_test_scaled)

In [155]:
# evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [156]:
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 15930.219059604553
R-squared: 0.2906192991396195


In [54]:
# For predicting house prices using an SVM regression model, the following regression metrics are commonly used to evaluate the model's performance:

#Mean Squared Error (MSE):Measures the average squared difference between predicted and actual values. Lower MSE indicates better performance.

#R-squared (R2 Score): Represents the proportion of the variance in the dependent variable (house prices) that is predictable from the independent variables (features). Ranges from 0 to 1, where 1 indicates a perfect fit. Higher R2 score indicates better explanatory power.

#Mean Absolute Error (MAE): Measures the average absolute difference between predicted and actual values. Similar to MSE but gives equal weight to all errors.

#Root Mean Squared Error (RMSE): Represents the square root of the average squared difference between predicted and actual values. Provides a similar interpretation as MSE but in the same unit as the target variable.

# So, the choice of the best metric depends on the specific goals of your regression model. MSE and R2 score are commonly used, with MSE being sensitive to outliers and R2 providing an overall measure of model performance.

In [55]:
#Q2.

In [56]:
# Goal: Predict actual house prices accurately.

# Metric Choice: Mean Squared Error (MSE).

# Reasoning: As you can see above, MSE directly measures the average squared difference between predicted and actual values. Emphasizes accuracy by penalizing larger errors more.

#Interpretation: Lower MSE indicates better accuracy in predicting house prices.

#Comparison with R-squared: R-squared focuses on explained variance, not direct prediction accuracy.

#Consideration: MSE is suitable for precise prediction of numerical values like house prices.

In [127]:
#Q3.

In [128]:
#In a scenario with a significant number of outliers when using SVM regression:

#Mean Squared Error (MSE):
#Sensitive to outliers.
#Outliers can heavily influence the squared errors, leading to a larger impact on the metric.

#Mean Absolute Error (MAE):
#Less sensitive to outliers compared to MSE.
#Provides a more robust measure of the average prediction error.

#Huber Loss:
#A hybrid metric that combines MSE and MAE.
#Less sensitive to outliers due to a balance between squared and absolute errors.

#R-squared (Coefficient of Determination):
#Provides a measure of how well the model fits the data.
#Robust to outliers if they don't follow the overall trend.

#For outlier-robust performance evaluation, metrics like MAE or Huber Loss might be more suitable than MSE or R-squared.

In [129]:
#Q4.

In [130]:
#In a scenario where MSE and RMSE are very close for an SVM regression model with a polynomial kernel:

#Choose MSE:
#MSE is the square of RMSE and provides an unambiguous measure of prediction error.
#Easier to interpret and work with in subsequent analyses.

#Consider Interpretability:
#MSE is more straightforward in terms of interpretation.
#Both metrics convey similar information, but MSE is a direct representation of the average squared error.

#Consistency:
#For consistency and simplicity, prefer MSE over RMSE in reporting model performance.

#Choose MSE for its simplicity and direct representation of prediction errors.

In [131]:
#Q5.
#For measuring how well the model explains the variance in the target variable in SVM regression models with different kernels:

#Choose R-squared (Coefficient of Determination):
#Provides a measure of the proportion of variance in the dependent variable explained by the independent variables.
#R-squared ranges from 0 to 1, where 1 indicates a perfect fit.

#Interpretability:
#R-squared is intuitive and easy to interpret, representing the proportion of variance captured by the model.
#Higher R-squared values indicate better explanatory power.

#Model Comparison:
#Use R-squared for consistent comparison across different kernel models.

#Choose R-squared as it specifically measures the explained variance and is widely used for evaluating regression model performance.