In [3]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score



In [8]:
df=pd.read_csv(r"C:\Users\BIG JA\desktop\houseprice\Bengaluru_House_Data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [5]:
df.head(15)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0
5,Super built-up Area,Ready To Move,Whitefield,2 BHK,DuenaTa,1170,2.0,1.0,38.0
6,Super built-up Area,18-May,Old Airport Road,4 BHK,Jaades,2732,4.0,,204.0
7,Super built-up Area,Ready To Move,Rajaji Nagar,4 BHK,Brway G,3300,4.0,,600.0
8,Super built-up Area,Ready To Move,Marathahalli,3 BHK,,1310,3.0,1.0,63.25
9,Plot Area,Ready To Move,Gandhi Bazar,6 Bedroom,,1020,6.0,,370.0


In [7]:
print(df.isnull().sum())

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64


In [9]:
# Fill missing values
df['location'] = df['location'].fillna('bura')
df['size'] = df['size'].fillna(df['size'].mode()[0])  # Correct use of .mode()
df['society'] = df['society'].fillna('Unknown')
df['bath'] = df['bath'].fillna(df['bath'].median())
df['balcony']=df['balcony'].fillna(df['balcony'].mean())

print(df.isnull().sum())


area_type       0
availability    0
location        0
size            0
society         0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64


In [10]:
le = LabelEncoder()
df['area_type'] = le.fit_transform(df['area_type'])



# Fit and transform the 'availability' column
df['availability'] = le.fit_transform(df['availability'])

# Inspect the result
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,3,40,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,2,80,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,0,80,Uttarahalli,3 BHK,Unknown,1440,2.0,3.0,62.0
3,3,80,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,3,80,Kothanur,2 BHK,Unknown,1200,2.0,1.0,51.0


In [13]:

# Step 1: Create the oheHotEncoder instance
ohe = OneHotEncoder(sparse_output=False, drop=None)  # sparse=False returns a dense array

# Step 2: Fit and transform the selected columns
encoded_array = ohe.fit_transform(df[['location', 'size', 'society']])

# Step 3: Get feature names for the ohe-hot encoded columns
encoded_feature_names = ohe.get_feature_names_out(['location', 'size', 'society'])

# Step 4: Create a new DataFrame with the encoded data
df_encoded = pd.DataFrame(encoded_array, columns=encoded_feature_names)

# Optional: Concatenate with the original DataFrame
df_final = pd.concat([df, df_encoded], axis=1)

# Display the result

# Step 1: Drop the original categorical columns
df_dropped = df.drop(['location', 'size', 'society'], axis=1)

# Step 2: Concatenate the original DataFrame (without the dropped columns) with the encoded DataFrame
df_final = pd.concat([df_dropped, df_encoded], axis=1)

# Display the result
print(df_final)




       area_type  availability total_sqft  bath   balcony   price  \
0              3            40       1056   2.0  1.000000   39.07   
1              2            80       2600   5.0  3.000000  120.00   
2              0            80       1440   2.0  3.000000   62.00   
3              3            80       1521   3.0  1.000000   95.00   
4              3            80       1200   2.0  1.000000   51.00   
...          ...           ...        ...   ...       ...     ...   
13315          0            80       3453   4.0  0.000000  231.00   
13316          3            80       3600   5.0  1.584376  400.00   
13317          0            80       1141   2.0  1.000000   60.00   
13318          3            32       4689   4.0  1.000000  488.00   
13319          3            80        550   1.0  1.000000   17.00   

       location_ Anekal  location_ Banaswadi  location_ Basavangudi  \
0                   0.0                  0.0                    0.0   
1                   0.0      

In [14]:
df_final.head()

Unnamed: 0,area_type,availability,total_sqft,bath,balcony,price,location_ Anekal,location_ Banaswadi,location_ Basavangudi,location_ Bhoganhalli,...,society_Xeitaa,society_YCnce R,society_YMhenLi,society_Yaenti,society_ZeodsWo,society_Zonce E,society_Zostaa,society_i1ncyRe,society_i1odsne,society_i1rtsCo
0,3,40,1056,2.0,1.0,39.07,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,80,2600,5.0,3.0,120.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,80,1440,2.0,3.0,62.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,80,1521,3.0,1.0,95.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3,80,1200,2.0,1.0,51.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
print(df_final.columns)


Index(['area_type', 'availability', 'total_sqft', 'bath', 'balcony', 'price',
       'location_ Anekal', 'location_ Banaswadi', 'location_ Basavangudi',
       'location_ Bhoganhalli',
       ...
       'society_Xeitaa ', 'society_YCnce R', 'society_YMhenLi',
       'society_Yaenti ', 'society_ZeodsWo', 'society_Zonce E',
       'society_Zostaa ', 'society_i1ncyRe', 'society_i1odsne',
       'society_i1rtsCo'],
      dtype='object', length=4032)


In [16]:
def convert_range(value):
    try:
        if '-' in value:  # Handle ranges like "1200-1500"
            values = list(map(float, value.split('-')))
            return sum(values) / len(values)  # Return average of the range
        # Handle single numerical values with extra formatting
        return float(value.replace('sqft', '').strip())  
    except ValueError:  # Handle non-convertible cases
        try:
            # Attempt another pass for numeric cleaning (e.g., removing stray characters)
            return float(''.join(filter(str.isdigit, value)))
        except:
            return float(value) # Return the original value if it can't be converted

# Apply the function to the 'total_sqft' column
df_final['total_sqft'] = df['total_sqft'].apply(convert_range)

# Handle missing or invalid values (if any)
df_final['total_sqft'] = pd.to_numeric(df_final['total_sqft'], errors='coerce')  # Convert non-numeric to NaN
df_final['total_sqft'].fillna(df_final['total_sqft'].mean(), inplace=True)  # Fill NaN with mean

# Print the final DataFrame
print(df_final)


       area_type  availability  total_sqft  bath   balcony   price  \
0              3            40      1056.0   2.0  1.000000   39.07   
1              2            80      2600.0   5.0  3.000000  120.00   
2              0            80      1440.0   2.0  3.000000   62.00   
3              3            80      1521.0   3.0  1.000000   95.00   
4              3            80      1200.0   2.0  1.000000   51.00   
...          ...           ...         ...   ...       ...     ...   
13315          0            80      3453.0   4.0  0.000000  231.00   
13316          3            80      3600.0   5.0  1.584376  400.00   
13317          0            80      1141.0   2.0  1.000000   60.00   
13318          3            32      4689.0   4.0  1.000000  488.00   
13319          3            80       550.0   1.0  1.000000   17.00   

       location_ Anekal  location_ Banaswadi  location_ Basavangudi  \
0                   0.0                  0.0                    0.0   
1                

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_final['total_sqft'].fillna(df_final['total_sqft'].mean(), inplace=True)  # Fill NaN with mean


In [17]:
X = df_final.drop(columns=['price'])  # Features
y = df_final['price']  

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
print(X_train.dtypes)


area_type            int64
availability         int64
total_sqft         float64
bath               float64
balcony            float64
                    ...   
society_Zonce E    float64
society_Zostaa     float64
society_i1ncyRe    float64
society_i1odsne    float64
society_i1rtsCo    float64
Length: 4031, dtype: object


In [20]:
df_final['total_sqft'] = pd.to_numeric(df['total_sqft'], errors='coerce')


In [21]:
df_final.head()

Unnamed: 0,area_type,availability,total_sqft,bath,balcony,price,location_ Anekal,location_ Banaswadi,location_ Basavangudi,location_ Bhoganhalli,...,society_Xeitaa,society_YCnce R,society_YMhenLi,society_Yaenti,society_ZeodsWo,society_Zonce E,society_Zostaa,society_i1ncyRe,society_i1odsne,society_i1rtsCo
0,3,40,1056.0,2.0,1.0,39.07,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,80,2600.0,5.0,3.0,120.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,80,1440.0,2.0,3.0,62.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,80,1521.0,3.0,1.0,95.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3,80,1200.0,2.0,1.0,51.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.dtypes

area_type            int64
availability         int64
total_sqft         float64
bath               float64
balcony            float64
                    ...   
society_Zonce E    float64
society_Zostaa     float64
society_i1ncyRe    float64
society_i1odsne    float64
society_i1rtsCo    float64
Length: 4031, dtype: object

In [23]:
df_final = df_final.drop(columns=['total_sqft'])


In [24]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_final)

# Convert back to DataFrame for better readability
scaled_df = pd.DataFrame(scaled_data, columns=df_final.columns)
print(scaled_df)

       area_type  availability      bath       balcony     price  \
0       0.620461     -1.678930 -0.514538 -7.319972e-01 -0.493372   
1      -0.253816      0.481775  1.726436  1.773231e+00  0.049906   
2      -2.002369      0.481775 -0.514538  1.773231e+00 -0.339444   
3       0.620461      0.481775  0.232453 -7.319972e-01 -0.117917   
4       0.620461      0.481775 -0.514538 -7.319972e-01 -0.413286   
...          ...           ...       ...           ...       ...   
13315  -2.002369      0.481775  0.979445 -1.984611e+00  0.795043   
13316   0.620461      0.481775  1.726436 -2.781362e-16  1.929529   
13317  -2.002369      0.481775 -0.514538 -7.319972e-01 -0.352870   
13318   0.620461     -2.111071  0.979445 -7.319972e-01  2.520267   
13319   0.620461      0.481775 -1.261529 -7.319972e-01 -0.641526   

       location_ Anekal  location_ Banaswadi  location_ Basavangudi  \
0             -0.008665            -0.008665              -0.008665   
1             -0.008665            -0.008

In [25]:
print(df_final.isnull().sum())  # Check for missing values
df_final = df_final.fillna(df_final.mean())  # Example: filling missing values



area_type          0
availability       0
bath               0
balcony            0
price              0
                  ..
society_Zonce E    0
society_Zostaa     0
society_i1ncyRe    0
society_i1odsne    0
society_i1rtsCo    0
Length: 4031, dtype: int64


In [26]:
scaler = StandardScaler()

# Fit and transform the entire dataset (assuming it is numeric now)
scaled_data = scaler.fit_transform(df_final)

In [27]:
scaled_df = pd.DataFrame(scaled_data, columns=df_final.columns)

# Display the scaled DataFrame
print(scaled_df)

       area_type  availability      bath       balcony     price  \
0       0.620461     -1.678930 -0.514538 -7.319972e-01 -0.493372   
1      -0.253816      0.481775  1.726436  1.773231e+00  0.049906   
2      -2.002369      0.481775 -0.514538  1.773231e+00 -0.339444   
3       0.620461      0.481775  0.232453 -7.319972e-01 -0.117917   
4       0.620461      0.481775 -0.514538 -7.319972e-01 -0.413286   
...          ...           ...       ...           ...       ...   
13315  -2.002369      0.481775  0.979445 -1.984611e+00  0.795043   
13316   0.620461      0.481775  1.726436 -2.781362e-16  1.929529   
13317  -2.002369      0.481775 -0.514538 -7.319972e-01 -0.352870   
13318   0.620461     -2.111071  0.979445 -7.319972e-01  2.520267   
13319   0.620461      0.481775 -1.261529 -7.319972e-01 -0.641526   

       location_ Anekal  location_ Banaswadi  location_ Basavangudi  \
0             -0.008665            -0.008665              -0.008665   
1             -0.008665            -0.008

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and apply StandardScaler
scaler = StandardScaler()

# Fit and transform the training data, and transform the test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and fit the Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)


In [33]:
y_pred = model.predict(X_test_scaled)

# Evaluate the model using various metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Display the results
print("Mean Squared Error:", mse)
print("R² Score:", r2)

# Display the predictions vs actual values
print("\nPredictions vs Actual Values:")
predictions_df = pd.DataFrame({'Predicted': y_pred, 'Actual': y_test})
print(predictions_df)

Mean Squared Error: 9622.740659432804
R² Score: 0.5480273626631164

Predictions vs Actual Values:
        Predicted  Actual
8077    15.292423    64.8
1602    72.526290   125.0
10498   78.180679    60.0
3297  -110.688919   110.0
8893   229.311514   210.0
...           ...     ...
1082    11.069007    14.0
1671   224.471893    44.5
4325    95.253246    65.0
7375    57.788018    52.0
5152   144.300752   130.0

[2664 rows x 2 columns]
