In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import r2_score

In [2]:
# Load the raw listings from house_prices.csv; the cells below all work on `df`.
df=pd.read_csv('house_prices.csv')

In [6]:
# Preview the last five rows of the raw data.
df.tail()

Unnamed: 0,Index,Title,Description,Amount(in rupees),Price (in rupees),location,Carpet Area,Status,Floor,Transaction,...,facing,overlooking,Society,Bathroom,Balcony,Car Parking,Ownership,Super Area,Dimensions,Plot Area
187526,187526,3 BHK Ready to Occupy Flat for sale in Bollywo...,This magnificent 3 BHK Flat is available for s...,63 Lac,3225.0,zirakpur,,Ready to Move,2 out of 4,New Property,...,East,Garden/Park,Bollywood Esencia,3,3.0,1 Covered,Freehold,1953 sqft,,
187527,187527,3 BHK Ready to Occupy Flat for sale in Sushma ...,Have a look at this immaculate 3 BHK flat for ...,55 Lac,3274.0,zirakpur,,Ready to Move,4 out of 6,Resale,...,North - East,"Garden/Park, Main Road",Sushma Urban Views,3,,1 Covered,,1680 sqft,,
187528,187528,3 BHK Ready to Occupy Flat for sale in Bollywo...,"Gazipur, Zirakpur has an appealing 3 BHK flat ...",76 Lac,4343.0,zirakpur,1250 sqft,Ready to Move,1 out of 3,Resale,...,East,"Garden/Park, Main Road",Bollywood Esencia,3,2.0,"1 Covered,",Freehold,,,
187529,187529,2 BHK Ready to Occupy Flat for sale in Friends...,Up for immediate sale is a 2 BHK apartment in ...,30 Lac,4231.0,zirakpur,,Ready to Move,2 out of 2,Resale,...,,Main Road,Friends Enclave,2,,,,709 sqft,,
187530,187530,3 BHK Ready to Occupy Flat for sale in Affinit...,This exquisite 3 BHK Flat is offered for sale ...,1.18 Cr,6162.0,zirakpur,,Ready to Move,5 out of 13,Resale,...,North - East,"Garden/Park, Pool",Affinity Greens,4,4.0,1 Covered,Freehold,1915 sqft,,


In [4]:
# Column dtypes and non-null counts of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187531 entries, 0 to 187530
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Index              187531 non-null  int64  
 1   Title              187531 non-null  object 
 2   Description        184508 non-null  object 
 3   Amount(in rupees)  187531 non-null  object 
 4   Price (in rupees)  169866 non-null  float64
 5   location           187531 non-null  object 
 6   Carpet Area        106858 non-null  object 
 7   Status             186916 non-null  object 
 8   Floor              180454 non-null  object 
 9   Transaction        187448 non-null  object 
 10  Furnishing         184634 non-null  object 
 11  facing             117298 non-null  object 
 12  overlooking        106095 non-null  object 
 13  Society            77853 non-null   object 
 14  Bathroom           186703 non-null  object 
 15  Balcony            138596 non-null  object 
 16  Ca

In [5]:
# Absolute number of missing values per column.
df.isnull().sum()

Index                     0
Title                     0
Description            3023
Amount(in rupees)         0
Price (in rupees)     17665
location                  0
Carpet Area           80673
Status                  615
Floor                  7077
Transaction              83
Furnishing             2897
facing                70233
overlooking           81436
Society              109678
Bathroom                828
Balcony               48935
Car Parking          103357
Ownership             65517
Super Area           107685
Dimensions           187531
Plot Area            187531
dtype: int64

In [6]:
def null_avg(frame=None):
    """Print, for every column, the percentage of rows whose value is missing.

    Despite the name, the figures are percentages (0-100) of the row count,
    not averages.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is looked up
        at call time, so existing ``null_avg()`` calls behave as before.

    Returns
    -------
    None
        The resulting Series is printed, not returned.
    """
    target = df if frame is None else frame
    row_count = target.shape[0]
    null_percentages = (target.isnull().sum() / row_count) * 100
    print(null_percentages)
# Missing values per column as a percentage of all rows in the raw data.
null_avg()

Index                  0.000000
Title                  0.000000
Description            1.612000
Amount(in rupees)      0.000000
Price (in rupees)      9.419776
location               0.000000
Carpet Area           43.018488
Status                 0.327946
Floor                  3.773776
Transaction            0.044259
Furnishing             1.544811
facing                37.451408
overlooking           43.425354
Society               58.485264
Bathroom               0.441527
Balcony               26.094352
Car Parking           55.114621
Ownership             34.936624
Super Area            57.422506
Dimensions           100.000000
Plot Area            100.000000
dtype: float64


In [7]:
# Columns removed before modelling: the row identifier and free-text fields
# ('Index', 'Title', 'Description') plus the columns that are at least ~43%
# empty according to the null percentages printed above.
columns_to_discard = [
    'Society', 'Car Parking', 'Super Area', 'Dimensions', 'Plot Area', 'Index',
    'overlooking', 'Carpet Area', 'Title', 'Description',
]
df.drop(columns=columns_to_discard, inplace=True)

In [8]:
# Rows without a bathroom count are discarded (fewer than 0.5% of the rows).
df.dropna(subset=['Bathroom'], inplace=True)
assert df['Bathroom'].notna().all()

# Missing balcony counts are filled with 0. The result is assigned back to the
# column instead of calling fillna(..., inplace=True) on df['Balcony']: that
# chained in-place call acts on a temporary selection, is deprecated, and does
# not modify `df` once pandas Copy-on-Write is active.
df['Balcony'] = df['Balcony'].fillna(0)
df['Balcony'].isnull().sum()

0

In [9]:
# Re-check the missing-value percentages after dropping columns and handling Bathroom/Balcony.
null_avg()

Amount(in rupees)     0.000000
Price (in rupees)     9.421916
location              0.000000
Status                0.329400
Floor                 3.399517
Transaction           0.043920
Furnishing            1.138707
facing               37.194903
Bathroom              0.000000
Balcony               0.000000
Ownership            34.674322
dtype: float64


In [10]:
# Fill missing values in the remaining categorical columns ('facing', 'Ownership',
# 'Floor', 'Status', 'Transaction', 'Furnishing') with the placeholder 'Unknown'
# instead of dropping those rows.
cols_to_fill_unknown = ['facing', 'Ownership','Floor','Status','Transaction','Furnishing']
df[cols_to_fill_unknown] = df[cols_to_fill_unknown].fillna('Unknown')

# Convert 'Amount(in rupees)' to numerical format
# Unit suffixes and their value in rupees, checked in this order
# (1 Lac = 100000 rupees, 1 Cr = 10000000 rupees).
_AMOUNT_UNITS = (('Lac', 100000), ('Cr', 10000000))


def convert_amount(amount):
    """Convert a listing amount such as ``'42 Lac'`` or ``'1.5 Cr'`` to rupees.

    Parameters
    ----------
    amount : str, number or missing value
        Text with an optional ``Lac``/``Cr`` suffix, a plain numeric string,
        an already numeric value, or ``None``/NaN.

    Returns
    -------
    float or None
        The amount in rupees, or ``None`` when the value is missing or cannot
        be parsed as a number.
    """
    if isinstance(amount, str):
        text = amount
        multiplier = 1
        for suffix, rupees_per_unit in _AMOUNT_UNITS:
            if suffix in text:
                text = text.replace(suffix, '')
                multiplier = rupees_per_unit
                break
        try:
            return float(text.strip()) * multiplier
        except ValueError:
            return None

    # Non-string input: `'Lac' in amount` would raise TypeError here, so
    # numbers are passed through and missing values are reported as None.
    try:
        number = float(amount)
    except (TypeError, ValueError):
        return None
    # NaN is the only float that is not equal to itself.
    return None if number != number else number

# Replace the textual amounts by their value in rupees; convert_amount
# returns None for entries it cannot parse.
amount_in_rupees = df['Amount(in rupees)'].map(convert_amount)
df['Amount(in rupees)'] = amount_in_rupees

In [11]:
# Preview the first five rows after the amount conversion.
df.head()

Unnamed: 0,Amount(in rupees),Price (in rupees),location,Status,Floor,Transaction,Furnishing,facing,Bathroom,Balcony,Ownership
0,4200000.0,6000.0,thane,Ready to Move,10 out of 11,Resale,Unfurnished,Unknown,1,2,Unknown
1,9800000.0,13799.0,thane,Ready to Move,3 out of 22,Resale,Semi-Furnished,East,2,0,Freehold
2,14000000.0,17500.0,thane,Ready to Move,10 out of 29,Resale,Unfurnished,East,2,0,Freehold
3,2500000.0,,thane,Ready to Move,1 out of 3,Resale,Unfurnished,Unknown,1,1,Unknown
4,16000000.0,18824.0,thane,Ready to Move,20 out of 42,Resale,Unfurnished,West,2,0,Co-operative Society


In [12]:
# Keep only the rows that have a value in 'Price (in rupees)'.
has_price = df['Price (in rupees)'].notna()
df = df[has_price]

In [13]:
# Confirm that no column has missing values left.
null_avg()

Amount(in rupees)    0.0
Price (in rupees)    0.0
location             0.0
Status               0.0
Floor                0.0
Transaction          0.0
Furnishing           0.0
facing               0.0
Bathroom             0.0
Balcony              0.0
Ownership            0.0
dtype: float64


In [14]:
# Check the dtypes and row count of the cleaned frame before converting the remaining object columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 169112 entries, 0 to 187530
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Amount(in rupees)  169112 non-null  float64
 1   Price (in rupees)  169112 non-null  float64
 2   location           169112 non-null  object 
 3   Status             169112 non-null  object 
 4   Floor              169112 non-null  object 
 5   Transaction        169112 non-null  object 
 6   Furnishing         169112 non-null  object 
 7   facing             169112 non-null  object 
 8   Bathroom           169112 non-null  object 
 9   Balcony            169112 non-null  object 
 10  Ownership          169112 non-null  object 
dtypes: float64(2), object(9)
memory usage: 15.5+ MB


In [15]:
def convertir_valor(valor):
    """Return ``valor`` as an ``int``, ignoring any ``'>'`` marker in strings.

    Strings have every ``'>'`` character removed before conversion (so
    ``'> 10'`` becomes ``10``); every other value is passed to ``int()``
    unchanged. Values ``int()`` cannot convert raise its usual exception.
    """
    if isinstance(valor, str):
        # Removing '>' from a string that has none leaves it untouched.
        valor = valor.replace('>', '')
    return int(valor)

# Apply the conversion to the 'Bathroom' and 'Balcony' count columns.
for count_column in ('Bathroom', 'Balcony'):
    df[count_column] = df[count_column].apply(convertir_valor)

In [16]:
# Make sure the two count columns are integer typed so that they are not
# picked up as categorical (object) columns below.
df = df.astype({'Bathroom': int, 'Balcony': int})

cat_val = df.select_dtypes(include=['object']).columns

# Fit one encoder per column and keep it: re-fitting a single shared encoder
# overwrites its classes, so only the last column could be decoded afterwards.
# NOTE(review): LabelEncoder assigns codes by sorted label order, which carries
# no meaning for features such as 'Floor' — confirm this is acceptable.
label_encoders = {}
for col in cat_val:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Fixed seed so the displayed sample is the same on every run.
df.sample(5, random_state=42)

Unnamed: 0,Amount(in rupees),Price (in rupees),location,Status,Floor,Transaction,Furnishing,facing,Bathroom,Balcony,Ownership
126206,43500000.0,21481.0,50,0,346,0,1,0,4,2,1
165172,3800000.0,4000.0,22,0,350,3,1,7,2,2,4
145793,4560000.0,4829.0,57,0,134,0,2,0,2,2,0
49189,5200000.0,3714.0,13,0,135,3,3,7,3,0,4
179455,13400000.0,4621.0,66,0,821,0,2,0,5,5,1


In [17]:
# Fixed seed so that the split, the forest and therefore the reported score
# are reproducible across runs.
SEED = 42

# NOTE(review): 'Price (in rupees)' appears to be the sale amount divided by the
# property's area (e.g. 63 Lac / 1953 sqft is about 3225 in the raw data), i.e.
# it is derived from the target. Keeping it as a feature may inflate the score —
# confirm whether it should be dropped.
X = df.drop(columns='Amount(in rupees)')
y = df['Amount(in rupees)']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

model = RandomForestRegressor(n_estimators=100, random_state=SEED)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Coefficient of determination on the held-out 30% of the rows.
r2 = r2_score(y_test, y_pred)

print("R-squared (R2):", r2)

R-squared (R2): 0.7804901762341945
