In [1]:
import pandas as pd
import numpy as np

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [3]:
dataset=pd.read_csv("Bengaluru_House_Data.csv")

In [4]:
dataset.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [5]:
dataset.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [6]:
dataset["price"].value_counts()
dataset["size"].value_counts()

size
2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
11 Bedroom       2
10 BHK           2
14 BHK           1
13 BHK           1
12 Bedroom       1
27 BHK           1
43 Bedroom       1
16 BHK           1
19 BHK           1
18 Bedroom       1
Name: count, dtype: int64

In [7]:
# Count the missing (NaN) values in each column.
dataset.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [8]:
# Columns that are not used as features further down. Filtering the list
# against the columns actually present keeps this cell safe to re-run after
# they have already been removed.
columns_to_drop = ['area_type', 'availability', 'society', 'balcony']
existing_columns = [name for name in columns_to_drop if name in dataset.columns]

# `columns=` already targets the column axis, so no explicit axis is needed.
dataset = dataset.drop(columns=existing_columns)


In [9]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [10]:
dataset.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [11]:
# Impute missing values.
# Categorical columns take their most frequent value, computed from the data
# instead of being typed by hand: a hard-coded "Sarjapur Road" (one space) does
# not match the dataset's own spelling "Sarjapur  Road" (two spaces), so the
# filled-in row would end up as a separate one-off location.
dataset["location"] = dataset["location"].fillna(dataset["location"].mode()[0])
dataset["size"] = dataset["size"].fillna(dataset["size"].mode()[0])
# bath is numeric; the median is used because it is not pulled around by
# extreme values (describe() above shows a maximum of 40).
dataset["bath"] = dataset["bath"].fillna(dataset["bath"].median())

In [12]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [13]:
# "size" comes in two spellings, e.g. "2 BHK" and "4 Bedroom". In both, the
# first whitespace-separated token is the room count, so split the string,
# take element 0 and cast it to int.
dataset["bhk"]=dataset["size"].str.split().str.get(0).astype(int)

In [14]:
dataset[dataset.bhk>20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [15]:
dataset["total_sqft"].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [16]:
def convertRange(x):
    """Convert a raw ``total_sqft`` entry to a single float.

    A string made of two numbers separated by a hyphen (for example
    ``"1133 - 1384"``) is replaced by the midpoint of the two numbers: the
    split yields two pieces, which are averaged. Any other value is passed
    straight to ``float``.

    Parameters
    ----------
    x : str or number
        Raw cell value.

    Returns
    -------
    float or None
        The parsed area, or ``None`` when the value cannot be interpreted as
        a number or as a numeric range.
    """
    try:
        if isinstance(x, str):
            parts = x.split("-")
            if len(parts) == 2:
                # float() ignores the spaces that surround each bound.
                return (float(parts[0]) + float(parts[1])) / 2
        return float(x)
    except (ValueError, TypeError):
        # ValueError: text that is not a number (including a malformed range
        # bound, which used to escape the handler and abort the whole apply).
        # TypeError: objects float() cannot take at all, such as None.
        return None

In [17]:
# Series.apply calls convertRange once per element and collects the returned
# values into the new column.
dataset["total_sqft"]=dataset["total_sqft"].apply(convertRange)

In [18]:
# Helper column: price per square foot, used below for outlier removal.
# NOTE(review): the factor 100000 presumably converts a price quoted in lakhs
# to rupees - confirm against the dataset description.
dataset["price_per_sqfit"]=dataset["price"]*100000/dataset["total_sqft"]

In [19]:
dataset=dataset.drop(columns=["size"],axis=1)

In [20]:
dataset.head()

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqfit
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,51.0,2,4250.0


In [21]:
# Remove leading/trailing whitespace first, so that labels differing only in
# padding are counted as the same location. The vectorised .str accessor is
# used instead of a per-row lambda; it also leaves non-string values alone
# rather than raising AttributeError.
dataset["location"] = dataset["location"].str.strip()

# Number of listings per location.
location_count = dataset["location"].value_counts()

In [22]:
location_count_less_10=location_count[location_count<=10]
print(location_count_less_10)

location
BTM 1st Stage                         10
Nagadevanahalli                       10
Basapura                              10
Sector 1 HSR Layout                   10
Dairy Circle                          10
                                      ..
1Channasandra                          1
Hosahalli                              1
Vijayabank bank layout                 1
near Ramanashree California resort     1
Abshot Layout                          1
Name: count, Length: 1054, dtype: int64


In [23]:
# Replace every location that appears in the index of location_count_less_10
# with the single label "other"; all remaining locations are kept as they are.
dataset["location"] = dataset["location"].mask(
    dataset["location"].isin(location_count_less_10.index), "other"
)

In [24]:
dataset["location"].value_counts()

location
other                 2886
Whitefield             541
Sarjapur  Road         399
Electronic City        304
Kanakpura Road         273
                      ... 
Nehru Nagar             11
Banjara Layout          11
LB Shastri Nagar        11
Pattandur Agrahara      11
Narayanapura            11
Name: count, Length: 242, dtype: int64

Outlier detection and removal

In [25]:
# Keep only the rows with more than 300 sq ft per bedroom (total_sqft / bhk);
# anything at or below that is discarded.
# NOTE(review): rows whose total_sqft could not be parsed (convertRange
# returned None) presumably hold NaN here, fail the comparison and are dropped
# too - confirm if those rows should be handled separately.
dataset=dataset[((dataset["total_sqft"]/dataset["bhk"])>300)]
dataset.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqfit
count,12348.0,12348.0,12348.0,12348.0,12348.0
mean,1601.084689,2.542274,110.823169,2.633301,6207.869136
std,1268.788019,1.065177,152.794399,0.964402,4047.666274
min,340.0,1.0,8.44,1.0,267.829813
25%,1115.0,2.0,49.0,2.0,4200.0
50%,1305.0,2.0,69.475,3.0,5264.632169
75%,1708.0,3.0,115.0,3.0,6825.108836
max,52272.0,16.0,3600.0,16.0,176470.588235


In [26]:
# The maximum is far above the 75th percentile, which points to outliers in
# price_per_sqfit.
dataset["price_per_sqfit"].describe()

count     12348.000000
mean       6207.869136
std        4047.666274
min         267.829813
25%        4200.000000
50%        5264.632169
75%        6825.108836
max      176470.588235
Name: price_per_sqfit, dtype: float64

In [27]:
def remove_pps_outliers(df):
    """Drop rows whose price per square foot is atypical for their location.

    For every ``location`` group the mean ``m`` and the population standard
    deviation ``st`` (``ddof=0``) of ``price_per_sqfit`` are computed, and only
    rows with ``m - st < price_per_sqfit <= m + st`` are kept. The lower bound
    is strict, so a group whose values are all identical (``st == 0``) is
    removed entirely.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location`` and ``price_per_sqfit``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, ordered by location group, with a fresh
        0..n-1 index. When no rows survive because there are no groups at
        all, an empty frame with the input's columns is returned.
    """
    kept_groups = []

    for _, subdf in df.groupby('location'):
        pps = subdf["price_per_sqfit"]
        m = pps.mean()
        # ddof=0 matches what np.std returns for a Series.
        st = pps.std(ddof=0)
        kept_groups.append(subdf[(pps > (m - st)) & (pps <= (m + st))])

    if not kept_groups:
        # pd.concat refuses an empty list; keep the schema for the caller.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of re-copying the accumulated frame on every
    # loop iteration.
    return pd.concat(kept_groups, ignore_index=True)

# Apply the per-location price-per-sqft filter and replace the working dataset.
dataset = remove_pps_outliers(dataset)

# Only the last expression of a cell is displayed, so the shape is printed
# explicitly instead of being evaluated and thrown away.
print(dataset.shape)

# Summary statistics of the filtered data (the cell's displayed output).
dataset.describe()


Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqfit
count,10079.0,10079.0,10079.0,10079.0,10079.0
mean,1509.254783,2.461653,90.972439,2.562258,5626.018262
std,881.846858,0.965002,86.431935,0.879,2208.837162
min,350.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4248.717576
50%,1290.0,2.0,67.0,2.0,5166.666667
75%,1650.0,3.0,100.0,3.0,6393.652258
max,30400.0,16.0,2200.0,16.0,24509.803922


In [28]:
def remove_bhk_outliers(df):
    """Drop listings that are cheaper per sq ft than smaller homes nearby.

    Within each ``location``, a row with ``bhk == k`` is removed when its
    ``price_per_sqfit`` is below the mean ``price_per_sqfit`` of the rows with
    ``bhk == k - 1`` in the same location. The comparison is made only when
    that ``k - 1`` group has more than 5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location``, ``bhk`` and
        ``price_per_sqfit``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the flagged rows; the input is not modified and
        the original index labels are kept.
    """
    # Plain list of index labels: cheap to extend and keeps the label dtype
    # (np.append on an empty float array would coerce integer labels to float
    # and re-copy the array on every call).
    exclude_labels = []

    for _, location_df in df.groupby('location'):
        # price_per_sqfit values of this location, keyed by bedroom count.
        pps_by_bhk = {
            bhk: bhk_df["price_per_sqfit"]
            for bhk, bhk_df in location_df.groupby("bhk")
        }

        for bhk, pps in pps_by_bhk.items():
            smaller = pps_by_bhk.get(bhk - 1)
            # Skip when there is no (bhk - 1) group or it is too small for
            # its mean to be a meaningful reference.
            if smaller is not None and len(smaller) > 5:
                exclude_labels.extend(pps[pps < smaller.mean()].index)

    return df.drop(index=exclude_labels)

# Note: Make sure to apply this function to your dataset to remove BHK outliers


In [29]:
dataset=remove_bhk_outliers(dataset)

In [30]:
dataset.shape

(7206, 6)

In [37]:
# price_per_sqfit was only a helper for the outlier filters above (rows with
# unusually extreme values were removed), so it is dropped before modelling.
dataset=dataset.drop(columns=["price_per_sqfit"],axis=1)

In [38]:
dataset.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [41]:
# Save the cleaned data to a new CSV file. index=False keeps the row labels
# out of the file, so reading it back does not create an extra "Unnamed: 0"
# column.
dataset.to_csv("cleaned.csv", index=False)

In [42]:
X=dataset.drop(columns=["price"],axis=1)

In [43]:
y=dataset["price"]

In [44]:
y

0        428.0
1        194.0
2        235.0
3        130.0
4        148.0
         ...  
10070     70.0
10071    200.0
10074    110.0
10075     26.0
10078    400.0
Name: price, Length: 7206, dtype: float64

In [45]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

In [46]:
X_train.describe()

Unnamed: 0,total_sqft,bath,bhk
count,5764.0,5764.0,5764.0
mean,1502.879177,2.440666,2.487335
std,890.127495,0.989158,0.893037
min,350.0,1.0,1.0
25%,1100.0,2.0,2.0
50%,1260.0,2.0,2.0
75%,1685.0,3.0,3.0
max,30000.0,13.0,13.0


In [47]:
X_train.shape

(5764, 4)

In [48]:
X_test.shape

(1442, 4)

In [52]:
#apply linear regression

In [54]:
# One-hot encode the categorical "location" column; the remaining (numeric)
# columns are passed through unchanged.
# - sparse_output=False replaces the `sparse` keyword, which was deprecated in
#   scikit-learn 1.2 and removed in 1.4 (this spelling needs scikit-learn >= 1.2).
# - handle_unknown="ignore" encodes a location that was not seen during fit as
#   all zeros instead of raising an error at predict time.
column_tran = make_column_transformer(
    (OneHotEncoder(sparse_output=False, handle_unknown="ignore"), ["location"]),
    remainder="passthrough",
)

In [55]:
scalar=StandardScaler()

In [57]:
lr=LinearRegression()

In [59]:
pipe=make_pipeline(column_tran,scalar,lr)

In [62]:
# fit() runs the pipeline steps in order: the column transformer first, then
# the standard scaler, and finally the linear regression.
pipe.fit(X_train,y_train)



In [65]:
y_pred=pipe.predict(X_test)
r2_score(y_test,y_pred)

0.8662537740529166

In [83]:


# Assuming input_data is a tuple or list with the features in the correct order
input_data = ("1st Phase JP Nagar",1394,2,2)

# Convert the input data to a DataFrame
df_input = pd.DataFrame([input_data], columns=["location", "total_sqft", "bath", "bhk"])

# Make the prediction using the pipeline
ans = pipe.predict(df_input)


In [84]:
ans

array([122.09453775])

LASSO

In [86]:
lasso=Lasso()

In [90]:
pipe1=make_pipeline(column_tran,scalar,lasso)

In [91]:
pipe1.fit(X_train,y_train)



In [93]:
y_pred_lasso=pipe1.predict(X_test)

In [94]:
r2_score(y_test,y_pred_lasso)

0.8485482761884338

In [114]:
# Assuming input_data is a tuple or list with the features in the correct order
input_data = ("2nd Phase Judicial Layout",1450.0,2.0,3)

# Convert the input data to a DataFrame
df_input = pd.DataFrame([input_data], columns=["location", "total_sqft", "bath", "bhk"])

# Make the prediction using the pipeline
ans2 = pipe1.predict(df_input)

In [115]:
ans2

array([84.31113045])

Ridge

In [95]:
ridge=Ridge()

In [100]:
pipe2=make_pipeline(column_tran,scalar,ridge)

In [101]:
pipe2.fit(X_train,y_train)



In [102]:
y_pred_ridge=pipe2.predict(X_test)

In [103]:
r2_score(y_test,y_pred_ridge)

0.8662194182587367

In [111]:
# Assuming input_data is a tuple or list with the features in the correct order
input_data = ("2nd Phase Judicial Layout",1450.0,2.0,3)

# Convert the input data to a DataFrame
df_input = pd.DataFrame([input_data], columns=["location", "total_sqft", "bath", "bhk"])

# Make the prediction using the pipeline
ans1 = pipe2.predict(df_input)

In [112]:
ans1

array([60.19611101])

In [113]:
print(ans1)

[60.19611101]


In [106]:
import pickle

In [109]:
# Persist the fitted Ridge pipeline; its R^2 on the test split is on par with
# plain linear regression. The context manager makes sure the file is flushed
# and closed, which a bare open() passed to pickle.dump does not guarantee.
with open("RidgeModel.pkl", "wb") as model_file:
    pickle.dump(pipe2, model_file)