# **Loading The Dataset**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('/content/Bengaluru_House_Data.csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


# **Information About The Dataset**

In [None]:
df.shape

(13320, 9)

In [None]:
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [None]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [None]:
print(df.isnull().sum())


area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64


In [None]:
print(df.describe())


               bath       balcony         price
count  13247.000000  12711.000000  13320.000000
mean       2.692610      1.584376    112.565627
std        1.341458      0.817263    148.971674
min        1.000000      0.000000      8.000000
25%        2.000000      1.000000     50.000000
50%        2.000000      2.000000     72.000000
75%        3.000000      2.000000    120.000000
max       40.000000      3.000000   3600.000000


In [None]:
df.groupby("area_type")["area_type"].agg("count")


area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

# **Data Cleaning & Preprocessing**

In [None]:
df = df.drop(["area_type","availability"], axis = "columns")

In [None]:
df.shape

(13320, 7)

In [None]:
df = df.dropna()

In [None]:
df.shape

(7496, 7)

In [None]:
df.isnull().sum()

location      0
size          0
society       0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [None]:
# The leading token of `size` (e.g. "2 BHK", "4 Bedroom") is the room count.
# `assign` returns a new frame rather than writing a column into `df` in place,
# which avoids the SettingWithCopyWarning that `df['BHK'] = ...` raised here.
df = df.assign(BHK=df["size"].str.split(" ").str[0].astype("int64"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['BHK'] = df["size"].apply(lambda x: int(x.split(" ")[0]))


In [None]:
df = df.drop('size',axis='columns')
df.head()

Unnamed: 0,location,society,total_sqft,bath,balcony,price,BHK
0,Electronic City Phase II,Coomee,1056,2.0,1.0,39.07,2
1,Chikka Tirupathi,Theanmp,2600,5.0,3.0,120.0,4
3,Lingadheeranahalli,Soiewre,1521,3.0,1.0,95.0,3
5,Whitefield,DuenaTa,1170,2.0,1.0,38.0,2
11,Whitefield,Prrry M,2785,5.0,3.0,295.0,4


In [None]:
def is_float(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    Used to find ``total_sqft`` entries that are not plain numbers
    (for example ranges such as "2100 - 2850").

    Only the exceptions ``float()`` raises for bad input are caught, so
    unrelated signals such as ``KeyboardInterrupt`` still propagate.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
df[~df["total_sqft"].apply(is_float)].head(10)

Unnamed: 0,location,society,total_sqft,bath,balcony,price,BHK
30,Yelahanka,LedorSa,2100 - 2850,4.0,0.0,186.0,4
122,Hebbal,SNontle,3067 - 8156,4.0,0.0,477.0,4
137,8th Phase JP Nagar,Vaarech,1042 - 1105,2.0,0.0,54.005,2
165,Sarjapur,Kinuerg,1145 - 1340,2.0,0.0,43.49,2
188,KR Puram,MCvarar,1015 - 1540,2.0,0.0,56.8,2
549,Hennur Road,Shxorm,1195 - 1440,2.0,0.0,63.77,2
661,Yelahanka,Rarthne,1120 - 1145,2.0,0.0,48.13,2
672,Bettahalsoor,Toainnt,3090 - 5002,4.0,0.0,445.0,4
772,Banashankari Stage VI,Brens7,1160 - 1195,2.0,0.0,59.935,2
850,Bannerghatta Road,PrarePa,1115 - 1130,2.0,0.0,58.935,2


In [None]:
def convert_sqft_to_number(x):
    """Convert a ``total_sqft`` entry to a float.

    * A range written as "low - high" becomes the midpoint of the two numbers.
    * Anything else that ``float()`` accepts is converted directly.
    * Everything else (e.g. "34.46Sq. Meter", ``None``) yields ``None``.

    Parameters
    ----------
    x : str or number
        Raw cell value.

    Returns
    -------
    float or None
    """
    if isinstance(x, str):
        tokens = x.split("-")
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                # Two tokens but not two numbers (e.g. "-5" or "abc - def"):
                # fall through and let the plain conversion decide.
                pass
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None

df = df.copy()
df["total_sqft"] = df["total_sqft"].apply(convert_sqft_to_number)
df.head(10)


Unnamed: 0,location,society,total_sqft,bath,balcony,price,BHK
0,Electronic City Phase II,Coomee,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,Theanmp,2600.0,5.0,3.0,120.0,4
3,Lingadheeranahalli,Soiewre,1521.0,3.0,1.0,95.0,3
5,Whitefield,DuenaTa,1170.0,2.0,1.0,38.0,2
11,Whitefield,Prrry M,2785.0,5.0,3.0,295.0,4
12,7th Phase JP Nagar,Shncyes,1000.0,2.0,1.0,38.0,2
14,Sarjapur,Skityer,2250.0,3.0,2.0,148.0,3
15,Mysore Road,PrntaEn,1175.0,2.0,2.0,73.5,2
16,Bisuvanahalli,Prityel,1180.0,3.0,2.0,48.0,3
17,Raja Rajeshwari Nagar,GrrvaGr,1540.0,3.0,3.0,60.0,3


In [None]:
df.location = df.location.apply(lambda x: x.strip())
location_stats = df['location'].value_counts(ascending=False)
location_stats

Whitefield         398
Sarjapur  Road     310
Electronic City    238
Kanakpura Road     216
Thanisandra        215
                  ... 
HAL 2nd Stage        1
Adarsh Nagar         1
S R Layout           1
Sadduguntepalya      1
Abshot Layout        1
Name: location, Length: 626, dtype: int64

In [None]:
len(location_stats[location_stats<=5])

409

In [None]:
location_stats_less_than_5 = location_stats[location_stats<=5]
location_stats_less_than_5

Maragondanahalli    5
Siddapura           5
Judicial Layout     5
VGP Layout          5
Ulsoor              5
                   ..
HAL 2nd Stage       1
Adarsh Nagar        1
S R Layout          1
Sadduguntepalya     1
Abshot Layout       1
Name: location, Length: 409, dtype: int64

In [None]:
# Replace every location that appears in the index of `location_stats_less_than_5`
# with the single label 'other', then report how many distinct labels remain.
df.location = df.location.mask(
    df.location.isin(location_stats_less_than_5.index),
    'other',
)
df.location.nunique(dropna=False)

218

In [None]:
df.society = df.society.apply(lambda x: x.strip())
society_stats = df['society'].value_counts(ascending=False)
society_stats

GrrvaGr    80
PrarePa    71
Sryalan    59
Prtates    58
GMown E    56
           ..
SLnorMa     1
Heatee      1
Nihtsur     1
TGjraVa     1
RSntsAp     1
Name: society, Length: 2592, dtype: int64

In [None]:
len(society_stats[society_stats<=5])

2324

In [None]:
society_stats_less_than_5 = society_stats[society_stats<=5]
society_stats_less_than_5

NaonySy    5
Goandn     5
PSterAs    5
Brony H    5
Maana E    5
          ..
SLnorMa    1
Heatee     1
Nihtsur    1
TGjraVa    1
RSntsAp    1
Name: society, Length: 2324, dtype: int64

In [None]:
# Replace every society that appears in the index of `society_stats_less_than_5`
# with the single label 'others', then report how many distinct labels remain.
df.society = df.society.mask(
    df.society.isin(society_stats_less_than_5.index),
    'others',
)
df.society.nunique(dropna=False)

269

In [None]:
df[df.total_sqft/df.BHK<250].head()

Unnamed: 0,location,society,total_sqft,bath,balcony,price,BHK
349,other,others,11.0,3.0,2.0,74.0,3
483,other,others,1200.0,5.0,2.0,190.0,5
3097,other,others,1200.0,5.0,0.0,195.0,5
4828,Yelachenahalli,others,900.0,2.0,1.0,115.0,4
5582,Bommanahalli,others,1200.0,7.0,2.0,125.0,5


In [None]:
df = df[~(df.total_sqft/df.BHK<250)]
df.shape

(7480, 7)

In [None]:
display(df.head())

Unnamed: 0,location,society,total_sqft,bath,balcony,price,BHK
0,Electronic City Phase II,others,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,Theanmp,2600.0,5.0,3.0,120.0,4
3,Lingadheeranahalli,Soiewre,1521.0,3.0,1.0,95.0,3
5,Whitefield,others,1170.0,2.0,1.0,38.0,2
11,Whitefield,Prrry M,2785.0,5.0,3.0,295.0,4


## Checking for Outliers

In [None]:
def detect_extreme_outliers(df, column, factor=3):
    """Return the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` where Q1/Q3
    are the 25th/75th percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    column : str
        Name of a numeric column in ``df``.
    factor : float, default 3
        Fence multiplier. The default of 3 keeps the original behaviour of this
        function; pass a smaller value such as 1.5 for tighter fences.

    Returns
    -------
    pandas.DataFrame
        The subset of rows strictly below the lower fence or strictly above
        the upper fence (original index preserved).
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    return df[(values < lower_bound) | (values > upper_bound)]

# Columns to check for extreme outliers
columns_to_check = ["total_sqft", "bath", "balcony", "price", "BHK"]

# Detect extreme outliers for each column and print them
for column in columns_to_check:
    extreme_outliers = detect_extreme_outliers(df, column)
    print(f"Extreme outliers in {column}:")
    print(extreme_outliers)
    print("\n")

Extreme outliers in total_sqft:
              location  society  total_sqft  bath  balcony  price  BHK
62          Whitefield   others      5700.0   5.0      3.0  650.0    4
122             Hebbal  SNontle      5611.5   4.0      0.0  477.0    4
248         Meenakunte  Sreat R      4050.0   3.0      2.0  280.0    3
277         Meenakunte  Sreat R      4111.0   4.0      2.0  250.0    4
455              other   others      3309.0   4.0      3.0  325.0    4
...                ...      ...         ...   ...      ...    ...  ...
13208           Hebbal  Brium C      4000.0   6.0      1.0  370.0    4
13268        EPIP Zone  BrontLa      3360.0   5.0      2.0  221.0    4
13290   Sarjapur  Road   others      4050.0   2.0      1.0  450.0    4
13315       Whitefield   others      3453.0   4.0      0.0  231.0    5
13318  Padmanabhanagar   others      4689.0   4.0      1.0  488.0    4

[271 rows x 7 columns]


Extreme outliers in bath:
                location  society  total_sqft  bath  balcony   p

In [None]:
extreme_bhk_outliers = detect_extreme_outliers(df, "BHK")
df = df[~df.index.isin(extreme_bhk_outliers.index)]


In [None]:
# Scale `price` by 100,000 (presumably a lakh -> rupee conversion; TODO confirm
# against the dataset description).
# `assign` builds a new frame instead of writing into `df` in place, which avoids
# the SettingWithCopyWarning that `df['price'] *= 100000` raised here.
# NOTE: this cell is not idempotent - running it again scales the prices again.
df = df.assign(price=df['price'] * 100000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price'] *= 100000


In [None]:
df.head()

Unnamed: 0,location,society,total_sqft,bath,balcony,price,BHK
0,Electronic City Phase II,others,1056.0,2.0,1.0,3907000.0,2
1,Chikka Tirupathi,Theanmp,2600.0,5.0,3.0,12000000.0,4
3,Lingadheeranahalli,Soiewre,1521.0,3.0,1.0,9500000.0,3
5,Whitefield,others,1170.0,2.0,1.0,3800000.0,2
11,Whitefield,Prrry M,2785.0,5.0,3.0,29500000.0,4


In [None]:
#df.to_excel("semi.xlsx", index=False)

## One Hot Encoding on Locations & Societies

In [None]:

# One-hot encode the two categorical columns. Dummy columns are named
# 'l_<location>' and 's_<society>'. `drop_first=True` omits the first category of
# each variable, so that category is represented by all of its dummies being 0.
df = pd.get_dummies(df, columns=['location', 'society'], drop_first=True, prefix=['l', 's'])

# Bare category names behind the dummy columns. Split on the FIRST underscore only,
# so a category name that itself contains '_' is not truncated.
location_dummies = [col for col in df.columns if col.startswith('l_')]
society_dummies = [col for col in df.columns if col.startswith('s_')]
location_columns = [col.split('_', 1)[1] for col in location_dummies]
society_columns = [col.split('_', 1)[1] for col in society_dummies]

# Dummy column name -> bare category name (not applied to `df` in this cell).
column_mapping = {**dict(zip(location_dummies, location_columns)),
                  **dict(zip(society_dummies, society_columns))}

# Convert the index labels to strings. The column rename that used to accompany
# this (stripping leading underscores) was a no-op, because no column name starts
# with '_', so only the index conversion is kept.
df = df.rename(index=str)


In [None]:
df.head()

Unnamed: 0,total_sqft,bath,balcony,price,BHK,l_5th Phase JP Nagar,l_6th Phase JP Nagar,l_7th Phase JP Nagar,l_8th Phase JP Nagar,l_9th Phase JP Nagar,...,s_Usquaya,s_VBn 2lm,s_VBownre,s_Vaharvi,s_Vaniai,s_ViensRK,s_ViistLa,s_Vrenty,s_Wiowsri,s_others
0,1056.0,2.0,1.0,3907000.0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2600.0,5.0,3.0,12000000.0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1521.0,3.0,1.0,9500000.0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1170.0,2.0,1.0,3800000.0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11,2785.0,5.0,3.0,29500000.0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df.shape

(7477, 489)

In [None]:
df.isnull().sum()

total_sqft    15
bath           0
balcony        0
price          0
BHK            0
              ..
s_ViensRK      0
s_ViistLa      0
s_Vrenty       0
s_Wiowsri      0
s_others       0
Length: 489, dtype: int64

In [None]:
df=df.dropna()

In [None]:
df.isnull().sum()

total_sqft    0
bath          0
balcony       0
price         0
BHK           0
             ..
s_ViensRK     0
s_ViistLa     0
s_Vrenty      0
s_Wiowsri     0
s_others      0
Length: 489, dtype: int64

In [None]:
df.shape


(7462, 489)

# **Train-Test Split**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Split the dataset into features (X) and target variable (y)
X = df.drop(columns=['price'])
y = df['price']

# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# **Model Building**

In [None]:
linear_reg = LinearRegression()
decision_tree = DecisionTreeRegressor(random_state=42)
xgboost = XGBRegressor(random_state=42)

# Train the models
linear_reg.fit(X_train, y_train)
decision_tree.fit(X_train, y_train)
xgboost.fit(X_train, y_train)

# **Model Evaluation**

In [None]:
y_pred_lr = linear_reg.predict(X_test)
y_pred_dt = decision_tree.predict(X_test)
y_pred_xgb = xgboost.predict(X_test)

# Evaluate performance
mse_lr = mean_squared_error(y_test, y_pred_lr)
mse_dt = mean_squared_error(y_test, y_pred_dt)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)

print("Mean Squared Error (Linear Regression):", mse_lr)
print("Mean Squared Error (Decision Tree):", mse_dt)
print("Mean Squared Error (XGBoost):", mse_xgb)

Mean Squared Error (Linear Regression): 17530209639264.615
Mean Squared Error (Decision Tree): 25483189923120.16
Mean Squared Error (XGBoost): 23543621114531.152


In [None]:
from sklearn.metrics import r2_score

# Calculate R^2 score for each model
r2_lr = r2_score(y_test, y_pred_lr)
r2_dt = r2_score(y_test, y_pred_dt)
r2_xgb = r2_score(y_test, y_pred_xgb)

# Print R^2 scores
print("R^2 score (Linear Regression):", r2_lr)
print("R^2 score (Decision Tree):", r2_dt)
print("R^2 score (XGBoost):", r2_xgb)


R^2 score (Linear Regression): 0.7574265713569321
R^2 score (Decision Tree): 0.6473775910489883
R^2 score (XGBoost): 0.6742162806979002


# **Testing Model**

In [None]:
def predict_price(location, society, total_sqft, bath, balcony, BHK, model, feature_columns=None):
    """Predict the price of a single property with a fitted model.

    Parameters
    ----------
    location : str
        Name of the location dummy column, including its prefix
        (e.g. ``'l_Hebbal'``).
    society : str
        Name of the society dummy column, including its prefix
        (e.g. ``'s_Mahosya'``).
    total_sqft, bath, balcony, BHK : number
        Values for the numeric features of the same names.
    model : object
        Fitted estimator exposing ``predict``.
    feature_columns : sequence of str, optional
        Feature names in the order the model was trained on. Defaults to the
        columns of the notebook-level training frame ``X``.

    Returns
    -------
    The first (only) element of ``model.predict`` for the assembled row.

    Raises
    ------
    KeyError
        If one of the numeric feature names is missing from the feature columns.

    Notes
    -----
    A ``location``/``society`` that has no dummy column is left as all zeros
    instead of raising ``IndexError``. With ``drop_first=True`` encoding, that
    is exactly how the dropped baseline category is represented; be aware that
    a misspelt name is treated the same way.
    """
    columns = list(X.columns if feature_columns is None else feature_columns)
    features = dict.fromkeys(columns, 0.0)

    # Fill numeric features by name rather than by hard-coded position, so the
    # result does not depend on the column order of the training frame.
    numeric_inputs = {"total_sqft": total_sqft, "bath": bath, "balcony": balcony, "BHK": BHK}
    missing = [name for name in numeric_inputs if name not in features]
    if missing:
        raise KeyError(f"feature columns are missing: {missing}")
    features.update(numeric_inputs)

    for dummy in (location, society):
        if dummy in features:
            features[dummy] = 1.0

    # A one-row DataFrame keeps the feature names the model was fitted with.
    row = pd.DataFrame([features], columns=columns)
    return model.predict(row)[0]




In [None]:
# Example usage:
input_values = {
    'location': 'l_Kenchenahalli',  # Example location
    'society': 's_AriosPa',               # Example society
    'total_sqft': 1000,
    'bath': 2,
    'balcony': 2,                        # Example balcony
    'BHK': 2
}

# Assuming 'linear_reg' is the trained Linear Regression model
predicted_price = predict_price(**input_values, model=linear_reg)
print("Predicted Price:", predicted_price)

Predicted Price: 6229322.470282886




In [None]:
input_values = {
    'location': 'l_Hebbal',  # Example location
    'society': 's_Mahosya',               # Example society
    'total_sqft': 1800,
    'bath': 2,
    'balcony': 2,                        # Example balcony
    'BHK': 3
}

# Assuming 'linear_reg' is the trained Linear Regression model
predicted_price = predict_price(**input_values, model=linear_reg)
print("Predicted Price:", predicted_price)

Predicted Price: 12573399.747189688




# **Frontend using Streamlit**

In [None]:
import pickle

with open('linear_regression_model.pkl', 'wb') as f:
    pickle.dump(linear_reg, f)


In [None]:
import pandas as pd

# Assuming df is your DataFrame
# Convert DataFrame to JSON string
json_data = df.to_json()

# Save JSON string to a file
with open('data.json', 'w') as f:
    f.write(json_data)


In [None]:
pip install streamlit



In [None]:
%%writefile app.py

Overwriting app.py


In [None]:
! wget -q -O - ipv4.icanhazip.com

35.230.164.41


In [None]:
! streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.230.164.41:8501[0m
[0m
[K[?25hnpx: installed 22 in 2.151s
your url is: https://nasty-cycles-dream.loca.lt
/root/.npm/_npx/42856/lib/node_modules/localtunnel/bin/lt.js:81
    throw err;
    ^

Error: connection refused: localtunnel.me:44385 (check your firewall settings)
    at Socket.<anonymous> (/root/.npm/_npx/42856/lib/node_modules/[4mlocaltunnel[24m/lib/TunnelCluster.js:52:11)
[90m    at Socket.emit (events.js:315:20)[39m
[90m    at emitErrorNT (internal/streams/destroy.js:106:8)[39m
[90m    at emitErrorCloseNT (internal/streams/destroy.js:74:3)[39m
[90m    at processTicksAndRejections (internal/process/task_queues.js:80:21)[39m
