4 Steps in Web Scraping:

-->Sending an HTTP GET request to the URL of the webpage using the requests library

-->Fetching and parsing the data using BeautifulSoup and storing the data in dicts/lists

-->Analyzing the HTML tags and their attributes

-->Writing the data out in any file format (.csv, .xlsx, .json)

In [1]:
#!pip install requests

In [2]:
#!pip install bs4

In [3]:
import requests
import bs4
#help(bs4)

In [4]:
from bs4 import BeautifulSoup

# Download the first listings page. requests applies no timeout unless one is
# given, so a stalled connection would otherwise block this cell indefinitely.
url = "https://www.makaan.com/vijayawada-residential-property/buy-property-in-vijayawada-city"
data = requests.get(url, timeout=30)
data  # last expression: displays the Response object and its status code

<Response [200]>

In [5]:
# Parse the downloaded bytes into a searchable tree.
# A specific parser is named ('html.parser' ships with Python) so that the
# parse result does not depend on which optional parsers are installed; the
# generic 'html' feature lets Beautiful Soup pick whichever one it finds.
soup = BeautifulSoup(data.content, 'html.parser')
#soup

In [6]:
#print(soup.prettify())

In [7]:
#extracting Property name
soup.find('div',attrs={'class':'title-line'}).text

'Residential Plot in Himaja Elite Vistas'

In [8]:
soup.find('a',attrs={'class':'typelink'}).span.text.strip()

'Residential Plot'

In [9]:
soup.find('span',attrs={'itemprop':'addressLocality'}).text

'Kesarapalli'

In [10]:
#find the area and total price
soup.find('div',attrs={'data-type':'price-link'}).text.strip()

'2.52 Cr'

In [11]:
soup.find('td',attrs={'class':'size'}).text.strip()

'7200'

In [12]:
soup.find('td',attrs={'class':'val'}).text

'New'

In [13]:
soup.find('ul',attrs={'class':'listing-details'}).text

'1 - 2 years old'

In [14]:
#now we will extract details from the entire first page
#find_all()
#soup.find_all('div',attrs={'class':'title-line'}).text

In [15]:
# find_all() returns every matching tag, so the text is collected tag by tag
a = soup.find_all('div',attrs={'class':'title-line'})
# property names, one entry per title tag found on the page
names = [title_tag.text for title_tag in a]
print(names)
print(len(names))

['Residential Plot in Himaja Elite Vistas', 'Residential Plot in Himaja Elite Vistas', '2 BHK Apartment in Himaja Sri Hemadurga Jewel County', '3 BHK Apartment', 'Residential Plot', '2 BHK Apartment', '2 BHK Independent House', '2 BHK Apartment in Hycon Elite', '3 BHK Apartment', '3 BHK Independent Floor', '3 BHK Apartment', '2 BHK Apartment', '2 BHK Independent House', '3 BHK Independent Floor', '6 BHK Independent House', 'Residential Plot in Harivillu Fortune Legendary', 'Residential Plot', 'Residential Plot', '3 BHK Apartment in Pooja Tree Storey', '3 BHK Apartment in Sri Naga Sun Rise Enclave']
20


In [16]:
b = soup.find_all('span',attrs={'itemprop':'addressLocality'})
#optimized way -->List Comprehension
#syntax is --> [exprsn for var in collection/function]
places = [i.text for i in b]
print(places)
print(len(places))

['Kesarapalli', 'Kesarapalli', 'Gannavaram', 'Poranki', 'Kankipadu', 'Poranki', 'Kankipadu', 'Poranki', 'Gannavaram', 'Gunadala', 'Gollapudi', 'Enikepadu', 'Benz Circle', 'Vidhyadharpuram', 'Penamaluru', 'Kankipadu', 'Poranki', 'Kankipadu', 'Benz Circle', 'Gannavaram']
20


In [17]:
#extracting price
c = soup.find_all('div',attrs={'data-type':'price-link'})
d = [i.text.strip() for i in c]
print(d)
print(len(d))

['2.52 Cr', '68.04 L', '45.57 L', '55.5 L', '15.5 L', '48 L', '50 L', '40 L', '59.86 L', '56 L', '89 L', '39 L', '59 L', '95 L', '1.1 Cr', '16.83 L', '33 L', '20 L', '1.66 Cr', '72 L']
20


In [18]:
# Prices are shown either in Lakhs ("L") or Crores ("Cr").
# Strip the unit and store every price as a float expressed in Lakhs.
Price = []
for price_tag in c:
    price_text = price_tag.text.strip()
    in_crores = "Cr" in price_text
    if in_crores:
        amount = float(price_text.replace(" Cr",'')) * 100 # 1 Crore = 100 Lakhs
    else:
        amount = float(price_text.replace(" L",""))
    Price.append(amount)
print(Price)
print(len(Price))

[252.0, 68.04, 45.57, 55.5, 15.5, 48.0, 50.0, 40.0, 59.86, 56.0, 89.0, 39.0, 59.0, 95.0, 110.00000000000001, 16.83, 33.0, 20.0, 166.0, 72.0]
20


In [19]:
d = soup.find_all('td',attrs={'class':'size'})
area = [i.text.strip() for i in d]
print(area)
print(len(area))

['7200', '1944', '1266', '1587', '1560', '1450', '1290', '1200', '1663', '1500', '2062', '1250', '1350', '1600', '3700', '1782', '1800', '1500', '2317', '1650']
20


In [20]:
#Getting the house type along with information about bathrooms/facing
property_types = ['Apartment','Builder Floor','Villa',
                  'Residential Plot','Independent House',
                  'Independent Floor','Studio Apartment']
# Check longer names first: 'Studio Apartment' also contains 'Apartment', and
# testing every name without stopping would add two entries for one listing.
types_by_specificity = sorted(property_types, key=len, reverse=True)
f = soup.find_all('div',attrs={'class':'title-line'})
Type = []
for title_tag in f:
    title = title_tag.text
    # Exactly one entry per title keeps Type the same length as the other
    # per-listing lists; None marks a title that names no known type.
    Type.append(next((p_type for p_type in types_by_specificity
                      if p_type in title), None))
print(Type)
print(len(Type))

['Residential Plot', 'Residential Plot', 'Apartment', 'Apartment', 'Residential Plot', 'Apartment', 'Independent House', 'Apartment', 'Apartment', 'Independent Floor', 'Apartment', 'Apartment', 'Independent House', 'Independent Floor', 'Independent House', 'Residential Plot', 'Residential Plot', 'Residential Plot', 'Apartment', 'Apartment']
20


In [21]:
# Bedroom count per listing: the text of the title's <span>, with
# "Residential Plot" mapped to '0' and all spaces removed.
f = soup.find_all('div',attrs={'class':'title-line'})
Rooms = [
    title_tag.span.text.replace("Residential Plot",'0').replace(' ','')
    for title_tag in f
]
print(Rooms)
print(len(Rooms))

['0', '0', '2', '3', '0', '2', '2', '2', '3', '3', '3', '2', '2', '3', '6', '0', '0', '0', '3', '3']
20


In [22]:
#checking status type
s = soup.find_all('td',attrs={'class':'val'})
status = [i.text for i in s]
print(status)
print(len(status))

['New', 'New', 'Ready to move', 'Ready to move', 'Resale', 'Ready to move', 'Under Construction', 'Ready to move', 'Ready to move', 'Ready to move', 'Ready to move', 'Ready to move', 'Ready to move', 'Under Construction', 'Ready to move', 'Resale', 'Resale', 'Resale', 'Under Construction', 'Ready to move']
20


In [23]:
#details of the property
d = soup.find_all('ul',attrs={'class':'listing-details'})
details = [i.text for i in d]
print(details)
print(len(details))

['1 - 2 years old', '1 - 2 years old', '3 - 4 years old2 BathroomsNew ', '3 BathroomsResale West facing', '7 - 8 years old2 open sidesEast facing', '2 BathroomsResale ', '2 BathroomsResale NorthEast facing', '2 BathroomsResale ', '3 BathroomsResale East facing', '3 BathroomsResale ', '3 BathroomsResale ', '2 BathroomsResale ', '2 BathroomsResale East facing', '3 BathroomsResale West facing', '7 - 8 years old7 BathroomsResale ', '1 - 2 years old', '1 open sidesWest facing', '7 - 8 years old', '3 BathroomsNew East facing', '2 - 3 years old3 BathroomsNew ']
20


In [24]:
# Split the free-text listing details into a bathroom count and a facing
# direction using regular expressions.
import re #pattern matching
d = soup.find_all('ul',attrs={'class':'listing-details'})
Bathrooms = []
Facing = []
for details_tag in d:
    details_text = details_tag.text
    # (\d+) captures the digits written just before the word "Bathrooms"
    bath_match = re.search(r'(\d+) Bathrooms', details_text)
    Bathrooms.append(int(bath_match.group(1)) if bath_match else 0)
    facing_match = re.search(
        r'(North|South|East|West|NorthEast|NorthWest|SouthEast|SouthWest) facing',
        details_text)
    # the string 'None' (not the None object) marks a listing without a facing
    Facing.append(facing_match.group(1) if facing_match else 'None')
print(Facing)
print(Bathrooms)

['None', 'None', 'None', 'West', 'East', 'None', 'NorthEast', 'None', 'East', 'None', 'None', 'None', 'East', 'West', 'None', 'None', 'West', 'None', 'East', 'None']
[0, 0, 2, 3, 0, 2, 2, 2, 3, 3, 3, 2, 2, 3, 7, 0, 0, 0, 3, 3]


In [25]:
#now we will use the same above logic to extract for the entire page
#ipython -->Interactive Python
#!pip install ipython

In [26]:
# Preview the paginated URLs: the page number is appended to the query string
base_url = "https://www.makaan.com/vijayawada-residential-property/buy-property-in-vijayawada-city?page="
for page_number in range(1,41):
    url = f'{base_url}{page_number}'
    print(url)

https://www.makaan.com/vijayawada-residential-property/buy-property-in-vijayawada-city?page=1
https://www.makaan.com/vijayawada-residential-property/buy-property-in-vijayawada-city?page=2
https://www.makaan.com/vijayawada-residential-property/buy-property-in-vijayawada-city?page=3
https://www.makaan.com/vijayawada-residential-property/buy-property-in-vijayawada-city?page=4
https://www.makaan.com/vijayawada-residential-property/buy-property-in-vijayawada-city?page=5
https://www.makaan.com/vijayawada-residential-property/buy-property-in-vijayawada-city?page=6
https://www.makaan.com/vijayawada-residential-property/buy-property-in-vijayawada-city?page=7
https://www.makaan.com/vijayawada-residential-property/buy-property-in-vijayawada-city?page=8
https://www.makaan.com/vijayawada-residential-property/buy-property-in-vijayawada-city?page=9
https://www.makaan.com/vijayawada-residential-property/buy-property-in-vijayawada-city?page=10
https://www.makaan.com/vijayawada-residential-property/buy-

In [27]:
#pip install ipython
from IPython.display import clear_output
import time,re,requests
from bs4 import BeautifulSoup

## Scraping multiple pages of the website

In [31]:
# Column accumulators: every listing should add exactly one value to each
# list, so that index k refers to the same property in all of them.
Bedrooms = [];Bathrooms = []
Location = [];Price = []
Size = [];Status = []
Facing = [];Type = []

property_types = ['Apartment', 'Independent House',
                  'Independent Floor',
                  'Villa', 'Studio Apartment',
                  'Residential Plot', 'Builder Floor']
# Check longer names first: 'Studio Apartment' also contains 'Apartment'.
types_by_specificity = sorted(property_types, key=len, reverse=True)

for page in range(1, 54):
  url = f'https://www.makaan.com/vijayawada-residential-property/buy-property-in-vijayawada-city?page={page}'
  print(f'Scraping Info from Page --> {page}')
  clear_output(wait=True)
  time.sleep(3) #waiting time to move from one page to other

  # requests applies no timeout unless one is given; without it a stalled
  # connection would block the whole scraping run.
  response = requests.get(url, timeout=30)
  # A named parser gives the same parse tree regardless of installed extras.
  soup = BeautifulSoup(response.content, 'html.parser')

  #Extracting number of bedrooms
  for link in soup.find_all('a', attrs={'class':'typelink'}):
    rooms = link.span.text #you need only number which is present in start
    rooms = rooms.replace('Residential Plot', '0').replace(' ', '')
    Bedrooms.append(rooms)

  #Extracting locality
  for locality in soup.find_all('span', attrs={'itemprop':'addressLocality'}):
    Location.append(locality.text)

  #Extracting price, stored in Lakhs
  for price_tag in soup.find_all('div', attrs={'data-type':'price-link'}):
    price_text = price_tag.text.strip()
    if 'Cr' in price_text:
      price = float(price_text.replace(' Cr', '')) * 100 #converting into Lakhs
    else:
      price = float(price_text.replace(' L', ''))
    Price.append(price)

  #Extracting Area
  for size_tag in soup.find_all('td', attrs={'class':'size'}):
    Size.append(size_tag.text.strip())

  #Extracting construction status
  for status_tag in soup.find_all('td', attrs={'class':'val'}):
    Status.append(status_tag.text)

  #Extracting Bathrooms and Facing Type
  for details_tag in soup.find_all('ul', attrs={'class':'listing-details'}):
    details_text = details_tag.text
    # \d+ (not \d) so counts of 10 or more are read in full
    bathroom_count = re.findall(r'(\d+) Bathrooms', details_text)
    if bathroom_count:
      Bathrooms.append(int(bathroom_count[0]))
    else:
      Bathrooms.append(0) # int, so the column holds a single type
    facing_direction = re.findall(r'(North|South|East|West|NorthEast|NorthWest|SouthEast|SouthWest) facing', details_text)
    if facing_direction:
      Facing.append(facing_direction[0])
    else:
      Facing.append('None')

  #Extracting property type
  for title_tag in soup.find_all('div', attrs={'class':'title-line'}):
    title = title_tag.text
    # Exactly one entry per title: the most specific matching type, or None
    # when the title names none of them. Appending once per matching type
    # (or not at all) would shift every later row of this column.
    Type.append(next((p_type for p_type in types_by_specificity
                      if p_type in title), None))

ConnectionError: HTTPSConnectionPool(host='www.makaan.com', port=443): Max retries exceeded with url: /vijayawada-residential-property/buy-property-in-vijayawada-city?page=1 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000018A2DDEE9E0>: Failed to resolve 'www.makaan.com' ([Errno 11001] getaddrinfo failed)"))

In [None]:
# Print the number of scraped values per column, in this fixed order:
# Bathrooms, Bedrooms, Status, Size, Location, Price, Facing, Type.
for column_values in (Bathrooms, Bedrooms, Status, Size,
                      Location, Price, Facing, Type):
    print(len(column_values))

In [None]:
#let's create a dataframe from above data
data_dict = {'Bedrooms':Bedrooms,
            'Bathrooms':Bathrooms,
            'Status':Status,
            'Size':Size,
            'Location':Location,
            'Price':Price,
            'Facing':Facing,
            'Type':Type}
#data_dict

In [None]:
# The column lists can have different lengths, and pandas needs equal-length
# columns, so every list is padded with None up to the longest one.
import pandas as pd

# Derived from the data instead of a hard-coded number: a fixed value smaller
# than the longest list would leave that list unpadded and make the DataFrame
# constructor fail with mismatched lengths.
max_length = max(len(values) for values in data_dict.values())

# NOTE(review): padding only equalises lengths. If a column is short because a
# value was missing in the middle of the scrape, its later values still sit in
# the wrong rows - confirm the per-column counts before trusting row alignment.
for key in data_dict.keys():
    data_dict[key] += [None] * (max_length - len(data_dict[key]))

data = pd.DataFrame(data_dict)
data
#data.isnull().sum()

In [None]:
data.isnull().sum()

In [None]:
#we have created missing values in all other columns so we drop those values
data.dropna(inplace=True)

In [None]:
data.info()

In [None]:
#once we get the final data we perform descriptive statistical information and then check for
#outliers
type(data)
#data.describe()
#data.describe(include="all")
data.describe(include='all').T

In [None]:
#before getting into removal of outliers let's have a
#complete overview of all columns using plotly
import plotly.express as px
data.columns

In [None]:
#px.box(data) #As every column is having different type of data
#we need to convert those categorical values to numerical values

In [None]:
data['Status'].unique()

In [None]:
#data['Location'].unique()
data['Location'].nunique()

In [None]:
data['Facing'].unique()

In [None]:
data['Type'].unique()

In [None]:
#we use sklearn for feature encoding
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()

In [None]:
names = ['Saketh','Codegnan','Obaid','Kesava']
marks = [85,100,65,75]
d = {'Names':names,'Marks':marks}
d

In [None]:
label.fit(d['Names']) # it fits the values and assign a numerical data

In [None]:
label.transform(d['Names']) # it shows the numerical value of categorical data

In [None]:
d['Names'] = label.fit_transform(d['Names'])  # the combination of fit and transform function is fit_transform
d

In [None]:
data.columns

In [None]:
data.info()

In [None]:
# Cast the count/size columns from object dtype to 32-bit integers
for int_column in ('Bathrooms', 'Bedrooms', 'Size'):
    data[int_column] = data[int_column].astype('int32')

In [None]:
data.info()

In [None]:
# Label-encode every object-dtype column and print its category -> code map.
# One fitted encoder is kept per column in `label_encoders`; refitting a single
# shared encoder would leave only the last column's mapping available.
label_encoders = {}
for col in data.columns:
  if data[col].dtype == 'object':
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder
    encoded_to_original = dict(zip(encoder.transform(encoder.classes_),
                                  encoder.classes_))
    print(col,' = {')
    for encoded_label, original_value in encoded_to_original.items():
      print(f'"{original_value}" : {encoded_label}',end=',')
    print('}')
    print('----------------------------------------------------------------')

In [None]:
data.info()

In [None]:
# import plotly Express to check the outiers in data using boxplot.
import plotly.express as px

In [None]:
px.box(data)

In [None]:
#data.info()
px.box(data,x='Facing')

In [None]:
#Removing Outliers from the dataframe -->we will create a final dataframe
def remove_outliers(df, threshold=3):
    """Return a copy of ``df`` without rows that are IQR outliers.

    Each numeric column is processed in turn. A row is kept only if its value
    lies within ``[Q1 - threshold * IQR, Q3 + threshold * IQR]``, where Q1/Q3
    are the 25th/75th percentiles of the rows still remaining at that point.
    Rows whose value is NaN fail both comparisons and are dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    threshold : float, default 3
        How many IQRs beyond the quartiles a value may lie before it is
        treated as an outlier.

    Returns
    -------
    pandas.DataFrame
        The filtered copy, keeping the original index labels.
    """
    df_cleaned = df.copy() #copy of dataframe
    # 'number' selects every numeric dtype. Listing only int64/float64 would
    # silently skip columns stored as int32 (or any other numeric width).
    for col in df_cleaned.select_dtypes(include='number').columns:
        Q1 = df_cleaned[col].quantile(0.25) #25th percentile
        Q3 = df_cleaned[col].quantile(0.75) #75th percentile
        IQR = Q3 - Q1 #InterQuartile Range
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        # Keep only the rows whose value in this column is inside the bounds;
        # later columns compute their quartiles on the already-filtered rows.
        df_cleaned = df_cleaned[
            (df_cleaned[col] >= lower_bound) &
            (df_cleaned[col] <= upper_bound)
        ]
    return df_cleaned

In [None]:
def outliers_percentage_check(df, threshold=3):
    """Filter IQR outliers and report how many rows were flagged.

    Numeric columns are processed one after another. In each column, values
    below ``Q1 - threshold * IQR`` or above ``Q3 + threshold * IQR`` are
    counted as outliers and their rows removed before the next column is
    examined, so a row is counted at most once.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    threshold : float, default 3
        How many IQRs beyond the quartiles a value may lie before it is
        treated as an outlier.

    Returns
    -------
    tuple
        ``(df_cleaned, percentage_outliers)``: the filtered copy and the
        number of removed rows as a percentage of ``len(df)`` (0.0 for an
        empty frame).
    """
    df_cleaned = df.copy()

    # Nothing to examine, and the percentage below would divide by zero.
    if df.shape[0] == 0:
        return df_cleaned, 0.0

    total_outliers = 0  # running count of rows flagged across all columns

    # 'number' selects every numeric dtype. Listing only int64/float64 would
    # silently skip columns stored as int32 (or any other numeric width).
    for col in df_cleaned.select_dtypes(include='number').columns:
        Q1 = df_cleaned[col].quantile(0.25)
        Q3 = df_cleaned[col].quantile(0.75)

        IQR = Q3 - Q1

        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        # Boolean mask of the rows lying outside the bounds for this column
        outliers = (df_cleaned[col] < lower_bound) | (df_cleaned[col] > upper_bound)

        total_outliers += outliers.sum()

        # Drop the flagged rows so they are not counted again for later columns
        df_cleaned = df_cleaned[~outliers]

    percentage_outliers = (total_outliers / df.shape[0]) * 100

    return df_cleaned, percentage_outliers


In [None]:
# checking the outliers Percentage
outliers_percent = outliers_percentage_check(data)
outliers_percent[1]

In [None]:
df = remove_outliers(data) #  removing the outliers from the data
df

In [None]:
#We will create a csv file from the above data
#df.info()
df.to_csv("House.csv",index=False)
df.info()

In [None]:
# Reading the converted dataset using pandas
df = pd.read_csv('House.csv')
df.head()

In [None]:
df.info()

In [None]:
data.columns

In [None]:
#We will get into Modelling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
#divide our data into dependent and independent variables
# `df` is the outlier-filtered frame (remove_outliers output, saved to and
# re-read from House.csv). Building x/y from `data` would train the model on
# the unfiltered rows and make the outlier-removal step have no effect.
x = df.drop(columns=['Price'])
y = df['Price']

In [None]:
#let's understand about splitting the data by understanding
#below example
import numpy as np
a,b = np.arange(10).reshape(5,2),range(5)

In [None]:
#print(a)
#print(b)

In [None]:
w,p,q,z=train_test_split(a,b) #as there is no consistency in data
print(w)
print(p)
print(q)
print(z)

In [None]:
w,p,q,z=train_test_split(a,b,random_state=1) # now there is consistency in data
print(w)
print(p)
print(q)
print(z)

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,
                                                random_state=42)

In [None]:
print(len(x_train))
print(len(x_test))

In [None]:
#x_train

In [None]:
#model creation
model = LinearRegression()

In [None]:
#fit the data to the model(training data)
model.fit(x_train,y_train)

In [None]:
#we will check the performance metrics --> R-squared value, mean squared error
from sklearn.metrics import mean_squared_error,r2_score

In [None]:
#checking for predictions and performance metrics
y_pred = model.predict(x_test)

In [None]:
print("Linear Regression Metrics:")
print("R-Squared value is",r2_score(y_test,y_pred))
print("Mean Squared Error is",mean_squared_error(y_test,y_pred))

In [None]:
data.info()

In [None]:
#let's pass some inputs and validate it then will create pickle file
#from our finetuned model
# Every answer is converted with int(), so the categorical fields (Location,
# Status, Facing, Type) have to be entered as their label-encoded integer
# codes - the mapping printed by the label-encoding cell - not as text.
# NOTE(review): `status` and `Type` were list names in earlier cells; once this
# cell has run they hold single ints, so re-running those earlier cells out of
# order will not see the scraped lists any more.
bed = int(input('Enter no of bedrooms:'))
bath = int(input('Enter no of bathrooms:'))
loc = int(input('Enter Location:'))
size = int(input('Enter size:'))
status = int(input('Enter status:'))
facing = int(input('Enter facing:'))
Type = int(input('Enter Type:'))

In [None]:
#now we will test for the above data
import numpy as np
# Build the sample as a one-row DataFrame keyed by column name and then order
# it like the training features. A positional array is easy to get wrong: the
# training columns are Bedrooms, Bathrooms, Status, Size, Location, Facing,
# Type, so passing [bed, bath, loc, size, ...] feeds Location into the Status
# slot and Status into the Location slot.
input_data = pd.DataFrame([{
    'Bedrooms': bed,
    'Bathrooms': bath,
    'Status': status,
    'Size': size,
    'Location': loc,
    'Facing': facing,
    'Type': Type,
}])[list(x_train.columns)]
result = model.predict(input_data)[0]
print(f'House Price Predicted is {result} Lakhs')

In [None]:
#we will create a pickle file for our finally trained model
import pickle

In [None]:
#open() function for creating our new files and also reading
with open("House.pkl",'wb') as f:
    pickle.dump(model,f)

In [None]:
#we will use our pickle file for predictions
# The with-block closes the file handle once the model has been loaded.
# Only unpickle files you created yourself: loading a pickle can run
# arbitrary code.
with open('House.pkl','rb') as model_file:
    predicted_model = pickle.load(model_file)

In [None]:
predicted_model.predict(input_data)[0].round(2)

In [30]:
#No Free Lunch Theorem -->No one model perfectly fits the data
#Finally we will present our model-->we can use flask as webservice -->AWS,Pythonanywhere
#House Price Prediction-->  Streamlit directly
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.29.0-py2.py3-none-any.whl.metadata (8.2 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Downloading altair-5.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting blinker<2,>=1.0.0 (from streamlit)
  Downloading blinker-1.7.0-py3-none-any.whl.metadata (1.9 kB)
Collecting cachetools<6,>=4.0 (from streamlit)
  Downloading cachetools-5.3.2-py3-none-any.whl.metadata (5.2 kB)
Collecting click<9,>=7.0 (from streamlit)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting importlib-metadata<7,>=1.4 (from streamlit)
  Downloading importlib_metadata-6.11.0-py3-none-any.whl.metadata (4.9 kB)
Collecting protobuf<5,>=3.20 (from streamlit)
  Downloading protobuf-4.25.1-cp310-abi3-win_amd64.whl.metadata (541 bytes)
Collecting pyarrow>=6.0 (from streamlit)
  Downloading pyarrow-14.0.2-cp310-cp310-win_amd64.whl.metadata (3.1 kB)
Collecting rich<14,>=10.14.0 (from streamlit)
  Downloading rich-13.7.0-py3-none-any.whl.metadata (18 kB)
Collect



