In [3]:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
import pandas as pd
#This tool is used for automating web-based application testing to verify that it performs expectedly.
#Beautiful Soup is a Python package for parsing HTML and XML documents (including having malformed markup, i.e. non-closed tags, so named after tag soup). It creates a parse tree for parsed pages that can be used to extract data from HTML, which is useful for web scraping.
#The requests module allows you to send HTTP requests using Python

In [4]:
driver = webdriver.Chrome()
url = 'https://www.pararius.com/apartments/amsterdam?ac=1'
response = requests.get(url)
driver.maximize_window()
driver.get(url)

time.sleep(5)
content = driver.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content, "html.parser")
lists = soup.find_all('section', class_="listing-search-item")


In [5]:
title = []
location = []
price = []
area = []

In [6]:
for list in lists:

    if list.find('a', class_="listing-search-item__link listing-search-item__link--title"):
        title.append(list.find('a', class_="listing-search-item__link listing-search-item__link--title").text.replace('\n', ''))
    else:
        title.append('')

    if list.find('div', class_="listing-search-item__sub-title"):
        location.append(list.find('div', class_="listing-search-item__sub-title").text.replace('\n', ''))
    else:
        location.append('')

    if list.find('div', class_="listing-search-item__price"):
        price.append(list.find('div', class_="listing-search-item__price").text.replace('\n', ''))
    else:
        price.append('')

    if list.find('ul', class_="illustrated-features illustrated-features--compact"):
        area.append(list.find('ul', class_="illustrated-features illustrated-features--compact").text.replace('\n', ''))
    else:
        area.append('')



In [7]:
data = pd.DataFrame({'title':title, 'location':location, 'price':price, 'area':area})
pd.set_option('display.max_columns', None)
print(data)


                                                title  \
0                           Apartment Deurloostraa...   
1                           Apartment Prins Hendri...   
2                           Apartment Johannes Mee...   
3                           Apartment Krommertstra...   
4                           Apartment Krommertstra...   
5                           Apartment Krommertstra...   
6                           Apartment Krommertstra...   
7                           Apartment Govert Flinc...   
8                           Apartment Maria Austri...   
9                           Apartment Apollolaan  ...   
10                          Apartment De Wittenkad...   
11                          Apartment Lindengracht...   
12                          Apartment Bentinckstra...   
13                          Apartment Govert Flinc...   
14                          Apartment Frans van Mi...   
15                          Apartment Da Costakade...   
16                          Apa

In [8]:

data[['area','ROOM INFO']] = data['area'].str.split('m²',expand=True)


In [9]:
print(data)

                                                title  \
0                           Apartment Deurloostraa...   
1                           Apartment Prins Hendri...   
2                           Apartment Johannes Mee...   
3                           Apartment Krommertstra...   
4                           Apartment Krommertstra...   
5                           Apartment Krommertstra...   
6                           Apartment Krommertstra...   
7                           Apartment Govert Flinc...   
8                           Apartment Maria Austri...   
9                           Apartment Apollolaan  ...   
10                          Apartment De Wittenkad...   
11                          Apartment Lindengracht...   
12                          Apartment Bentinckstra...   
13                          Apartment Govert Flinc...   
14                          Apartment Frans van Mi...   
15                          Apartment Da Costakade...   
16                          Apa

In [10]:
data[['ROOM INFO','ROOMS AVAILABLE']] = data['ROOM INFO'].str.split(' ',1,expand=True)

In [11]:
print(data)

                                                title  \
0                           Apartment Deurloostraa...   
1                           Apartment Prins Hendri...   
2                           Apartment Johannes Mee...   
3                           Apartment Krommertstra...   
4                           Apartment Krommertstra...   
5                           Apartment Krommertstra...   
6                           Apartment Krommertstra...   
7                           Apartment Govert Flinc...   
8                           Apartment Maria Austri...   
9                           Apartment Apollolaan  ...   
10                          Apartment De Wittenkad...   
11                          Apartment Lindengracht...   
12                          Apartment Bentinckstra...   
13                          Apartment Govert Flinc...   
14                          Apartment Frans van Mi...   
15                          Apartment Da Costakade...   
16                          Apa

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            32 non-null     object
 1   location         32 non-null     object
 2   price            32 non-null     object
 3   area             32 non-null     object
 4   ROOM INFO        32 non-null     object
 5   ROOMS AVAILABLE  32 non-null     object
dtypes: object(6)
memory usage: 1.6+ KB


In [13]:
data.head(5)

Unnamed: 0,title,location,price,area,ROOM INFO,ROOMS AVAILABLE
0,Apartment Deurloostraa...,1078 JJ Amsterdam (Scheldebuurt) ...,"€2,495 per month",80,3,roomsFurnished
1,Apartment Prins Hendri...,1011 AN Amsterdam (Nieuwmarkt/Last...,"€3,250 per month",117,3,roomsFurnished
2,Apartment Johannes Mee...,1063 CJ Amsterdam (Slotermeer-Noor...,"€1,250 per month",54,3,roomsShell
3,Apartment Krommertstra...,1056 TS Amsterdam (Geuzenbuurt) ...,"€1,800 per month",33,2,roomsUpholstered
4,Apartment Krommertstra...,1056 TS Amsterdam (Geuzenbuurt) ...,"€1,900 per month",36,2,roomsUpholstered


In [14]:
data.count()

title              32
location           32
price              32
area               32
ROOM INFO          32
ROOMS AVAILABLE    32
dtype: int64

In [15]:
data.isnull().sum()

title              0
location           0
price              0
area               0
ROOM INFO          0
ROOMS AVAILABLE    0
dtype: int64

In [16]:
data.isna().sum().sum()

0

In [17]:
print(data.count())
data1=data.drop_duplicates()
print(data.count())

title              32
location           32
price              32
area               32
ROOM INFO          32
ROOMS AVAILABLE    32
dtype: int64
title              32
location           32
price              32
area               32
ROOM INFO          32
ROOMS AVAILABLE    32
dtype: int64


In [18]:
data.groupby(['location','ROOM INFO']).size()

location                                                      ROOM INFO
            1011 AN Amsterdam (Nieuwmarkt/Lastage)            3            1
            1013 GM Amsterdam (Haarlemmerbuurt)               3            1
            1015 KN Amsterdam (Jordaan)                       2            1
            1025 HZ Amsterdam (Buikslotermeer)                4            1
            1051 GD Amsterdam (Staatsliedenbuurt)             2            1
            1052 AE Amsterdam (Staatsliedenbuurt)             2            1
            1053 DD Amsterdam (Van Lennepbuurt)               2            1
            1053 WT Amsterdam (Da Costabuurt)                 3            1
            1055 AA Amsterdam (De Kolenkit)                   1            1
            1055 CA Amsterdam (Erasmuspark)                   3            1
            1056 SX Amsterdam (Van Galenbuurt)                4            1
            1056 TS Amsterdam (Geuzenbuurt)                   2            4
    

In [19]:
data.groupby(['price','ROOM INFO']).size()

price                                         ROOM INFO
                €1,250 per month              3            1
                €1,450 per month              1            1
                €1,500 per month              2            1
                €1,575 per month              3            1
                €1,625 per month              2            1
                €1,650 per month              2            1
                €1,750 per month              2            3
                €1,790 per month              2            1
                €1,800 per month              2            1
                €1,850 per month              2            1
                                              4            1
                €1,900 per month              2            3
                €1,950 per month              2            1
                €2,000 per month              4            1
                €2,100 per month              3            1
                €2,250 per mo

In [20]:
#To add IP in mongoDB
import socket
hostname = socket.gethostname()
IP = socket.gethostbyname(hostname)
print(IP)
print(hostname)

192.168.1.45
DESKTOP-MFP1R0R


In [21]:
!pip install pymongo



In [22]:
import pymongo
client = pymongo.MongoClient("mongodb+srv://RamDbs:RamDbs123@cluster0.diqp9ef.mongodb.net/?retryWrites=true&w=majority")

In [23]:
df =data.to_dict(orient = "records")

In [24]:
df

[{'title': '                        Apartment Deurloostraat                            ',
  'location': '            1078 JJ Amsterdam (Scheldebuurt)        ',
  'price': '                €2,495 per month            ',
  'area': '80 ',
  'ROOM INFO': '3',
  'ROOMS AVAILABLE': 'roomsFurnished'},
 {'title': '                        Apartment Prins Hendrikkade                            ',
  'location': '            1011 AN Amsterdam (Nieuwmarkt/Lastage)        ',
  'price': '                €3,250 per month            ',
  'area': '117 ',
  'ROOM INFO': '3',
  'ROOMS AVAILABLE': 'roomsFurnished'},
 {'title': '                        Apartment Johannes Meewisstraat                            ',
  'location': '            1063 CJ Amsterdam (Slotermeer-Noordoost)        ',
  'price': '                €1,250 per month            ',
  'area': '54 ',
  'ROOM INFO': '3',
  'ROOMS AVAILABLE': 'roomsShell'},
 {'title': '                        Apartment Krommertstraat                            '

In [25]:
#Database name isApartments
database =client["Apartments"]

In [26]:
print(database)

Database(MongoClient(host=['ac-wnookl7-shard-00-01.diqp9ef.mongodb.net:27017', 'ac-wnookl7-shard-00-02.diqp9ef.mongodb.net:27017', 'ac-wnookl7-shard-00-00.diqp9ef.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-dpdx61-shard-0', tls=True), 'Apartments')


In [27]:
database.AmsterdamApartments.insert_many(df)

<pymongo.results.InsertManyResult at 0x2846869c820>