### Import all web scraping libraries

In [23]:
import requests
from bs4 import BeautifulSoup

### Find number of pages

In [26]:
# Request the first results page of the Warsaw flats-for-sale search
# (ordered by ascending price, 72 ads per page).
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
r = requests.get("https://www.otodom.pl/sprzedaz/mieszkanie/warszawa/?dist=0&subregion_id=197&city_id=26&order=filter_float_price%3Aasc&nrAdsPerPage=72&search%5Border%5D=filter_float_price%3Aasc", timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "lxml")

# The text of the <strong class="current"> element is used as the page count.
# Fail with an explicit message if the element is missing (e.g. the page layout changed).
pager_label = soup.find("strong", {"class": "current"})
if pager_label is None:
    raise RuntimeError('Element <strong class="current"> not found on the results page')
number_of_pages = int(pager_label.text)
number_of_pages

683

### Scrape all the information we need
#### *If you have problems with scraping or a slow internet connection, use the included CSV file to load the dataframe

In [3]:
# Collect one dict per listing (keys: Area, Location, Price, Rooms) from every results page.
# NOTE: this performs `number_of_pages` HTTP requests, so the cell can take a long time.
base_url = "https://www.otodom.pl/sprzedaz/mieszkanie/warszawa/?dist=0&subregion_id=197&city_id=26&order=filter_float_price%3Aasc&nrAdsPerPage=72&search%5Border%5D=filter_float_price%3Aasc&page="
labels = ['Area', 'Location', 'Price', 'Rooms']

l = []
for page_number in range(1, number_of_pages + 1):
    # The timeout makes a stalled connection raise instead of blocking the whole scrape;
    # raise_for_status() stops on an HTTP error rather than silently contributing no rows.
    r = requests.get(base_url + str(page_number), timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "lxml")

    appartments = soup.find_all("div", {"class": "offer-item-details"})

    for appartment in appartments:
        # attrs has to be a dict: a set such as {"class", "offer-item-area"} would be
        # treated by BeautifulSoup as a list of alternative class names.
        area = appartment.find("li", {"class": "offer-item-area"})
        location = appartment.find("p", {"class": "text-nowrap"})
        price = appartment.find("li", {"class": "offer-item-price"})
        rooms = appartment.find("li", {"class": "offer-item-rooms"})

        attrs = [area, location, price, rooms]
        # Keep the tag text, or None when the listing has no such element.
        d = {label: (attr.text if attr else None) for attr, label in zip(attrs, labels)}

        l.append(d)

len(l)

45173

### Import Pandas and Numpy libraries

In [3]:
import pandas as pd
import numpy as np

### Create a dataframe with our scraped data

In [32]:
# Build the frame from the scraped records, then discard exact duplicate rows
# and rows in which any field is missing.
df = pd.DataFrame(l).drop_duplicates().dropna()
df.head(10)

Unnamed: 0,Area,Location,Price,Rooms
0,31 m²,"Mieszkanie na sprzedaż: Warszawa, Wola",\n ...,2 pokoje
1,46 m²,"Mieszkanie na sprzedaż: Warszawa, Wola, Wolska",\n ...,2 pokoje
2,"50,88 m²",Mieszkanie na sprzedaż: Warszawa,\n ...,2 pokoje
3,"64,49 m²","Mieszkanie na sprzedaż: Warszawa, Wola, Wawrz...",\n ...,3 pokoje
4,"46,10 m²","Mieszkanie na sprzedaż: Warszawa, Białołęka, ...",\n ...,2 pokoje
5,"63,10 m²","Mieszkanie na sprzedaż: Warszawa, Białołęka, D...",\n ...,3 pokoje
6,"99,40 m²","Mieszkanie na sprzedaż: Warszawa, Ochota, Bitw...",\n ...,4 pokoje
7,"163,26 m²","Mieszkanie na sprzedaż: Warszawa, Wola, Marci...",\n ...,6 pokoi
8,"133,77 m²","Mieszkanie na sprzedaż: Warszawa, Śródmieście,...",\n ...,3 pokoje
9,1 m²,"Mieszkanie na sprzedaż: Warszawa, Ursynów, Ko...",\n ...,1 pokój


In [33]:
# Print the frame's index range, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27719 entries, 0 to 43721
Data columns (total 4 columns):
Area        27719 non-null object
Location    27719 non-null object
Price       27719 non-null object
Rooms       27719 non-null object
dtypes: object(4)
memory usage: 1.1+ MB


### Prepare data

In [34]:
def _parse_price(raw):
    """Turn a scraped price string into a float.

    Removes newlines, spaces, the 'zł' marker and '~' characters, then swaps
    the decimal comma for a dot before converting.
    """
    for junk in ('\n', ' ', 'zł', '~'):
        raw = raw.replace(junk, '')
    return float(raw.replace(',', '.'))


df['Price'] = df['Price'].apply(_parse_price)

In [35]:
# Preview the first ten parsed prices.
df['Price'].head(10)

0     230000.00
1     306000.00
2     359661.00
3     542304.00
4     296245.98
5     390000.00
6    1099000.00
7       7500.00
8      13549.00
9      20000.00
Name: Price, dtype: float64

In [36]:
# Extract the room count from labels such as "2 pokoje" or "6 pokoi".
# The whole leading number is parsed: taking only the first character would
# turn a multi-digit count such as "12" into 1.
rooms_label = df['Rooms'].str.strip()
room_count = rooms_label.str.extract(r'^(\d+)', expand=False)
# Labels that start with '>' are recorded as 10 rooms.
room_count = room_count.mask(rooms_label.str.startswith('>', na=False), '10')
# Explicit int64 so the dtype does not depend on the platform's default int size.
df['Rooms'] = room_count.astype('int64')

In [37]:
# Preview the first ten parsed room counts.
df['Rooms'].head(10)

0    2
1    2
2    2
3    3
4    2
5    3
6    4
7    6
8    3
9    1
Name: Rooms, dtype: int64

In [38]:
def _parse_area(raw):
    """Convert a scraped area label (e.g. '50,88 m²') into a float.

    Drops the 'm²' unit, uses a dot as the decimal separator and removes spaces.
    """
    cleaned = raw.replace('m²', '').replace(',', '.').replace(' ', '')
    return float(cleaned)


df['Area'] = df['Area'].apply(_parse_area)

In [39]:
# Preview the first ten parsed areas.
df['Area'].head(10)

0     31.00
1     46.00
2     50.88
3     64.49
4     46.10
5     63.10
6     99.40
7    163.26
8    133.77
9      1.00
Name: Area, dtype: float64

In [7]:
# Remove the 'Mieszkanie na sprzedaż: ' text from every location string.
listing_prefix = 'Mieszkanie na sprzedaż: '
df['Location'] = df['Location'].map(lambda place: place.replace(listing_prefix, ''))

In [21]:
from geopy.geocoders import Nominatim

# Nominatim's usage policy requires an application-specific user agent; recent
# geopy versions refuse to run with the default one.
geolocator = Nominatim(user_agent="warsaw-apartment-notebook")
# Geocode one sample district and show its coordinates and resolved address.
location = geolocator.geocode("Warszawa, Śródmieście")
print(location.latitude, location.longitude)
location.address

52.2328098 21.019067


'Śródmieście, Warszawa, mazowieckie, Polska'

In [13]:
# Preview the first ten cleaned location strings.
df['Location'].head(10)

Unnamed: 0.1,Unnamed: 0,Area,Location,Price,Rooms
0,0,31.00,"Warszawa, Wola",230000.00,2
1,1,46.00,"Warszawa, Wola, Wolska",306000.00,2
2,2,50.88,Warszawa,359661.00,2
3,3,64.49,"Warszawa, Wola, Wawrzyszewska",542304.00,3
4,4,46.10,"Warszawa, Białołęka, Aleksandra Kamińskiego 1",296245.98,2
5,5,63.10,"Warszawa, Białołęka, Derby 1, Skarbka z gór",390000.00,3
6,6,99.40,"Warszawa, Ochota, Bitwy Warszawskiej 1920 r.",1099000.00,4
7,7,163.26,"Warszawa, Wola, Marcina Kasprzaka",7500.00,6
8,8,133.77,"Warszawa, Śródmieście, Próżna",13549.00,3
9,9,1.00,"Warszawa, Ursynów, Komisji Edukacji Narodowej",20000.00,1


In [42]:
# Re-check the column dtypes and non-null counts after the conversions.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27719 entries, 0 to 43721
Data columns (total 4 columns):
Area        27719 non-null float64
Location    27719 non-null object
Price       27719 non-null float64
Rooms       27719 non-null int64
dtypes: float64(2), int64(1), object(1)
memory usage: 1.1+ MB


In [4]:
# Save the frame to disk; by default pandas writes the index as the first CSV column.
df.to_csv('Warsaw_appartment_df')

NameError: name 'df' is not defined

In [17]:
# Keep only the listings whose area is below 300.
is_below_area_cap = df['Area'].lt(300)
df = df.loc[is_below_area_cap]
df.head(10)

Unnamed: 0.1,Unnamed: 0,Area,Location,Price,Rooms
0,0,31.0,"Warszawa, Wola",230000.0,2
1,1,46.0,"Warszawa, Wola, Wolska",306000.0,2
2,2,50.88,Warszawa,359661.0,2
3,3,64.49,"Warszawa, Wola, Wawrzyszewska",542304.0,3
4,4,46.1,"Warszawa, Białołęka, Aleksandra Kamińskiego 1",296245.98,2
5,5,63.1,"Warszawa, Białołęka, Derby 1, Skarbka z gór",390000.0,3
6,6,99.4,"Warszawa, Ochota, Bitwy Warszawskiej 1920 r.",1099000.0,4
7,7,163.26,"Warszawa, Wola, Marcina Kasprzaka",7500.0,6
8,8,133.77,"Warszawa, Śródmieście, Próżna",13549.0,3
9,9,1.0,"Warszawa, Ursynów, Komisji Edukacji Narodowej",20000.0,1


TODO:

- Add longitude/latitude columns based on location


### You can load the dataframe from the CSV file here if you have problems with scraping

In [20]:
# Load the previously saved frame instead of scraping; this replaces any `df`
# already in memory. index_col=0 restores the saved index rather than reading
# it as an extra 'Unnamed: 0' column.
df = pd.read_csv('Warsaw_appartment_df', index_col=0)
# Show a short preview instead of rendering the whole frame.
df.head(10)

Unnamed: 0,Area,Location,Price,Rooms
0,31.00,"Warszawa, Wola",230000.00,2
1,46.00,"Warszawa, Wola, Wolska",306000.00,2
2,50.88,Warszawa,359661.00,2
3,64.49,"Warszawa, Wola, Wawrzyszewska",542304.00,3
4,46.10,"Warszawa, Białołęka, Aleksandra Kamińskiego 1",296245.98,2
5,63.10,"Warszawa, Białołęka, Derby 1, Skarbka z gór",390000.00,3
6,99.40,"Warszawa, Ochota, Bitwy Warszawskiej 1920 r.",1099000.00,4
7,163.26,"Warszawa, Wola, Marcina Kasprzaka",7500.00,6
8,133.77,"Warszawa, Śródmieście, Próżna",13549.00,3
9,1.00,"Warszawa, Ursynów, Komisji Edukacji Narodowej",20000.00,1


### Import all Data-Visualisation libraries

In [45]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [1]:
# Scatter plot of Area against Price with a fitted regression line.
ax = sns.regplot(data=df, x='Price', y='Area')
ax.figure.tight_layout()

NameError: name 'sns' is not defined