# iPhone 7 offers scraped from https://allegro.pl

### Scraping data from website content using Selenium and BeautifulSoup

In [1]:
from selenium.webdriver import Chrome, ChromeOptions
from bs4 import BeautifulSoup
from tqdm import tqdm

#### Note: to use the Chrome driver you need to download chromedriver:
https://sites.google.com/a/chromium.org/chromedriver/downloads
#### and then set the path to the downloaded file in the cell below

In [2]:
# Disable image loading in Chrome via the managed content-settings preference.
options = ChromeOptions()
prefs = {'profile.managed_default_content_settings.images': 2}
options.add_experimental_option("prefs", prefs)

# Pass the options through `options=`; the older `chrome_options=` keyword is
# deprecated in Selenium.
# NOTE(review): the chromedriver path is machine-specific — adjust it locally.
driver = Chrome('/Users/sadixd/chromedriver', options=options)

In [3]:
# `page` is the query-string fragment that a page number is appended to;
# `link` is the category listing URL it is appended after.
page = '&p='
link = 'https://allegro.pl/kategoria/apple-iphone-7-256094?order=m&offerTypeBuyNow=1'

# Load the first listing page and parse the HTML the browser ended up with.
driver.get(link)
first_page_html = driver.page_source
soup = BeautifulSoup(first_page_html, 'lxml')

In [4]:
# The text of the pagination anchor marked rel="last" is used as the page count.
# If that anchor is missing, `find` returns None; fall back to a single page
# instead of failing with AttributeError on `None.text`.
last_page_link = soup.find('a', {'rel': 'last'})
number_of_pages = int(last_page_link.text) if last_page_link is not None else 1

In [5]:
# Report how many listing pages the loop below will visit.
print(f'Number of pages to scrape: {number_of_pages}')

Number of pages to scrape: 17


In [6]:
# Accumulator for the scraped offers; one dict is appended per offer.
all_offers = list()

In [7]:
def _parse_offer(offer):
    """Extract the title, price and memory text from one offer ``<article>`` tag.

    Returns a dict with the keys 'Title', 'Price' and 'Memory', or None when
    the tag has no <h2>, no price <span>, or no <dd> entries, so that a single
    malformed tile does not abort the whole scrape.
    """
    title_tag = offer.find('h2')
    # NOTE(review): 'e82f23a' looks like a generated CSS class name and may
    # change between site builds — confirm it still matches before re-running.
    price_tag = offer.find('span', {'class': 'e82f23a'})
    additional_info = offer.find_all('dd')

    if title_tag is None or price_tag is None or not additional_info:
        return None

    return {
        'Title': title_tag.text,
        'Price': price_tag.text,
        # The memory value is read from the last <dd> of the offer.
        'Memory': additional_info[-1].text,
    }


# Visit every listing page (1-based page numbers) and collect its offers.
for page_number in tqdm(range(1, number_of_pages + 1)):
    driver.get(link + page + str(page_number))
    soup = BeautifulSoup(driver.page_source, 'lxml')

    page_offers = soup.find_all('article')

    for offer in page_offers:
        d = _parse_offer(offer)
        if d is not None:
            all_offers.append(d)

100%|██████████| 17/17 [00:25<00:00,  1.49s/it]


In [8]:
# Report the total number of offer records collected across all pages.
print(f'Scraped {len(all_offers)} offers')

Scraped 1019 offers


### Prepare Data

In [9]:
import pandas as pd

In [10]:
# Build the working table from the scraped records (one row per offer dict).
df = pd.DataFrame(data=all_offers)

In [11]:
# Remove rows that are exact duplicates of an earlier row; the original index
# labels of the surviving rows are kept.
df = df.drop_duplicates()

In [12]:
# Inspect row count and column dtypes after de-duplication.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 877 entries, 0 to 1016
Data columns (total 3 columns):
Memory    877 non-null object
Price     877 non-null object
Title     877 non-null object
dtypes: object(3)
memory usage: 27.4+ KB


In [13]:
def _parse_price(raw):
    """Convert a scraped price label to a float.

    Removes the 'zł' marker, strips every whitespace character (including
    non-breaking spaces, which a plain ``replace(' ', '')`` would miss) and
    turns the decimal comma into a dot. Values that are not strings are passed
    through ``float`` unchanged, so re-running the cell does not fail on an
    already-converted column.
    """
    if not isinstance(raw, str):
        return float(raw)
    without_currency = raw.replace('zł', '')
    compact = ''.join(without_currency.split())
    return float(compact.replace(',', '.'))


df['Price'] = df['Price'].apply(_parse_price)
# Keep only offers priced above 500.
# NOTE(review): presumably this drops accessories/parts listed in the category — confirm the threshold.
df = df[df['Price'] > 500]

In [14]:
# Keep only rows whose memory label contains 'GB' exactly once.
has_single_gb = df['Memory'].apply(lambda x: x.count('GB')) == 1
# Take an explicit copy so the column assignment below writes to an independent
# frame rather than to a slice of the previous one (avoids SettingWithCopyWarning).
df = df[has_single_gb].copy()
# Strip the unit and convert to int; int() ignores surrounding whitespace, so
# both '32 GB' and '32GB' are accepted.
df['Memory'] = df['Memory'].apply(lambda x: int(x.replace('GB', '')))
# Keep only rows with a memory value greater than 2.
# NOTE(review): presumably this filters out RAM sizes reported in the same field — confirm.
df = df[df['Memory'] > 2]

In [15]:
# Preview the first rows of the cleaned table.
df.head()

Unnamed: 0,Memory,Price,Title
0,256,2899.0,Apple iPhone 7 256GB Black/Silver/Gold/Rose Gold
1,32,2269.0,iCenter IPHONE 7 32GB 5 KOLORÓW GRTISY GWAR 12M
2,32,2279.0,Apple iPhone 7 32GB 4 Kolory Kurier 24h Gwarancja
3,32,2449.0,Apple iPhone 7 32GB Black Kur24h Gw12m GRATISY
5,128,2649.0,iCenter iPHONE 7 128 GB 5 KOLORÓW GRTISY GWAR 12M


In [16]:
# Re-check row count and dtypes after the Price and Memory conversions.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 815 entries, 0 to 1016
Data columns (total 3 columns):
Memory    815 non-null int64
Price     815 non-null float64
Title     815 non-null object
dtypes: float64(1), int64(1), object(1)
memory usage: 25.5+ KB


In [17]:
# Correlation between the two numeric columns. They are selected explicitly
# because the string 'Title' column makes a bare df.corr() raise on recent
# pandas versions, where non-numeric columns are no longer dropped silently.
df[['Memory', 'Price']].corr()

Unnamed: 0,Memory,Price
Memory,1.0,0.454599
Price,0.454599,1.0


In [18]:
# Summary statistics for the numeric columns of the cleaned table.
df.describe()

Unnamed: 0,Memory,Price
count,815.0,815.0
mean,111.548466,2719.984454
std,81.558,631.304589
min,32.0,1280.0
25%,32.0,2400.0
50%,128.0,2659.0
75%,128.0,2900.0
max,256.0,12797.99
