In [68]:
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint
from collections import defaultdict
import json
import csv

In [5]:
# Collect the product-page URL of every link in the listing grid of the
# refurbished MacBook Pro category page.
r = requests.get(
    "https://www.apple.com/shop/refurbished/mac/macbook-pro",
    timeout=30,  # requests has no default timeout; fail instead of hanging
)
r.raise_for_status()  # stop on an HTTP error rather than parsing an error page
soup = bs(r.content, "html.parser")

ads = soup.find("div", {"class": "refurbished-category-grid-no-js"})
if ads is None:
    # find() returns None when nothing matches; say so explicitly instead of
    # failing later with an AttributeError on None.
    raise RuntimeError("listing grid not found; the page layout may have changed")

# Each href is appended to the site origin (assumes hrefs are site-relative
# paths -- TODO confirm against the live page).
urls = ["https://www.apple.com" + a['href'] for a in ads.find_all('a', href=True)]

len(urls)

165

In [36]:
def get_specs(soup):
    """Collect the tech-spec entries of a product page, grouped by heading.

    Every element matching ``.h4-para-title`` inside the tech-specs panel is
    treated as a category heading; the ``div`` siblings that directly follow
    it (up to the first non-``div`` sibling) are its entries.

    Args:
        soup: parsed product page.

    Returns:
        dict mapping the stripped, lower-cased heading text to a list of
        stripped, lower-cased entry texts. Headings without any entry are
        omitted. An empty dict is returned when the page has no tech-specs
        panel.
    """
    section = soup.find("div", {"class": "as-productinfosection-panel TechSpecs-panel row"})
    if section is None:
        # No tech-specs panel on this page: report "no specs" instead of
        # failing with an AttributeError on None.
        return {}

    specs = defaultdict(list)
    for cat in section.select('.h4-para-title'):
        key = cat.text.strip().lower()
        for item in cat.find_next_siblings():
            if item.name != 'div':
                # The first non-div sibling ends this category's entries.
                break
            specs[key].append(item.text.strip().lower())

    return dict(specs)

In [37]:
def get_price(soup):
    """Return the refurbished price shown on a product page as a float.

    The text of the first ``span`` inside the current-price container is
    used; newlines, surrounding whitespace, ``$`` and ``,`` are removed
    before the conversion.
    """
    price_box = soup.find("div", {"class": "as-price-currentprice as-pdp-currentprice as-pdp-refurbishedprice"})
    first_span = price_box.findAll('span')[0]
    amount = first_span.getText().replace("\n", "").strip()
    # Drop the currency sign and thousands separators so float() can parse it.
    for symbol in ('$', ','):
        amount = amount.replace(symbol, '')
    return float(amount)

In [38]:
def get_date(soup):
    """Return the release-date sentence from a product page's main info panel.

    The panel's ``<p>`` tags are scanned in order and the first one whose
    text contains ``'released'`` is returned with newlines removed, stripped
    and lower-cased.

    Args:
        soup: parsed product page.

    Returns:
        The matching text, or ``""`` when the panel is missing, contains no
        ``<p>`` tags, or none of them contains ``'released'``.
    """
    specs = soup.find("div", {"class": "as-productinfosection-mainpanel column large-9 small-12"})
    if specs is None:
        return ""

    for tag in specs.findAll('p'):
        parsed = tag.getText()
        if 'released' in parsed:
            return parsed.replace("\n", "").strip().lower()

    # Also reached when the panel has no <p> tags at all; the result used to
    # be left unassigned in that case (UnboundLocalError).
    return ""

In [39]:
def get_screen(soup):
    """Return the screen-size sentence from a product page's main info panel.

    The panel's ``<p>`` tags are scanned in order and the first one whose
    text contains ``'-inch'`` (case-insensitive) and does not start with
    ``'http'`` is returned with newlines removed, stripped and lower-cased.

    Args:
        soup: parsed product page.

    Returns:
        The matching text, or ``""`` when the panel is missing, contains no
        ``<p>`` tags, or none of them qualifies.
    """
    specs = soup.find("div", {"class": "as-productinfosection-mainpanel column large-9 small-12"})
    if specs is None:
        return ""

    for tag in specs.findAll('p'):
        parsed = tag.getText()
        # Paragraphs starting with 'http' are skipped even if they contain '-inch'.
        if '-inch' in parsed.lower() and not parsed.startswith('http'):
            return parsed.replace("\n", "").strip().lower()

    # Also reached when the panel has no <p> tags at all; the result used to
    # be left unassigned in that case (UnboundLocalError).
    return ""

In [40]:
def get_details(soup):
    """Assemble the full record for one product page.

    Starts from the dict returned by ``get_specs`` and sets its ``'price'``,
    ``'date'`` and ``'screen'`` entries from the matching helpers, in that
    order.
    """
    details = get_specs(soup)
    details.update(
        price=get_price(soup),
        date=get_date(soup),
        screen=get_screen(soup).strip().lower(),
    )
    return details

In [41]:
# Fetch every product page and collect one record per page that could be
# retrieved. Pages that fail are reported and skipped so the records already
# collected are not lost.
data = []
with requests.Session() as session:  # one session reused for all requests
    for i, url in enumerate(urls):
        try:
            # requests has no default timeout; without one a single stalled
            # connection would hang the whole loop.
            r = session.get(url, timeout=30)
        except requests.RequestException as exc:
            print(i, url, exc)
            continue

        if r.status_code != 200:
            print(i, url, r.status_code)
            continue

        soup = bs(r.content, "html.parser")
        specs = get_details(soup)
        specs['url'] = url
        data.append(specs)

In [42]:
# Save the scraped records to disk as JSON.
with open('refurbished_macs.json', mode='w') as out_file:
    json.dump(obj=data, fp=out_file)

In [43]:
# Rebind `data` to the records stored in the JSON file.
with open('refurbished_macs.json', mode='r') as in_file:
    data = json.load(fp=in_file)

In [63]:
# Flatten each scraped record into the columns exported to CSV, keeping only
# the records whose URL contains 'macbook-pro'.
clean = []
for line in data:
    if 'macbook-pro' not in line['url'].lower():
        continue

    clean.append({
        'url': line['url'],
        # Drop the first two space-separated words of the date sentence.
        'date': ' '.join(line['date'].split(' ')[2:]),
        # Spec categories are lists of strings; a record that lacks one
        # yields an empty field instead of aborting the pass with KeyError.
        # For memory and storage only the first space-separated token of the
        # joined entries is kept.
        'memory': ";".join(line.get('memory', [])).split(' ')[0],
        'storage': ";".join(line.get('storage', [])).split(' ')[0],
        'graphics': ";".join(line.get('graphics', [])),
        'price': line['price'],
    })

len(clean)

104

In [69]:
# Write the cleaned rows to CSV, using the first row's keys as the header.
# newline='' is what the csv module asks for: the writer emits its own line
# endings, and without it platforms that translate '\n' on write (e.g.
# Windows) end up with an extra blank line after every row.
with open('macbook_pro_refurb.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=list(clean[0].keys()))
    writer.writeheader()
    writer.writerows(clean)