# Web Scraping

In this notebook I will extract data regarding mens shoes from 5 different retailers sites and store the realted information in a local database. For further details surrounding the project please see the readme.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import urllib2
from bs4 import BeautifulSoup
from __future__ import division
%matplotlib inline
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import unicodedata
from sqlalchemy import create_engine
import datetime as dt
%load_ext sql

In [2]:
## Create a connection to my local database.
engine=create_engine('postgresql://localhost:5432/capstone_project')
c=engine.connect()
conn=c.connection

In [3]:
## This modifies the request that urllib sends to the websites when extracting information, as some websites will 
## block any programs that do not fit the profile of a normal web browser.
from urllib import FancyURLopener
class MyOpener(FancyURLopener):
    version="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36\
                (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36"
myopener = MyOpener()

I am going to extract the following information from each of the sites:
- Shoe Brand
- Shoe Description
- Shoe Price
- Main Image URL
- Alternative Image URL

I will attempt to scrape every single mens shoes of each of the 5 websites, intially I will create a function customised to each of the sites, and then I will run theses function for the desired page count, convert the data to dataframes and then store them on a local database that has been set up in Postico.

In [4]:
##MATCHES FASION
def Matches_Scrape(pages):
    Brand_List=[]
    Description_List=[]
    Price_List=[]
    Image_URL_List=[]
    Image_URL_List_Alt=[]
    for a in range(1,pages):
        url=''.join(['http://www.matchesfashion.com/mens/shop/shoes?page=',str(a),'&noOfRecordsPerPage=240&sort='])
        page = myopener.open(url)
        soup = BeautifulSoup(page)
        
        ## Brand Data
        Brand_Data=soup.find_all('div',{'class':'lister__item__title'})
        for x in Brand_Data:
            Brand_List.append(x.text)
            
        ## Description Data
        Description_Data=soup.find_all('div',{'class':'lister__item__details'})
        for x in Description_Data:
            Description_List.append(x.text)
        
        ## Price Data
        Price_Data=soup.find_all('div',{'class':'lister__item__price'})
        for x in Price_Data:
            s=x.text.strip('\n')
            Price_List.append(unicodedata.normalize('NFKD', s).encode('ascii','ignore'))   
        
        ## Image Data
        ##URL number changes 1-2 for alt image
        Image_Data=soup.find_all('img',{'class':'lazy'})
        for x in Image_Data:
            s=''.join(['http:',x['data-original']])
            Image_URL_List.append(s)
            s2=s.replace('_1_','_2_')
            Image_URL_List_Alt.append(s2)
            
        time.sleep(5)
            
    return pd.DataFrame({'brand':Brand_List,
                        'description':Description_List,
                        'price':Price_List,
                        'image_url':Image_URL_List,
                        'image_url_alt':Image_URL_List_Alt,
                        'retailer':'Matches_Fashion',
                        'date':dt.datetime.today().strftime("%Y/%m/%d")})

In [5]:
##NET-A-PORTER
def NAP_Scrape(pages):
    Brand_List=[]
    Description_List=[]
    Price_List=[]
    Image_URL_List=[]
    Image_URL_List_Alt=[]
    for a in range(1,pages):
        url=''.join(['https://www.mrporter.com/en-gb/mens/shoes?pn=',str(a)])
        browser=webdriver.Firefox()
        browser.get(url)
        time.sleep(1)

        elem = browser.find_element_by_tag_name("body")
        no_of_pagedowns = 20

        while no_of_pagedowns:
            elem.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.2)
            no_of_pagedowns-=1

        page_source=browser.page_source
        soup = BeautifulSoup(page_source)
        
        ## Brand Data
        Brand_Data=soup.find_all('span',{'class':'pl-products-item__text pl-products-item__text--brand pl-products-item__text--upper'})
        for x in Brand_Data:
            Brand_List.append(x.text)
            
        ## Description Data
        Description_Data=soup.find_all('span',{'class':'pl-products-item__text pl-products-item__text--name'})
        for x in Description_Data:
            Description_List.append(x.text)        
        
        ## Price Data
        Price_Data=soup.find_all('span',{'class':'pl-products-item__text pl-products-item__text--price'})
        for x in Price_Data:
            s=x.text.strip('\n')
            s=s.replace(',','')
            Price_List.append(unicodedata.normalize('NFKD', s).encode('ascii','ignore'))   
        
        ## Image Data
        ##URL number changes fr-e2 for alt image
        Image_Data=soup.find_all('div',{'class':'pl-products-item__img pl-products-item__spacing'})
        for x in Image_Data:
            s=''.join(['http:',x.img['src']])
            Image_URL_List.append(s)
            s2=s.replace('_fr_','_e2_')
            Image_URL_List_Alt.append(s2)
            
        browser.quit()
        time.sleep(5)
            
    return pd.DataFrame({'brand':Brand_List,
                        'description':Description_List,
                        'price':Price_List,
                        'image_url':Image_URL_List,
                        'image_url_alt':Image_URL_List_Alt,
                        'retailer':'Net_A_Porter',
                        'date':dt.datetime.today().strftime("%Y/%m/%d")})

In [6]:
##FARFETCH
def Farfetch_Scrape(pages):
    Brand_List=[]
    Description_List=[]
    Price_List=[]
    Image_URL_List=[]
    Image_URL_List_Alt=[]
    
    for a in range(1,pages):
        url=''.join(['https://www.farfetch.com/uk/shopping/men/shoes-2/items.aspx?ffref=hd_snav&view=180&page=',str(a)])
        page = myopener.open(url)
        soup = BeautifulSoup(page)
        
        ##Brand Data
        Brand_Data=soup.find_all('h5',{'class':'listing-item-content-brand'})
        for x in Brand_Data:
            Brand_List.append(x.text)
            
        ## Description Data
        Description_Data=soup.find_all('p',{'class':'listing-item-content-description'})
        for x in Description_Data:
            Description_List.append(x.text)        
        
        ## Price Data
        Price_Data=soup.find_all('span',{'class':'listing-item-content-price'})
        for x in Price_Data:
            s=x.text.strip('\n')
            s=s.replace(',','')
            Price_List.append(unicodedata.normalize('NFKD', s).encode('ascii','ignore'))   
        
        ## Image Data
        Image_Data=soup.find_all('img',{'itemprop':'image'})
        for x in Image_Data:
            Image_URL_List.append(x['data-img'])
            Image_URL_List_Alt.append(x['data-img-alt'])
    
    time.sleep(5)
                        
    return pd.DataFrame({'brand':Brand_List,
                        'description':Description_List,
                        'price':Price_List,
                        'image_url':Image_URL_List,
                        'image_url_alt':Image_URL_List_Alt,
                        'retailer':'Farfetch',
                        'date':dt.datetime.today().strftime("%Y/%m/%d")})


Interestingly Harrods has all of the shoes on the sight on one very long page, I therefore do not need to pass in a page number variable.

In [7]:
##HARRODS
def Harrods_Scrape():
    Brand_List=[]
    Description_List=[]
    Price_List=[]
    Image_URL_List=[]
    Image_URL_List_Alt=[]
    
    url='http://www.harrods.com/shoes/men-all-shoes?sort=0&viewall=yes'
    page = myopener.open(url)
    soup = BeautifulSoup(page)
    
    
    ## First Row Data
    First_Row_Data=soup.find_all('ul',{'class':'products_row clearfix top '})
    for y in First_Row_Data:
        
        ## Element Data
        Element_Data= y.find_all('li')
        for x in Element_Data:
            
            ## Brand Data
            Brand_List.append(x.h3.span.text)
            
            ## Description Data
            Description_List.append(x.h3('span',{'class':'product-name'})[0].text)
            
            ## Price Data
            s=x('span',{'class':'price_all plp_price'})[0].text
            Price_List.append(unicodedata.normalize('NFKD', s).encode('ascii','ignore'))
            
            ## Image Data
            Image_URL_List.append(x.find_all('img')[1]['src'])
            Image_URL_List_Alt.append(x.find_all('img')[1]['data-hover'])
    
    ## Sections Data
    Sections_Data=soup.find_all('ul',{'class':'products_row'})
    
    for y in Sections_Data:
        ## Element Data
        Element_Data= y.find_all('li')
        for x in Element_Data:
            
            ## Brand Data
            Brand_List.append(x.h3.span.text)
            
            ## Description Data
            Description_List.append(x.h3('span',{'class':'product-name'})[0].text)
            
            ## Price Data
            s=x('span',{'class':'price_all plp_price'})[0].text
            Price_List.append(unicodedata.normalize('NFKD', s).encode('ascii','ignore'))
            
            ## Image Data
            Image_URL_List.append(x.find_all('img')[1]['src'])
            Image_URL_List_Alt.append(x.find_all('img')[1]['data-hover'])
    
    time.sleep(5)
            
    return pd.DataFrame({'brand':Brand_List,
                        'description':Description_List,
                        'price':Price_List,
                        'image_url':Image_URL_List,
                        'image_url_alt':Image_URL_List_Alt,
                        'retailer':'Harrods',
                        'date':dt.datetime.today().strftime("%Y/%m/%d")})

In [27]:
##SAKSFIFTHAVENUE - specifc URLs (0,180,360,540,720,900,1080,1260,1440,1620,1800)
def Saks_Scrape(pages):
    Brand_List=[]
    Description_List=[]
    Price_List=[]
    Image_URL_List=[]
    Image_URL_List_Alt=[]
    Image_URL_List_Alt1=[]
    Image_URL_List_Alt2=[]
    
    
    for p in pages:
        url=''.join(['http://www.saksfifthavenue.com/Men/Shoes/shop/_/N-52flst/Ne-6lvnb5?FOLDER%3C%3Efolder_id=2534374306418205&Nao=',str(p)])
        page = myopener.open(url)
        soup = BeautifulSoup(page)
        
        ## Brand Data
        Brand_Data=soup.find_all('span',{'class':'product-designer-name'})
        for x in Brand_Data:
            Brand_List.append(x.text)
            
        ## Description Data
        Description_Data=soup.find_all('p',{'class':'product-description'})
        for x in Description_Data:
            Description_List.append(x.text)        
        
        ## Price Data
        Price_Data=soup.find_all('span',{'class':'product-price'})
        for x in Price_Data:
            s=x.text.strip('\n')
            s=unicodedata.normalize('NFKD', s).encode('ascii','ignore')
            s= ''.join(c for c in s if c not in ['\n','\t',' ','G','B','P','W','a','s'])
            Price_List.append(s)  
        
        ## Image Data
        Image_Data=soup.find_all('div',{'class':'image-container-large'})
        for x in Image_Data:
            Temp= x.img['params'].split(',')
            Image_URL_List.append(Temp[1])     
            Image_URL_List_Alt.append(Temp[2])
                
        time.sleep(5)
                        
    return pd.DataFrame({'brand':Brand_List,
                        'description':Description_List,
                        'price':Price_List,
                        'image_url':Image_URL_List,
                        'image_url_alt':Image_URL_List_Alt,
                        'retailer':'Saks_Fifth_Avenue',
                        'date':dt.datetime.today().strftime("%Y/%m/%d")})

In [9]:
## Scrape the data from 5 pages on the Matches Fashion website.
Matches_Df=Matches_Scrape(5)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html.parser")

  markup_type=markup_type))


In [11]:
## Store the data scrapped from Matches website to the local database.
Matches_Df.to_sql('mens_shoes',engine,if_exists='append',index=False)

In [12]:
## Scrape shoes from 17 pages on the Mr. Porter website.
NAP_Df=NAP_Scrape(17)

'NoneType' object has no attribute 'path'
'NoneType' object has no attribute 'path'
'NoneType' object has no attribute 'path'
'NoneType' object has no attribute 'path'
'NoneType' object has no attribute 'path'
'NoneType' object has no attribute 'path'
'NoneType' object has no attribute 'path'
'NoneType' object has no attribute 'path'
'NoneType' object has no attribute 'path'
'NoneType' object has no attribute 'path'
'NoneType' object has no attribute 'path'
'NoneType' object has no attribute 'path'
'NoneType' object has no attribute 'path'
'NoneType' object has no attribute 'path'
'NoneType' object has no attribute 'path'
'NoneType' object has no attribute 'path'


In [13]:
## Store the data scrapped from Mr. Porter website to the local database.
NAP_Df.to_sql('mens_shoes',engine,if_exists='append',index=False)

In [14]:
## Scrape shoes from 17 pages on the Farfetch website.
Farfetch_Df=Farfetch_Scrape(41)

In [15]:
## Store the data scrapped from Farfetch website to the local database.
Farfetch_Df.to_sql('mens_shoes',engine,if_exists='append',index=False)

In [16]:
## Scrape shoes from 17 pages on the Harrods website.
Harrods_Df=Harrods_Scrape()

In [17]:
## Store the data scrapped from Harrods website to the local database.
Harrods_Df.to_sql('mens_shoes',engine,if_exists='append',index=False)

In [31]:
## For the saks fifth avenue site, specific urls are required, I will do this in two sets incase there are any issuse
## I wont lose all of the scrapped data.
Pagination_List=range(0,1021,60)
Pagination_List2=range(1080,1561,60)

In [33]:
## Scrape the data from Saks website (complete x2 with the two lists)
Saks_Df=Saks_Scrape(Pagination_List2)

In [34]:
## Store the data scrapped from Saks website to the local database.
Saks_Df.to_sql('mens_shoes',engine,if_exists='append',index=False)

I have also added the locations of the details on some other ecommerce websites below in the case I would like to expand my dataset.

In [29]:
##ASOS
url='http://www.asos.com/men/shoes-boots-trainers/cat/?cid=4209&pge=0&pgesize=204'
Brand_Data=soup.find_all('div',{'class':'name-fade'})
#Description_Data=soup.find_all('p',{'class':'product-description'})
Price_Data=soup.find_all('div',{'class':'price-wrap price-current'})

In [44]:
##BOOHOO
url='http://www.boohoo.com/restofworld/shoes/icat/mens-footwear#esp_hitsperpage=80'
Brand_Data=soup.find_all('h3',{'class':'prod-name'})
#Description_Data=soup.find_all('p',{'class':'product-description'})
Price_Data=soup.find_all('div',{'class':'price-wrap price-current'})