# Web scraping project for laptop hard drives made by Western Digital on an American retailer NewEgg

## Installing some packages for the project

In [1]:
# library for processing XML and HTML in Python

In [41]:
pip install lxml

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [42]:
# a library that parses HMTL

In [43]:
pip install bs4

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


## Importing the libraries that are required

In [192]:
#importint all the libraries
import requests
import bs4
import lxml
import re
import pandas as pd
import csv

## Creating base_url

In [224]:
# after filtering all the disks that I want to scrape, I copy the url path and replace page number with {}
base_url = 'https://www.newegg.com/p/pl?N=100167524%2050001306&PageSize=36&page={}'

In [244]:
# format method is used to replace {} into something we want, in out case - number of pages.
base_url.format('')

'https://www.newegg.com/p/pl?N=100167524%2050001306&PageSize=36&page='

## Saving res cell

In [245]:
#requests.get() is a function from the requests library used to send an HTTP GET request. in this case the first page
res = requests.get(base_url.format(2))

In [246]:
# we create a new variable with a bs4 object, by passing the res.text and html.parser which is a built in bs4 html parser
soup = bs4.BeautifulSoup(res.text, 'lxml')

In [247]:
# here we see how many parsed items returned after parsing
len(soup.select('.item-info'))

36

In [248]:
# We see the first parsed objecct turned into string with all the information that we need
string = soup.select('.item-info')[1].getText()

In [250]:
# The information of the very first item iteself
string

'(81)Used - Like New WD Blue WD3200LPVX 320GB 5400 RPM 8MB Cache SATA 6.0Gb/s 2.5" Internal Notebook Hard Drive Bare DriveAverage Latency: 5.5msHeight (maximum): 7mmWidth (maximum): 69.85mmLength (maximum): 100.20mmModel #: WD3200LPVXItem #: 9SIAAEEJY96325Return Policy: View Return Policy'

## It's time to make re and check if everything works for retreiving data

## Applying regular expression for every piece of data required

In [251]:
# .findall() is used to find all not overlapping matches of the pattern

In [252]:
# it retrieves a space+WD+6 any symbols or WD and 8 any symbols
name = re.findall('\s(WD.{6}|WD.{7}|WD.{8})[IR]', string)[0]
name

'WD3200LPVX'

In [253]:
# this pattern matches a digit and TB or 3 digits and GB
capacity = re.findall('\dTB|\d{2}GB|\d{3}GB', string)[0]
capacity

'320GB'

In [254]:
# this pattern matches 4 digists, a space and 3 non digit characters, or 5 digits, a space and 3 non Digit characters
speed = re.findall('\d{4}\s\D{3}|\d{5}\s\D{3}',string)[0]
speed

'5400 RPM'

In [255]:
# this pattern matches a space, a digit, a dot, a digit and then either a space or a non word carachter
size = re.findall('\s\d\.\d[\s|\W]',string)[0].strip()
size

'2.5"'

In [256]:
string1 = soup.select('.price-current')[0].getText()
string1

'$22.10\xa0–'

In [258]:
# \A asserts the beginning of the string and then 6 any characters
money = re.findall('\A.{6}', string1)[0]
money

'$22.10'

## So looks like everythiing works. It's time to scrape through all the pages

In [259]:
hard_disk_data = []
for n in range(1,4):
    scrape_url = base_url.format(n)
    res = requests.get(scrape_url)

    soup = bs4.BeautifulSoup(res.text, 'lxml')
    for i in range(0, len(soup.select('.item-info'))):
        first_data = []
        disk_info = soup.select('.item-info')[i].getText()
        disk_price = soup.select('.price-current')[i].getText()
        
        try: 
            name = re.findall('\s(WD.{6}|WD.{8})[IR]', disk_info)[0]
        except:
            pass
        try:
            capacity = re.findall('\dTB|\d{3}GB', disk_info)[0]
        except:
            pass
        try:
            speed = re.findall('\d{4}\s\D{3}|\d{5}\s\D{3}',disk_info)[0]
        except:
            pass
        
        try:
            size = re.findall('\s\d\.\d[\s|\W]',disk_info)[0].strip()
        except:
            pass
        
        try:
            money = re.findall('\A.{6}', disk_price)[0]
        except:
            pass
        
        hard_disk_data.append([name, capacity, speed, size, money])

## Lets check what we got

In [260]:
hard_disk_data

[['WD10SPSX', '1TB', '7200 RPM', '2.5', '$49.99'],
 ['WD20SPZX', '2TB', '5400 RPM', '2.5"', '$73.99'],
 ['WD5000LPSX', '500GB', '7200 RPM', '2.5"', '$29.99'],
 ['WD10SPZX', '1TB', '5400 RPM', '2.5"', '$51.34'],
 ['WD5000LPLX', '500GB', '7200 RPM', '2.5"', '$38.00'],
 ['WD5000LPCX', '500GB', '5400 RPM', '2.5"', '$33.99'],
 ['WD1000CHTZ', '1TB', '10000 RPM', '2.5"', '$54.95'],
 ['WD7500BPVX', '750GB', '5400 RPM', '2.5', '$65.20'],
 ['WD7500BPVX', '750GB', '5400 RPM', '2.5"', '$98.76'],
 ['WD10JUCT', '1TB', '5400 RPM', '2.5-', '$40.00'],
 ['WD5000BHTZ', '500GB', '10000 RPM', '2.5"', '$98.40'],
 ['WD7500BPVT', '750GB', '5400 RPM', '2.5"', '$56.99'],
 ['WD2500BPVT', '250GB', '5400 RPM', '2.5"', '$51.99'],
 ['WD10SPCX', '1TB', '5400 RPM', '2.5"', '$74.75'],
 ['WD10SPCX', '640GB', '5400 RPM', '2.5"', '$69.95'],
 ['WD2500LPLX', '250GB', '7200 RPM', '2.5"', '$45.00'],
 ['WD9001BKHG', '900GB', '10000 RPM', '2.5"', '$94.99'],
 ['WD9001BKHG', '1TB', '10000 RPM', '6.0', '$33.50'],
 ['WD3001BKHG', '

In [261]:
len(hard_disk_data)

88

## Looks like we scraped through everything and collected all the required information
## So now lets convert our data into a data frame and take a look at it

In [262]:
df = pd.DataFrame(hard_disk_data, columns=['Model', 'Capacity', 'Speed', 'Size', 'Price'])
df

Unnamed: 0,Model,Capacity,Speed,Size,Price
0,WD10SPSX,1TB,7200 RPM,2.5,$49.99
1,WD20SPZX,2TB,5400 RPM,"2.5""",$73.99
2,WD5000LPSX,500GB,7200 RPM,"2.5""",$29.99
3,WD10SPZX,1TB,5400 RPM,"2.5""",$51.34
4,WD5000LPLX,500GB,7200 RPM,"2.5""",$38.00
...,...,...,...,...,...
83,WD1600BVVT,160GB,5400 RPM,"2.5""",$218.5
84,WD10JPVT,160GB,5400 RPM,"2.5""",$128.5
85,WD5000LPVX,500GB,5400 RPM,"2.5""",$26.19
86,WD15NPVT,5TB,5400 RPM,"2.5""",$450.0


## Lets save our data with pandas

In [263]:
df.to_csv('NewEggScraping.csv')

In [264]:
df1 = pd.read_csv('NewEggScraping.csv')
df1.pop('Unnamed: 0')
df1

Unnamed: 0,Model,Capacity,Speed,Size,Price
0,WD10SPSX,1TB,7200 RPM,2.5,$49.99
1,WD20SPZX,2TB,5400 RPM,"2.5""",$73.99
2,WD5000LPSX,500GB,7200 RPM,"2.5""",$29.99
3,WD10SPZX,1TB,5400 RPM,"2.5""",$51.34
4,WD5000LPLX,500GB,7200 RPM,"2.5""",$38.00
...,...,...,...,...,...
83,WD1600BVVT,160GB,5400 RPM,"2.5""",$218.5
84,WD10JPVT,160GB,5400 RPM,"2.5""",$128.5
85,WD5000LPVX,500GB,5400 RPM,"2.5""",$26.19
86,WD15NPVT,5TB,5400 RPM,"2.5""",$450.0
