# Web scraping project for laptop hard drives made by Western Digital on an American retailer NewEgg

## Installing some packages for the project

In [21]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [22]:
pip install bs4

Note: you may need to restart the kernel to use updated packages.


## Importing the libraries that are required

In [1]:
import requests
import bs4
import lxml
import re
import pandas as pd
import csv

## Creating base_url

In [2]:
base_url = 'https://www.newegg.com/p/pl?PageSize=36&N=100167524%2050001306&page={}'

In [3]:
base_url.format('')

'https://www.newegg.com/p/pl?PageSize=36&N=100167524%2050001306&page='

## Saving res cell

In [4]:
res = requests.get(base_url.format(3))

In [5]:
soup = bs4.BeautifulSoup(res.text, 'html.parser')

In [6]:
len(soup.select('.item-info'))

18

In [29]:
string = soup.select('.item-info')[10].getText()

In [30]:
string

'(2)Refurbished WD WD VelociRaptor WD4500BLHX 450GB 10000 RPM 32MB Cache SATA 6.0Gb/s 2.5" Internal Enterprise Hard Drive Bare DriveModel #: WD4500BLHXItem #: 9SIB85DJA16516Return Policy: View Return Policy'

## It's time to make re and check if everything works for retreiving data

## Applying regular expression for every required piece of data

In [9]:
name = re.findall('\s(WD......|WD........)[IR]', string)
name

['WD5000LPVX']

In [10]:
capacity = re.findall('\dTB|\d\d\dGB', string)[0]
capacity

'500GB'

In [11]:
speed = re.findall('\d\d\d\d\s\D\D\D|\d\d\d\d\d\s\D\D\D',string)[0]
speed

'5400 RPM'

In [12]:
size = re.findall('\s\d\.\d[\s|\W]',string)[0]
size

' 2.5"'

In [13]:
string1 = soup.select('.price-current')[0].getText()
string1

'$143.18\xa0–'

In [14]:
money = re.findall('\A......', string1)[0]
money

'$143.1'

## So looks like everythiing works. It's time to scrape through all the pages

In [15]:
hard_disk_data = []
for n in range(1,4):
    scrape_url = base_url.format(n)
    res = requests.get(scrape_url)

    soup = bs4.BeautifulSoup(res.text, 'lxml')
    for i in range(0, len(soup.select('.item-info'))):
        first_data = []
        disk_info = soup.select('.item-info')[i].getText()
        disk_price = soup.select('.price-current')[i].getText()
        
        try: 
            name = re.findall('\s(WD......|WD........)[IR]', disk_info)[0]
        except:
            pass
        try:
            capacity = re.findall('\dTB|\d\d\dGB', disk_info)[0]
        except:
            pass
        try:
            speed = re.findall('\d\d\d\d\s\D\D\D|\d\d\d\d\d\s\D\D\D',disk_info)[0]
        except:
            pass
        
        try:
            size = re.findall('\s\d\.\d[\s|\W]',disk_info)[0]
        except:
            pass
        
        try:
            money = re.findall('\A......', disk_price)[0]
        except:
            pass
        
        first_data.append(name)
        first_data.append(capacity)
        first_data.append(speed)
        first_data.append(size)
        first_data.append(money)
        
        hard_disk_data.append(first_data)

## Lets check what we got

In [16]:
hard_disk_data

[['WD20SPZX', '2TB', '5400 RPM', ' 2.5"', '$73.99'],
 ['WD5000LPSX', '500GB', '7200 RPM', ' 2.5"', '$38.99'],
 ['WD10SPSX', '1TB', '7200 RPM', ' 2.5 ', '$73.99'],
 ['WD5000BHTZ', '500GB', '10000 RPM', ' 2.5"', '$98.34'],
 ['WD5000LPCX', '500GB', '5400 RPM', ' 2.5"', '$34.50'],
 ['WD1000CHTZ', '1TB', '10000 RPM', ' 2.5"', '$54.95'],
 ['WD10SPZX', '1TB', '5400 RPM', ' 2.5"', '$59.38'],
 ['WD7500BPVX', '750GB', '5400 RPM', ' 2.5 ', '$61.84'],
 ['WD7500BPVX', '750GB', '5400 RPM', ' 2.5"', '$98.76'],
 ['WD5000LPLX', '500GB', '7200 RPM', ' 2.5"', '$40.00'],
 ['WD10JUCT', '1TB', '5400 RPM', ' 2.5-', '$50.00'],
 ['WD7500BPVT', '750GB', '5400 RPM', ' 2.5"', '$56.99'],
 ['WD2500BPVT', '250GB', '5400 RPM', ' 2.5"', '$54.00'],
 ['WD9001BKHG', '900GB', '10000 RPM', ' 2.5"', '$94.99'],
 ['WD10SPCX', '1TB', '5400 RPM', ' 2.5"', '$74.75'],
 ['WD2500LPLX', '250GB', '7200 RPM', ' 2.5"', '$45.00'],
 ['WD2500LPLX', '640GB', '5400 RPM', ' 2.5"', '$69.95'],
 ['WD2500LPLX', '1TB', '5400 RPM', ' 6.0 ', '$33.5

In [17]:
len(hard_disk_data)

90

## Looks like we scraped through everything and colected all the required information
## So now lets convert our data into a data frame and take a look at it

In [18]:
df = pd.DataFrame(hard_disk_data, columns=['Model', 'Capacity', 'Speed', 'Size', 'Price'])
df

Unnamed: 0,Model,Capacity,Speed,Size,Price
0,WD20SPZX,2TB,5400 RPM,"2.5""",$73.99
1,WD5000LPSX,500GB,7200 RPM,"2.5""",$38.99
2,WD10SPSX,1TB,7200 RPM,2.5,$73.99
3,WD5000BHTZ,500GB,10000 RPM,"2.5""",$98.34
4,WD5000LPCX,500GB,5400 RPM,"2.5""",$34.50
...,...,...,...,...,...
85,WD15NPVT,5TB,5400 RPM,"2.5""",$450.0
86,WD20NPVT,5TB,5400 RPM,"2.5""",$321.4
87,WD5000LPVX,500GB,5400 RPM,"2.5""",$36.09
88,WD3200LPLX,320GB,7200 RPM,"2.5""",$62.33


## Lets save our data with pandas

In [19]:
df.to_csv('NewEggScraping.csv')

In [20]:
df1 = pd.read_csv('NewEggScraping.csv')
df1.pop('Unnamed: 0')
df1

Unnamed: 0,Model,Capacity,Speed,Size,Price
0,WD20SPZX,2TB,5400 RPM,"2.5""",$73.99
1,WD5000LPSX,500GB,7200 RPM,"2.5""",$38.99
2,WD10SPSX,1TB,7200 RPM,2.5,$73.99
3,WD5000BHTZ,500GB,10000 RPM,"2.5""",$98.34
4,WD5000LPCX,500GB,5400 RPM,"2.5""",$34.50
...,...,...,...,...,...
85,WD15NPVT,5TB,5400 RPM,"2.5""",$450.0
86,WD20NPVT,5TB,5400 RPM,"2.5""",$321.4
87,WD5000LPVX,500GB,5400 RPM,"2.5""",$36.09
88,WD3200LPLX,320GB,7200 RPM,"2.5""",$62.33
