# Web scraping project for laptop hard drives made by Western Digital on an American retailer NewEgg

## Importing the libraries that are required

In [1084]:
import requests
import bs4
import lxml
import re
import pandas as pd
import csv

## Creating base_url

In [959]:
base_url = 'https://www.newegg.com/p/pl?PageSize=36&N=100167524%2050001306&page={}'

In [967]:
base_url.format('')

'https://www.newegg.com/p/pl?PageSize=36&N=100167524%2050001306&page='

## Saving res cell

In [1055]:
res = requests.get(base_url.format(3))

In [1056]:
soup = bs4.BeautifulSoup(res.text, 'html.parser')

In [1057]:
len(soup.select('.item-info'))

25

In [1060]:
string = soup.select('.item-info')[15].getText()

In [1061]:
string

'(266)Western Digital Scorpio Black wd5000bpkt 500GB 7200 RPM 16MB Cache SATA 3.0Gb/s 2.5" Internal Notebook Hard Drive Bare DriveHeight (maximum): 9.5mmWidth (maximum): 69.75mmLength (maximum): 100.3mmParts: 5 years limitedModel #: wd5000bpktReturn Policy: View Return Policy'

## It's time to make re and check if everything works for retreiving data

## Applying regular expression for every required piece of data

In [1067]:
name = re.findall('\s(WD......|WD........)[IR]', string)
name

[]

In [1063]:
capacity = re.findall('\dTB|\d\d\dGB', string)[0]
capacity

'500GB'

In [1064]:
speed = re.findall('\d\d\d\d\s\D\D\D|\d\d\d\d\d\s\D\D\D',string)[0]
speed

'7200 RPM'

In [1065]:
size = re.findall('\s\d\.\d[\s|\W]',string)[0]
size

' 2.5"'

In [1066]:
string1 = soup.select('.price-current')[0].getText()
string1

'$65.99\xa0(12 Offers)–'

In [987]:
money = re.findall('\A......', string1)[0]
money

'$75.00'

## So looks like everythiing works. It's time to scrape through all the pages

In [1073]:
hard_disk_data = []
for n in range(1,4):
    scrape_url = base_url.format(n)
    res = requests.get(scrape_url)

    soup = bs4.BeautifulSoup(res.text, 'lxml')
    for i in range(0, len(soup.select('.item-info'))):
        first_data = []
        disk_info = soup.select('.item-info')[i].getText()
        disk_price = soup.select('.price-current')[i].getText()
        
        try: 
            name = re.findall('\s(WD......|WD........)[IR]', disk_info)[0]
        except:
            pass
        try:
            capacity = re.findall('\dTB|\d\d\dGB', disk_info)[0]
        except:
            pass
        try:
            speed = re.findall('\d\d\d\d\s\D\D\D|\d\d\d\d\d\s\D\D\D',disk_info)[0]
        except:
            pass
        
        try:
            size = re.findall('\s\d\.\d[\s|\W]',disk_info)[0]
        except:
            pass
        
        try:
            money = re.findall('\A......', disk_price)[0]
        except:
            pass
        
        first_data.append(name)
        first_data.append(capacity)
        first_data.append(speed)
        first_data.append(size)
        first_data.append(money)
        
        hard_disk_data.append(first_data)

## Lets check what we got

In [1077]:
hard_disk_data

[['WD10SPSX', '1TB', '7200 RPM', ' 2.5 ', '$65.99'],
 ['WD10SPZX', '1TB', '5400 RPM', ' 2.5"', '$48.90'],
 ['WD20SPZX', '2TB', '5400 RPM', ' 2.5"', '$67.99'],
 ['WD10JFCX', '1TB', '5400 RPM', ' 2.5 ', '$86.55'],
 ['WD1000CHTZ', '1TB', '10000 RPM', ' 2.5"', '$57.99'],
 ['WD5000LPCX', '500GB', '5400 RPM', ' 2.5"', '$31.99'],
 ['WD5000LPSX', '500GB', '7200 RPM', ' 2.5"', '$49.00'],
 ['WD10JUCT', '1TB', '5400 RPM', ' 2.5-', '$39.99'],
 ['WD5000LPLX', '500GB', '7200 RPM', ' 2.5"', '$46.34'],
 ['WD2500LPLX', '250GB', '7200 RPM', ' 2.5"', '$83.70'],
 ['WD7500BPKX', '750GB', '7200 RPM', ' 2.5 ', '$61.00'],
 ['WD3001BKHG', '300GB', '10000 RPM', ' 2.5"', '$78.99'],
 ['WD9001BKHG', '900GB', '10000 RPM', ' 2.5"', '$76.95'],
 ['WD1600BEVT', '160GB', '5400 RPM', ' 2.5"', '$39.99'],
 ['WD5000BPKX', '500GB', '7200 RPM', ' 2.5"', '$53.00'],
 ['WD5000BPKX', '500GB', '7200 RPM', ' 2.5"', '$92.99'],
 ['WD5000BHTZ', '500GB', '10000 RPM', ' 2.5"', '$105.0'],
 ['WD10JPLX', '1TB', '7200 RPM', ' 2.5 ', '$95.81

In [1076]:
len(hard_disk_data)

97

## Looks like we scraped through everything and colected all the required information
## So now lets convert our data into a data frame and take a look at it

In [1083]:
df = pd.DataFrame(hard_disk_data, columns=['Model', 'Capacity', 'Speed', 'Size', 'Price'])
df

Unnamed: 0,Model,Capacity,Speed,Size,Price
0,WD10SPSX,1TB,7200 RPM,2.5,$65.99
1,WD10SPZX,1TB,5400 RPM,"2.5""",$48.90
2,WD20SPZX,2TB,5400 RPM,"2.5""",$67.99
3,WD10JFCX,1TB,5400 RPM,2.5,$86.55
4,WD1000CHTZ,1TB,10000 RPM,"2.5""",$57.99
...,...,...,...,...,...
92,WD1600BEVE,500GB,5400 RPM,"2.5""",$199.0
93,WD5000BHTZ,500GB,5400 RPM,2.5,$139.0
94,WD5000BHTZ,160GB,7200 RPM,"2.5""",$31.00
95,WD7500KEVT,750GB,5200 RPM,"2.5""",$89.00


## Lets save our data with pandas

In [1105]:
df.to_csv('NewEggScraping.csv')

In [1109]:
df1 = pd.read_csv('NewEggScraping.csv')
df1.pop('Unnamed: 0')
df1

Unnamed: 0,Model,Capacity,Speed,Size,Price
0,WD10SPSX,1TB,7200 RPM,2.5,$65.99
1,WD10SPZX,1TB,5400 RPM,"2.5""",$48.90
2,WD20SPZX,2TB,5400 RPM,"2.5""",$67.99
3,WD10JFCX,1TB,5400 RPM,2.5,$86.55
4,WD1000CHTZ,1TB,10000 RPM,"2.5""",$57.99
...,...,...,...,...,...
92,WD1600BEVE,500GB,5400 RPM,"2.5""",$199.0
93,WD5000BHTZ,500GB,5400 RPM,2.5,$139.0
94,WD5000BHTZ,160GB,7200 RPM,"2.5""",$31.00
95,WD7500KEVT,750GB,5200 RPM,"2.5""",$89.00
