![autoscrape](https://user-images.githubusercontent.com/54896849/115991908-6fb41280-a5e8-11eb-8f21-3e8d013d0e40.jpeg)

```
AutoScraper is an automatic web scraping library to make scraping easy.
In this tutorial, I'll show how to use AutoScraper to scrape Amazon.in and Flipkart.com.
```

# Import Libraries

In [1]:
from autoscraper import AutoScraper
import pandas as pd

# Train - Build Scraper using Incremental Learning

In [2]:
# Goal: a single price scraper that works across several shopping websites.
# Each training sample pairs a listing-page URL with the product title and
# price that we expect the scraper to find on that page.

_amazon_samples = [
    (
        'https://www.amazon.in/s?bbn=1389401031&rh=n%3A1389401031%2Cp_89%3ARedmi&dc&qid=1619345402&rnid=3837712031&ref=lp_1389401031_nr_p_89_0',
        ['Redmi Note 10 (Shadow Black, 4GB RAM, 64GB Storage)', '₹11,999'],
    ),
    (
        'https://www.amazon.in/s?i=computers&bbn=1375424031&rh=n%3A976392031%2Cn%3A1375424031%2Cp_89%3AMI%7CSamsung&dc&qid=1619345741&rnid=3837712031&ref=sr_nr_p_89_6',
        ['Mi Notebook 14 Intel core i3-10110U 10th Gen FHD Ultra Thin and Light Laptop (8GB/256GB SSD/Windows 10, Home/Intel UHD Graphics/Silver/1.5Kg), XMA1901-FI', '₹37,999'],
    ),
]

_flipkart_samples = [
    (
        'https://www.flipkart.com/mobiles/mi~brand/pr?sid=tyy,4io&otracker=nmenu_sub_Electronics_0_Mi',
        ['Redmi 9 Prime (Mint Green, 64 GB)', '₹9,999'],
    ),
    (
        'https://www.flipkart.com/televisions/pr?sid=ckf%2Cczl&p%5B%5D=facets.brand%255B%255D%3DRealme&otracker=nmenu_sub_TVs%20%26%20Appliances_0_realme',
        ['realme 80 cm (32 inch) HD Ready LED Smart Android TV', '₹14,999'],
    ),
]

# Amazon samples first, then Flipkart, so the training order is unchanged.
data = _amazon_samples + _flipkart_samples

scraper = AutoScraper()
# Train on one sample at a time; update=True is passed on every call so the
# scraper is built up incrementally across all samples.
for url, wanted_list in data:
    scraper.build(url=url, wanted_list=wanted_list, update=True)

# Test Scraper

In [3]:
# Try the trained scraper on one listing page per site.
# The page URL is typed in by hand at the prompt for each site.
for website in ('Amazon test', 'Flipkart test'):
    print(website, '*' * 20, sep='\n')
    url = input('Enter link : ')
    # grouped=True yields a dict that maps each rule id to the list of
    # values that rule matched on the page.
    results = scraper.get_result_similar(url, grouped=True)
    print(results, '#' * 100, sep='\n')

Amazon test
********************
Enter link : https://www.amazon.in/s?i=computers&bbn=1375424031&rh=n%3A976392031%2Cn%3A1375424031%2Cp_89%3AHP&dc&qid=1619345854&rnid=3837712031&ref=sr_nr_p_89_7
{'rule_kfnn': ['HP 15 Entry Level 15.6-inch HD Laptop (AMD 3020e/4GB/1TB HDD/Windows 10 Home/Jet Black/1.74 Kg), 15s-gy0003AU', 'HP 14 (2021) Thin & Light 11th Gen Core i3 Laptop, 8 GB RAM, 256GB SSD, 14-inch FHD Screen, Windows 10, MS Office, Alexa Built-in (14s-dy2500TU)', 'HP 15 (2021) Thin & Light Ryzen 3-3250 Laptop, 8 GB RAM, 1TB HDD + 256GB SSD, 15-inch FHD Screen, Windows 10, MS Office (15s-gr0012AU)', 'HP 15 (2021) Thin & Light 11th Gen Core i3 Laptop, 8 GB RAM, 1TB HDD, 15.6-inch FHD Screen, Windows 10, MS Office (15s-dy3001TU)', 'HP 15 (2021) Thin & Light Ryzen 3-3250 Laptop, 8 GB RAM, 1TB HDD, 15-inch FHD Screen, Windows 10, MS Office (15s-gr0011AU)', 'HP 15 (2021) Thin & Light 11th Gen Core i5 Laptop, 8 GB RAM, 512GB SSD, 15.6-inch FHD Screen, Windows 10, MS Office, Built-in Alexa B

Enter link : https://www.flipkart.com/televisions/pr?sid=ckf%2Cczl&p%5B%5D=facets.availability%255B%255D%3DExclude%2BOut%2Bof%2BStock&otracker=categorytree&p%5B%5D=facets.serviceability%5B%5D%3Dtrue&p%5B%5D=facets.brand%255B%255D%3DThomson&otracker=nmenu_sub_TVs%20%26%20Appliances_0_Thomson
{'rule_kfnn': [], 'rule_ello': [], 'rule_n0sq': [], 'rule_ho3p': [], 'rule_arbd': [], 'rule_7gq0': [], 'rule_2wo4': [], 'rule_wkfj': [], 'rule_y85c': [], 'rule_049g': [], 'rule_sgu9': ['Thomson R9 60 cm (24 inch) HD Ready LED TV', 'Thomson 9A Series 80 cm (32 inch) HD Ready LED Smart Android TV', 'Thomson 9A Series 106 cm (42 inch) Full HD LED Smart Android TV', 'Thomson R9 80 cm (32 inch) HD Ready LED TV', 'Thomson 9A Series 80 cm (32 inch) HD Ready LED Smart Android TV with Bezel Less Display', 'Thomson OATHPRO Series 139 cm (55 inch) Ultra HD (4K) LED Smart Android TV with Dolby Digital Plus', 'Thomson 9A Series 108 cm (43 inch) Full HD LED Smart Android TV with Bezel Less Display', 'Thomson OATH

In [6]:
# The grouped output above shows that some rules are not relevant for our task
# (many of them return empty lists). Keep only the rules listed below.
# NOTE(review): these rule ids come from this particular training run, and
# 'rule_pvyz' / 'rule_zi2p' are not visible in the truncated output shown
# above -- confirm the ids against the output of your own run before reusing.

scraper.keep_rules(['rule_ello','rule_n0sq','rule_pvyz','rule_zi2p'])

In [7]:
# Run the scraper again now that only the selected rules are kept.
# `url` still holds the last link entered in the test loop.
results = scraper.get_result_similar(url, grouped=True)

# Discard rules that matched nothing on this page; each remaining rule
# becomes one column of the table.
d = {rule_id: matches for rule_id, matches in results.items() if matches != []}

df = pd.DataFrame(d)
# Assumes exactly two rules returned results, presumably the title rule first
# and the price rule second -- TODO confirm the order for your run.
df.columns = ['Product', 'Price']
df.head()

Unnamed: 0,Product,Price
0,Thomson R9 60 cm (24 inch) HD Ready LED TV,"₹9,499"
1,Thomson 9A Series 80 cm (32 inch) HD Ready LED...,"₹13,999"
2,Thomson 9A Series 106 cm (42 inch) Full HD LED...,"₹20,999"
3,Thomson R9 80 cm (32 inch) HD Ready LED TV,"₹11,999"
4,Thomson 9A Series 80 cm (32 inch) HD Ready LED...,"₹14,490"


# Save Scraper

In [8]:
# Save the trained scraper under the name 'ecommerce-scrape' so it can be
# reloaded later with scraper.load (presumably this stores the kept rules;
# see the AutoScraper documentation).
scraper.save('ecommerce-scrape')

In [None]:
# scraper.load('ecommerce-scrape')

# References

* https://pypi.org/project/autoscraper/
* https://github.com/krishnaik06/Autoscrapper