# `Credicxo` Amazon scraper [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1MPAmJaJLXxE_iDLdDVdJoGu6NZ8xthZR?usp=sharing)




### 1. Importing all the required libraries

In [None]:
# Install the MySQL driver that provides the `mysql.connector` module imported
# in the next cell (IPython `!` shell escape, so this only runs in a notebook).
# NOTE(review): the PyPI package `mysql-connector` appears to be superseded by
# `mysql-connector-python` — confirm which one this environment should use.
print('Installing mysql-connector library...')
!pip install mysql-connector

In [None]:
#import
import concurrent.futures
import json
import os
import requests
import time

from bs4 import BeautifulSoup as soup
import mysql.connector
import pandas as pd

### 2. Creating the scraper
    

*  Generate the URLs from the CSV with the `urls_gen` method
*  Scrape the data and store it in the `amazon_data` list
*  Dump `amazon_data` to `Amazon_Products.json`
*  Finally, dump all of `amazon_data` to a MySQL database




In [None]:
class CredicxoScrap:
  """Scrape product data from Amazon product pages listed in a CSV file.

  The CSV at ``csv_path`` must provide an ``Asin`` column and a ``country``
  column; ``country`` is used as the Amazon domain suffix, so each row
  yields one product URL.
  """

  def __init__(self, csv_path):
    """Store the path of the CSV file that lists the products to scrape."""
    self.csv_path = csv_path

  def urls_gen(self):
    """Build the product URLs from the CSV file.

    Returns:
      A list of ``https://www.amazon.<country>/dp/<asin>`` strings, one per
      CSV row, in file order. Empty when the CSV has no data rows.
    """
    # Read every column as text: numeric-looking ASINs would otherwise be
    # parsed as integers and lose their leading zeros, breaking the URL.
    table = pd.read_csv(self.csv_path, dtype=str)

    asins = table['Asin'].tolist()
    countries = table['country'].tolist()

    all_urls = [f'https://www.amazon.{country}/dp/{asin}'
                for asin, country in zip(asins, countries)]

    # Only preview the first URL when there is one; an empty CSV used to
    # raise IndexError here.
    if all_urls:
      print('Printing the first url form the list: {}'.format(all_urls[0]))
    return all_urls

  @staticmethod
  def html_gen(url):
    """Download ``url`` and parse the body with BeautifulSoup's ``html.parser``.

    Returns:
      The parsed document, or ``None`` when the server answers 404.
    """
    # Browser-like request headers. The User-Agent is built from adjacent
    # string literals: a backslash continuation inside the literal embeds the
    # next line's indentation in the header value.
    headers = {
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) "
                       "Gecko/20100101 Firefox/66.0"),
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }

    # The timeout keeps a stalled connection from blocking a worker forever.
    r = requests.get(url, headers=headers, timeout=30)

    # Only a 404 is treated as "page unavailable"; any other response is parsed.
    if r.status_code == 404:
      print(f'The {url} is unavilable. Skipping...')
      return None

    return soup(r.content, 'html.parser')

  def scrap_data(self, url):
    """Scrape title, image URL, price and details from one product page.

    Returns:
      A dict with the keys ``'Product Title'``, ``'Product Image URL'``,
      ``'Price of the Product'`` and ``'Product Details'``. A field is
      ``None`` when its element is not present on the page. Returns ``None``
      when the page itself is unavailable (``html_gen`` returned ``None``).
    """
    html = self.html_gen(url)
    if html is None:
      return None

    # Each lookup may come back as None when the page layout differs, so the
    # elements are resolved first and dereferenced only when present.
    title = html.find('span', id='productTitle')
    image_wrapper = html.find('div', class_='imgTagWrapper')
    image = image_wrapper.find('img') if image_wrapper is not None else None
    price = html.find('span', class_='a-price aok-align-center priceToPay')
    details = html.find('div', id='feature-bullets')

    return {
        'Product Title': title.text if title is not None else None,
        'Product Image URL': image.get('src') if image is not None else None,
        # The first character of the price text is dropped
        # (presumably the currency symbol — confirm against real pages).
        'Price of the Product': price.text[1:] if price is not None else None,
        'Product Details': details.text if details is not None else None,
    }
  


In [None]:
# Location of the input sheet that lists the products
csv_path = '/content/Amazon Scraping - Sheet1.csv'

# Scraper bound to that sheet, and the product urls derived from it
amazon = CredicxoScrap(csv_path=csv_path)
all_urls = amazon.urls_gen()

In [None]:
#scrap data from url

def _scrap_or_skip(url):
  """Scrap one url; on failure report it and return None instead of raising."""
  try:
    return amazon.scrap_data(url)
  except Exception as err:  # batch boundary: one bad page must not abort the run
    print(f'Could not scrap {url}: {err!r}. Skipping...')
    return None

#scraped data to store to amazon_data with multithreading

start_time = time.perf_counter()

# executor.map needs the callable itself; calling scrap_data() here (with no
# url) raised TypeError before any page was fetched.
with concurrent.futures.ThreadPoolExecutor() as executor:
  scraped_rows = list(executor.map(_scrap_or_skip, all_urls))

# Keep only the pages that produced a row; results stay in url order.
amazon_data = [row for row in scraped_rows if row is not None]

#for measuring the time for each 100 urls
end_time = time.perf_counter()
elapsed = end_time - start_time  # end minus start, so the figure is positive

# Normalise by the real number of urls instead of a hard-coded 10 rounds.
rounds_of_100 = len(all_urls) / 100
if rounds_of_100:
  print("Each round of 100 urls is taking: "+str(round(elapsed/rounds_of_100, 2)))
    

### 3. Creating the JSON file


In [None]:
def json_handler(data=None, file_name='Amazon_Products.json'):
  """Write the scraped product rows to a JSON file and print its path.

  Args:
    data: JSON-serialisable list of product dicts. Defaults to the
      module-level ``amazon_data`` list.
    file_name: Name of the output file, joined onto the current working
      directory (an absolute path is used as given).

  Returns:
    The path of the file that was written.
  """
  rows = amazon_data if data is None else data

  json_path = os.path.join(os.getcwd(), file_name)
  # Explicit encoding so the output does not depend on the platform default.
  with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(rows, f, indent=4)

  print(json_path)
  return json_path

# Write the scraped rows to Amazon_Products.json in the current directory
json_handler()

### 4. Storing the data in MySQL

In [None]:
#list object store to database
def mysql_handler(self=None,host="localhost",user="root",
                  password="root",database="amazonscrap",data=None):
  """Store the scraped product rows in the ``amazon`` table of a MySQL database.

  The table is created on first use. All rows are inserted in one
  transaction; on a database error the transaction is rolled back and the
  error is printed rather than raised.

  Args:
    self: Unused. Kept (now optional) so existing positional callers keep
      working while ``mysql_handler()`` can be called without arguments.
    host: MySQL server host.
    user: MySQL user name.
    password: MySQL password.
    database: Name of an existing database to connect to.
    data: List of product dicts keyed by ``'Product Title'``,
      ``'Product Image URL'``, ``'Price of the Product'`` and
      ``'Product Details'``. Defaults to the module-level ``amazon_data``.
      Empty or ``None`` entries are skipped.
  """
  products = amazon_data if data is None else data

  # The %s placeholders are positional, so each dict is flattened into a
  # tuple in column order; a missing key becomes NULL.
  rows = [(product.get('Product Title'), product.get('Product Image URL'),
           product.get('Price of the Product'), product.get('Product Details'))
          for product in products if product]

  # Titles, image urls and bullet lists routinely exceed 100 characters, and
  # the price is scraped as text, so the columns are sized and typed for text.
  create_sql = ("CREATE TABLE IF NOT EXISTS amazon (title VARCHAR(500), "
                "img_url VARCHAR(1000), price VARCHAR(50), details TEXT)")
  insert_sql = "INSERT INTO amazon (title, img_url, price, details) VALUES (%s, %s, %s, %s)"

  db = mysql.connector.connect(host=host,user=user,
                               password=password,database=database) #connect to a database
  try:
    cursor = db.cursor()  #db cursor
    try:
      cursor.execute(create_sql)
      if rows:
        cursor.executemany(insert_sql, rows)
      db.commit()
    except mysql.connector.Error as err:
      # Rollback lives on the connection, not on the cursor.
      db.rollback()
      print(f'Storing the data in MySQL failed and was rolled back: {err}')
    else:
      print(cursor.rowcount if rows else 0, "record inserted.")
    finally:
      cursor.close()
  finally:
    db.close()

# Store the scraped rows in MySQL using the default connection settings
mysql_handler()