**WEB SCRAPER FOR DATA ON WATER FILTERS**




This notebook covers the data collection step for building a database that maps water contaminants to ZIP code areas in the US. It scrapes water filter products, grouped by contaminant type, from the Water Filters website (https://www.waterfilters.net/water-filter-brands.html).

In [None]:
#import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import seaborn as sns
import random
from multiprocessing import Process

In [None]:
# Category landing pages on waterfilters.net, one per contaminant type.
odor_link = "https://www.waterfilters.net/taste-and-odor-water-treatment.html"
sediment_link = "https://www.waterfilters.net/sediment-sand-dirt-water-filtration.html"
hardness_link = "https://www.waterfilters.net/water-softening-solutions-for-hard-water.html"
chlorine_link = "https://www.waterfilters.net/chlorine-water-treatment.html"
iron_link = "https://www.waterfilters.net/iron-water-treatment.html"
chloramines_link = "https://www.waterfilters.net/chloramines-water-filtration.html"
nitrates_link = "https://www.waterfilters.net/nitrates-water-treatment.html"
bacteria_link = "https://www.waterfilters.net/ecoli-water-purification.html"
pesticide_link = "https://www.waterfilters.net/pesticides-and-herbicides-water-treatment.html"
lead_link = "https://www.waterfilters.net/lead-water-treatment.html"
voc_link = "https://www.waterfilters.net/volatile-organic-chemicals-voc-water-treatment.html"

In [None]:
# Helper Functions
def get_filter_links(url, n_pages):
  '''Build the urls of every listing page for one contaminant category.

  No request is made here: the first page is the category url itself and
  each later page is the same url with a '?p=<page number>' suffix.

  Parameters:
  url(str): url of the contaminant category listing
  n_pages(int): number of listing pages the category has
  Output:
  list: the category url followed by the urls of pages 2..n_pages
  '''
  later_pages = [url + '?p=' + str(page_no) for page_no in range(2, n_pages + 1)]
  return [url] + later_pages

def get_review_links(link):
  '''Collect the product page urls shown on one category listing page.

  Parameters:
  link(str): url of a single listing page of a contaminant category
  Output:
  list: the href of every element with class 'product-item-link' that
        carries one, in page order
  Raises:
  requests.exceptions.RequestException: when the page cannot be fetched,
        including when the server does not answer within the timeout
  '''
  # Without a timeout requests waits indefinitely, so a single stalled
  # connection would hang the whole scrape.
  page = requests.get(link, timeout=30)
  page_soup = BeautifulSoup(page.text, 'html.parser')
  anchors = page_soup.find_all(class_='product-item-link')
  # Matches without an href are skipped rather than raising KeyError.
  return [anchor['href'] for anchor in anchors if anchor.has_attr('href')]
  
def _fetch_soup(link):
  '''Download link and return its HTML parsed with html.parser.'''
  # Without a timeout requests waits indefinitely on a stalled connection.
  page = requests.get(link, timeout=30)
  return BeautifulSoup(page.text, 'html.parser')

def _extract_name(page_soup):
  '''Return the text of the page <title>, or None when the page has none.'''
  title = page_soup.title
  return title.string if title is not None else None

def _extract_price(page_soup):
  '''Return the text of the first <span class="price">, or None if absent.'''
  price_tag = page_soup.find("span", class_="price")
  return price_tag.text if price_tag is not None else None

def get_product_name(link):
  '''Fetch a product page and return its title as a one-element list.

  Parameters:
  link(str): url of the product page
  Output:
  list: [title text], or [None] when the page has no <title>
  '''
  return [_extract_name(_fetch_soup(link))]

def get_product_price(link):
  '''Fetch a product page and return its first listed price as a one-element list.

  Parameters:
  link(str): url of the product page
  Output:
  list: [price text], or [None] when the page shows no price span
  '''
  return [_extract_price(_fetch_soup(link))]

def get_data(link):
  '''Scrape one product page into a single-row dataframe.

  The page is downloaded once and both fields are read from the same
  parsed document, instead of issuing one request per field.

  Parameters:
  link(str): url of the product page
  Output:
  df(DataFrame): one row with columns 'product' and 'price'; a field that
                 is missing from the page is stored as None so one odd
                 page does not abort the whole scrape
  '''
  page_soup = _fetch_soup(link)
  return pd.DataFrame({
      'product': [_extract_name(page_soup)],
      'price': [_extract_price(page_soup)],
  })



In [None]:
# Main function
def get_filter_price(contaminant,link,n_pages):
  '''
  Pulls filter data from Water Filters to return a dataframe with columns
  'product', 'price', 'contaminant', 'ratings'

  Parameters:
  contaminant(str): label written to every row of the 'contaminant' column
  link(str): water filter by contaminant search results link
             (i.e https://www.waterfilters.net/sediment-sand-dirt-water-filtration.html)
  n_pages(int): number of result pages to scrape for this contaminant

  Output:
  filter_df(DataFrame): one row per product found, indexed 0..n-1; empty
                        (with the same columns) when nothing was found
  '''
  # Gather the per-product frames and concatenate once: DataFrame.append was
  # removed in pandas 2.0 and re-copied the whole frame on every call.
  frames = []
  for listing_url in get_filter_links(link, n_pages):
    for product_url in get_review_links(listing_url):
      frames.append(get_data(product_url))

  if frames:
    filter_df = pd.concat(frames, ignore_index=True)
  else:
    # pd.concat rejects an empty list, so build the empty result explicitly.
    filter_df = pd.DataFrame({'product': [], 'price': []})

  filter_df['contaminant'] = contaminant
  # NOTE(review): ratings are NOT scraped. They are random integers from 3 to 5
  # and differ on every run; treat this column as placeholder data.
  filter_df['ratings'] = np.random.randint(3, 6, filter_df.shape[0])
  return filter_df

In [None]:
# test1: sediment filters, scraping 13 listing pages
link = 'https://www.waterfilters.net/sediment-sand-dirt-water-filtration.html'
n_pages = 13
contaminant = 'Sediments'
df_sediment = get_filter_price(contaminant, link, n_pages)

In [None]:
# test2: hard water filters, single listing page
link = 'https://www.waterfilters.net/water-softening-solutions-for-hard-water.html'
n_pages = 1
contaminant = 'Hard Water'
df_hwater = get_filter_price(contaminant, link, n_pages)

In [None]:
# test3: chlorine filters, single listing page
link = 'https://www.waterfilters.net/chlorine-water-treatment.html'
n_pages = 1
contaminant = 'Chlorine'
df_chlorine = get_filter_price(contaminant, link, n_pages)

In [None]:
# test4: iron filters, single listing page
link = 'https://www.waterfilters.net/iron-water-treatment.html'
n_pages = 1
contaminant = 'Iron'
df_iron = get_filter_price(contaminant, link, n_pages)

In [None]:
# test5: chloramines filters, single listing page
link = 'https://www.waterfilters.net/chloramines-water-filtration.html'
n_pages = 1
contaminant = 'Chloramines'
df_chloramines = get_filter_price(contaminant, link, n_pages)

In [None]:
# test6: E. coli and bacteria filters, single listing page
link = 'https://www.waterfilters.net/ecoli-water-purification.html'
n_pages = 1
contaminant = 'E. Coli and Bacteria'
df_ecoli = get_filter_price(contaminant, link, n_pages)

In [None]:
# test7: pesticide and herbicide filters, single listing page
link = 'https://www.waterfilters.net/pesticides-and-herbicides-water-treatment.html'
n_pages = 1
contaminant = 'Pesticides'
df_pest = get_filter_price(contaminant, link, n_pages)

In [None]:
# test8: VOC filters, scraping 2 listing pages
link = 'https://www.waterfilters.net/volatile-organic-chemicals-voc-water-treatment.html'
n_pages = 2
contaminant = 'VOCs'
df_vocs = get_filter_price(contaminant, link, n_pages)

In [None]:
# test9: lead filters, single listing page
link = 'https://www.waterfilters.net/lead-water-treatment.html'
n_pages = 1
contaminant = 'Lead'
df_lead = get_filter_price(contaminant, link, n_pages)

In [None]:
#merge dataframes across contaminants
# The sediment result is bound to df_sediment; no notebook-level `df` is ever
# assigned, so listing `df` here raised NameError.
frames=[df_sediment,df_hwater,df_chlorine,df_iron,df_chloramines,df_ecoli,df_pest,df_vocs,df_lead]
full_data=pd.concat(frames)

In [None]:
#Mount drive
# Mount Google Drive at the relative path 'drive' so the exported csv can be
# copied into it. Needs the google.colab package to be importable.
from google.colab import drive
drive.mount('drive')

In [None]:
# Full data to csv
# Write the merged table to f_data.csv in the working directory, then copy it
# into the mounted Drive folder with a shell command (IPython `!` syntax, so
# this cell only runs inside a notebook).
full_data.to_csv('f_data.csv')
!cp f_data.csv "drive/My Drive/"

In [None]:
#Function call for all contaminants
def multi_run_wrapper(args):
  '''Unpack one (contaminant, link, n_pages) tuple into get_filter_price.

  Pool.map hands each task a single argument, so the three parameters
  travel as a tuple and are spread out here.

  Parameters:
  args(tuple): (contaminant, link, n_pages) as accepted by get_filter_price
  Output:
  DataFrame: whatever get_filter_price returns for those arguments
  '''
  return get_filter_price(*args)
sediment_link='https://www.waterfilters.net/sediment-sand-dirt-water-filtration.html'
hardness_link='https://www.waterfilters.net/water-softening-solutions-for-hard-water.html'
from multiprocessing import Pool
# The with-block shuts the worker processes down once map() has returned;
# previously the pool was never closed and its workers were left running.
with Pool(4) as pool:
  # results is a list of dataframes, one per task, in task order.
  results = pool.map(multi_run_wrapper,[('Sediments',sediment_link,13),('Hardness',hardness_link,1)])