[Reference](https://towardsdatascience.com/web-scraping-to-extract-contact-information-part-1-mailing-lists-854e8a8844d2)

In [4]:
!pip install scrapy

Collecting scrapy
[?25l  Downloading https://files.pythonhosted.org/packages/3a/16/3c7c37caf25f91aa21db194655515718c2a15f704f9f5c59a194f5c83db0/Scrapy-2.4.1-py2.py3-none-any.whl (239kB)
[K     |█▍                              | 10kB 16.7MB/s eta 0:00:01[K     |██▊                             | 20kB 20.8MB/s eta 0:00:01[K     |████                            | 30kB 11.8MB/s eta 0:00:01[K     |█████▌                          | 40kB 8.6MB/s eta 0:00:01[K     |██████▉                         | 51kB 4.4MB/s eta 0:00:01[K     |████████▏                       | 61kB 4.7MB/s eta 0:00:01[K     |█████████▋                      | 71kB 5.2MB/s eta 0:00:01[K     |███████████                     | 81kB 5.2MB/s eta 0:00:01[K     |████████████▎                   | 92kB 5.7MB/s eta 0:00:01[K     |█████████████▊                  | 102kB 5.9MB/s eta 0:00:01[K     |███████████████                 | 112kB 5.9MB/s eta 0:00:01[K     |████████████████▍               | 122kB 5.9MB/s et

In [5]:
import logging
import os
import pandas as pd
import re
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from googlesearch import search

logging.getLogger('scrapy').propagate = False

# 1. Extract websites from google with googlesearch

In [6]:
def get_urls(tag, n, language):
    urls = [url for url in search(tag, stop=n, lang=language)][:n]
    return urls

In [7]:
get_urls('movie rating', 5 , 'en')

['https://en.wikipedia.org/wiki/Motion_Picture_Association_film_rating_system',
 'https://www.filmratings.com/',
 'https://www.filmratings.com/RatingsGuide',
 'https://www.filmratings.com/History',
 'https://www.filmratings.com/News']

# 2. Make a regex expression to extract emails

```
mail_list = re.findall('\w+@\w+\.{1}\w+', html_text)
```

# 3. Scrape websites using a Scrapy Spider

In [10]:
class MailSpider(scrapy.Spider):
    
    name = 'email'
    
    def parse(self, response):
        
        links = LxmlLinkExtractor(allow=()).extract_links(response)
        links = [str(link.url) for link in links]
        links.append(str(response.url))
        
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_link) 
            
    def parse_link(self, response):
        
        for word in self.reject:
            if word in str(response.url):
                return
            
        html_text = str(response.text)        
        
        mail_list = re.findall('\w+@\w+\.{1}\w+', html_text)

        dic = {'email': mail_list, 'link': str(response.url)}
        df = pd.DataFrame(dic)
        
        df.to_csv(self.path, mode='a', header=False)
        df.to_csv(self.path, mode='a', header=False)

In [12]:
process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
process.crawl(MailSpider, start_urls=google_urls, path=path, reject=reject)
process.start()

# 4. Save those emails in a CSV file

In [13]:
def ask_user(question):
    response = input(question + ' y/n' + '\n')
    if response == 'y':
        return True
    else:
        return False
        
def create_file(path):
    response = False
    if os.path.exists(path):
        response = ask_user('File already exists, replace?')
        if response == False: return 
    
    with open(path, 'wb') as file: 
        file.close()

# 5. Put everything together

In [14]:
def get_info(tag, n, language, path, reject=[]):
    
    create_file(path)
    df = pd.DataFrame(columns=['email', 'link'], index=[0])
    df.to_csv(path, mode='w', header=True)
    
    print('Collecting Google urls...')
    google_urls = get_urls(tag, n, language)
    
    print('Searching for emails...')
    process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
    process.crawl(MailSpider, start_urls=google_urls, path=path, reject=reject)
    process.start()
    
    print('Cleaning emails...')
    df = pd.read_csv(path, index_col=0)
    df.columns = ['email', 'link']
    df = df.drop_duplicates(subset='email')
    df = df.reset_index(drop=True)
    df.to_csv(path, mode='w', header=True)
    
    return df

In [15]:
bad_words = ['facebook', 'instagram', 'youtube', 'twitter', 'wiki']
df = get_info('mastering studio london', 300, 'pt', 'studios.csv', reject=bad_words)

Collecting Google urls...
Searching for emails...


Gave up retrying <GET http://ww.joncohenmusic.com> (failed 3 times): DNS lookup failed: no results for hostname lookup: ww.joncohenmusic.com.
Gave up retrying <GET https://www.https//onlinemastering.org.uk/testimonials/> (failed 3 times): DNS lookup failed: no results for hostname lookup: www.https.
Gave up retrying <GET https://www.https//onlinemastering.org.uk/cart/> (failed 3 times): DNS lookup failed: no results for hostname lookup: www.https.
Error downloading <GET http://ww.joncohenmusic.com>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/twisted/internet/defer.py", line 1416, in _inlineCallbacks
    result = result.throwExceptionIntoGenerator(g)
  File "/usr/local/lib/python3.6/dist-packages/twisted/python/failure.py", line 512, in throwExceptionIntoGenerator
    return g.throw(self.type, self.value, self.tb)
  File "/usr/local/lib/python3.6/dist-packages/scrapy/core/downloader/middleware.py", line 45, in process_request
    return (yield downl

Cleaning emails...


In [16]:
df.head()

Unnamed: 0,email,link
0,,
1,info@londonmasteringstudio.co,https://www.londonmasteringstudio.co.uk/
2,mastering@thisismetropolis.com,https://www.thisismetropolis.com/product/maste...
3,bookings@miloco.co,https://milocostudios.com/contact/?feedback
4,info@umusic.com,https://www.universalmusic.com/careers/
