<a href="https://colab.research.google.com/github/Alantjee/papernest/blob/main/ScrapingEmails.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1- Loading libraries

In [None]:
import re
import time
from tokenize import Ignore
import requests
from bs4 import BeautifulSoup
import pandas as pd
import queue
import threading
from urllib.parse import urlsplit
import numpy


from google.colab import auth
from google.colab import files
import gspread
from google.auth import default
import gspread_dataframe as gd

## 2- Choice of the scope (italy, france, spain, uk)

In [None]:
# Ask which country-specific configuration to use. The answer is stripped and
# lower-cased so that inputs such as "France " or "UK" select the same scope
# as "france" / "uk".
scope = input('Enter the scope of your analysis (italy, france, spain, uk): ').strip().lower()
print('The scope chosen is: ' + scope)

Enter the scope of your analysis (italy, france, spain, uk): france
The scope chosen is: france


## 3- Data loading URL

In [None]:
# Publicly shared Google Sheet listing the websites to crawl; it is read with
# pd.read_csv further down. To use your own file, replace this link with the
# sheet's CSV export form, i.e. ending in "export?format=csv&gid=<sheet id>".
url_link = 'https://docs.google.com/spreadsheets/d/1OMGESeQfuDTmrbH0lamI4zURQtO0KzkBb8Wc3dDa_5U/export?format=csv&gid=0'

## 4- Functions  

In [None]:
class myThread (threading.Thread):
    """Worker thread that crawls URLs taken from a shared scraper queue.

    Each worker repeatedly takes one URL from ``scraper.links``, extracts
    e-mail addresses and outgoing links from the page through the scraper's
    ``crawl_url_address`` / ``crawl_url_link`` methods, and merges the results
    back into the scraper's shared state (``emails``, ``links``,
    ``seen_links``). A worker stops as soon as it finds the queue empty.

    Args:
        scraper: Object holding the shared state and crawl helpers. It must
            expose ``links`` (a queue), ``seen_links`` (dict of base URL ->
            list of URLs), ``emails`` (DataFrame), ``contact_keywords``,
            ``keywords_to_avoid``, ``domains_to_avoid``, ``mails_to_avoid``,
            ``crawl_url_address`` and ``crawl_url_link``.
        id: Identifier used only in log messages.
        queue_lock: Lock guarding ``scraper.links`` and ``scraper.seen_links``.
        dataframe_lock: Lock guarding ``scraper.emails``.
    """

    def __init__(self, scraper, id, queue_lock, dataframe_lock):
        super().__init__()
        self.scraper = scraper
        self.id = id
        self.queue_lock = queue_lock
        self.dataframe_lock = dataframe_lock
        self.alive = True
        # Last dot-separated segment of the address, leading dot included
        # (e.g. ".com").
        self.regex_domain = r'(\.[^\.]*)$'
        # Every word that is followed by a non-word character; the final
        # segment of the address therefore never matches.
        self.regex_words = r'([a-zA-Z0-9_]+)[^a-zA-Z0-9_]'

    def run(self):
        """Crawl queued URLs until the queue is found empty."""
        while self.alive:
            url = self.next_link()
            if url is None:
                continue

            try:
                path, base_url = self.split_url(url)

                text, dataframe_new_emails = self.scraper.crawl_url_address(url)
                dataframe_new_emails = self.filter_emails(dataframe_new_emails)
                print('{} emails found'.format(len(dataframe_new_emails)))
                self.update_emails(dataframe_new_emails, base_url)

                links = self.scraper.crawl_url_link(text, base_url, path)
                print('{} links found'.format(len(links)))
                self.update_links(links)
            except Exception as exc:
                # Crawling is best effort: one unreachable or malformed page
                # must not kill the worker, but the failure is reported
                # instead of being silently discarded.
                print('Thread {} failed on {}: {!r}'.format(self.id, url, exc))

    def next_link(self):
        """Take the next URL from the queue.

        Returns:
            The URL string, or ``None`` when the queue is empty, in which
            case the worker is also flagged to stop (``self.alive = False``).
        """
        # The ``with`` block releases the lock on every path, including the
        # empty-queue one; otherwise the remaining workers would block
        # forever on acquire().
        with self.queue_lock:
            print(f"Queue Size: {self.scraper.links.qsize()}")
            if self.scraper.links.qsize() > 0:
                temp = self.scraper.links.get()
                print("Thread {} scraping : {}".format(self.id, temp[0]))
                return temp[0]
            self.alive = False
            return None

    def split_url(self, url):
        """Split *url* into the prefixes used to resolve relative links.

        Returns:
            ``(path, base_url)`` where ``base_url`` is ``scheme://netloc`` and
            ``path`` is ``base_url`` plus the directory part of the URL path,
            always ending with ``/``.
        """
        parts = urlsplit(url)
        base_url = "{0.scheme}://{0.netloc}".format(parts)
        # Only the path component is inspected so that a '/' inside a query
        # string or fragment cannot be mistaken for a directory separator. A
        # bare host yields '/', so that `path + relative_link` stays valid.
        directory = parts.path[:parts.path.rfind('/') + 1] or '/'
        return base_url + directory, base_url

    def filter_emails(self, dataframe):
        """Return the rows of *dataframe* whose ``mail`` value passes ``regex``.

        The input DataFrame is not modified.
        """
        keep = dataframe['mail'].apply(self.regex).astype(bool)
        return dataframe[keep]

    def regex(self, mail):
        """Tell whether *mail* should be kept.

        An address is rejected when one of its words is in
        ``scraper.keywords_to_avoid``, its final ``.xxx`` segment is in
        ``scraper.domains_to_avoid``, the whole address is in
        ``scraper.mails_to_avoid``, or it is shorter than 7 characters.
        """
        words = re.findall(self.regex_words, mail)
        # findall returns a list, so each match is tested individually.
        # NOTE(review): matches keep their leading dot (".com"); this assumes
        # domains_to_avoid is written the same way - confirm against the sheet.
        domains = re.findall(self.regex_domain, mail)

        has_bad_word = any(word in self.scraper.keywords_to_avoid for word in words)
        has_bad_domain = any(domain in self.scraper.domains_to_avoid for domain in domains)
        is_bad_mail = mail in self.scraper.mails_to_avoid
        is_too_short = len(mail) < 7

        return not (has_bad_word or has_bad_domain or is_bad_mail or is_too_short)

    def update_emails(self, dataframe_new_emails, base_url):
        """Merge newly found addresses into ``scraper.emails`` and save them.

        The merged table is de-duplicated on ``mail``, sorted by ``base_url``
        and written to ``emails.csv`` on every call.
        """
        if len(dataframe_new_emails) == 0:
            return

        # assign() works on a copy, so the caller's (possibly sliced) frame is
        # left untouched.
        tagged = dataframe_new_emails.assign(base_url=base_url)
        with self.dataframe_lock:
            merged = pd.concat([self.scraper.emails, tagged], ignore_index=True).drop_duplicates(subset=['mail'])
            self.scraper.emails = merged.sort_values(by='base_url').reset_index(drop=True)
            self.scraper.emails.to_csv('emails.csv', header=True)

    def update_links(self, links):
        """Queue the not-yet-seen URLs from *links*.

        For a site that is already known, a URL is queued only if it has not
        been seen and fewer than 100 URLs are recorded for that site. For a
        site seen for the first time, the URL is queued together with one
        URL per entry of ``scraper.contact_keywords``.
        """
        if len(links) == 0:
            return

        with self.queue_lock:
            seen_links = self.scraper.seen_links
            for val in links:
                _, base_url = self.split_url(val)
                if base_url in seen_links:
                    seen = seen_links[base_url]
                    if val not in seen and len(seen) < 100:
                        self.scraper.links.put([val, base_url])
                        seen.append(val)
                else:
                    self.scraper.links.put([val, base_url])
                    seen_links[base_url] = [val]

                    for keyword in self.scraper.contact_keywords:
                        # Record exactly the URL that is queued so that it is
                        # recognised as seen when met again later.
                        contact_url = base_url + '/' + keyword
                        self.scraper.links.put([contact_url, base_url])
                        seen_links[base_url].append(contact_url)
    


class ScrapingEmails:
    """Multi-threaded e-mail scraper for a list of websites.

    The instance owns the state shared by the worker threads: the queue of
    URLs to visit (``links``), the URLs already recorded per site
    (``seen_links``), the table of collected addresses (``emails``) and the
    per-scope filter lists downloaded from a Google Sheet.

    Args:
        urls: Root URLs of the websites to crawl.
        scope: One of ``italy``, ``spain``, ``france`` or ``uk``
            (case-insensitive, surrounding whitespace ignored).
        nbcrawlers: Number of worker threads to start.

    Raises:
        ValueError: If *scope* is not one of the supported scopes.
    """

    # CSV export of the sheet holding the "to avoid" lists; the tab (gid)
    # depends on the scope.
    _TO_AVOID_URL = 'https://docs.google.com/spreadsheets/d/1ouKXLiPILTSQDg0BSnsu7tJrIwQfjQKNggY06KQdEE8/export?format=csv&gid='

    # Page names appended to each newly discovered site root.
    _DEFAULT_CONTACT_KEYWORDS = [
        'contatti.html', 'dove-siamo-e-contatti/', 'contatti/', 'contact/',
        'contatti', '#contatti', 'contact', '#!/contacts', '#contact',
        'Contatti.aspx', 'chi-siamo/', 'privacy-policy/', '#!/privacy',
        'privacyPolicy', 'cookie-policy/', 'all-rights-reserved-copyright',
        'Cookies.aspx', 'about/',
    ]
    _FRANCE_CONTACT_KEYWORDS = [
        'mentions-legales/', 'contact/', 'informations-legales/',
        'index.php/contact/', 'mentions.html', 'politique-de-confidentialite/',
        'contact.php/', 'contact.htm/', 'contacts.html/', 'contact.html/',
        'cgv/',
    ]

    # scope -> (gid of the "to avoid" tab, contact keywords)
    _SCOPES = {
        'italy': ('939556409', _DEFAULT_CONTACT_KEYWORDS),
        'spain': ('825351815', _DEFAULT_CONTACT_KEYWORDS),
        'france': ('0', _FRANCE_CONTACT_KEYWORDS),
        'uk': ('1873485851', _DEFAULT_CONTACT_KEYWORDS),
    }

    def __init__(self, urls, scope, nbcrawlers = 10):
        # Validate the scope before anything with side effects happens
        # (authentication, spreadsheet creation, downloads).
        scope_key = str(scope).strip().lower()
        if scope_key not in self._SCOPES:
            raise ValueError(
                'Unknown scope {!r}; expected one of: {}'.format(scope, ', '.join(self._SCOPES))
            )
        gid, contact_keywords = self._SCOPES[scope_key]

        self.nbcrawlers = nbcrawlers

        self.urls = urls
        # Every root URL starts out as "seen" for its own site.
        self.seen_links = {val: [val] for val in self.urls}
        self.links = queue.Queue()
        self.threads = []
        self.queue_lock = threading.Lock()
        self.dataframe_lock = threading.Lock()

        self.emails = pd.DataFrame()

        auth.authenticate_user()
        creds, _ = default()
        gc = gspread.authorize(creds)
        # Use the spreadsheet returned by create(): opening by title could
        # pick an older spreadsheet that is also called "emails".
        self.sheet = gc.create("emails").sheet1

        self.contact_keywords = list(contact_keywords)
        self.to_avoid = pd.read_csv(self._TO_AVOID_URL + gid)

        self.mails_to_avoid = self.to_avoid['mails_to_avoid'].dropna().tolist()
        self.keywords_to_avoid = self.to_avoid['keywords_to_avoid'].dropna().tolist()
        self.domains_to_avoid = self.to_avoid['domains_to_avoid'].dropna().tolist()

    def pre_run(self):
        """Fill the queue with the root URLs and start the worker threads."""
        self.filling_queue()
        self.creating_threads()

    def post_run(self):
        """Block until every worker thread has finished."""
        for thread in self.threads:
            thread.join()

    def creating_threads(self):
        """Create and start ``nbcrawlers`` workers, one every 10 seconds."""
        for i in range(self.nbcrawlers):
            thread = myThread(self, i, self.queue_lock, self.dataframe_lock)
            self.threads.append(thread)
            thread.start()
            # NOTE(review): presumably this pause lets already-running workers
            # enqueue discovered links before the next worker starts (a worker
            # stops as soon as it sees an empty queue) - confirm before
            # shortening it.
            time.sleep(10)

    def filling_queue(self):
        """Put every root URL in the queue as a ``[url, base_url]`` pair."""
        for val in self.urls:
            self.links.put([val, val])

    def crawl_url_address(self, url, timeout=10):
        """Download *url* and extract the e-mail addresses found in it.

        Args:
            url: Page to download.
            timeout: Seconds to wait for the server before giving up, so that
                an unresponsive host cannot block a worker forever.

        Returns:
            ``(text, dataframe)``: the page body and a DataFrame with one
            ``mail`` column holding the distinct matches. Obfuscated
            separators such as ``(at)`` or ``(dot)`` are matched as well and
            returned as found in the page.
        """
        response = requests.get(url, timeout=timeout)
        new_emails = re.findall(r'[a-zA-Z0-9_.+-]+(?:@|\(a\)|\(at\)|\( at \)|\( a \)|\( @ \)|&#64;|\( arobase \)|\(arobase\)|\[a\]|\[ a \]|\[at\]|\[ at \])(?!domain|email)[a-zA-Z0-9_.+-]+(?:\.|\( point \)|\(point\)|\(dot\))(?!jpg|jpeg|png|gif|docx)[a-zA-Z]{1,4}', response.text, re.I)
        dataframe_new_emails = pd.DataFrame({'mail': new_emails}).drop_duplicates(subset=['mail'])
        return response.text, dataframe_new_emails

    def crawl_url_link(self, text, base_url, path, max_links=50):
        """Extract up to *max_links* distinct absolute links from HTML *text*.

        Args:
            text: HTML source of the page.
            base_url: ``scheme://netloc`` of the page, used for links starting
                with ``/``.
            path: Directory URL of the page, used for relative links.
            max_links: Maximum number of links returned; when the page holds
                more, a random subset without repetition is returned.

        Returns:
            A list of URL strings (empty when the page has no usable link).
        """
        soup = BeautifulSoup(text, 'html.parser')
        # Guarantees a separator between the page directory and a relative link.
        prefix = path if path.endswith('/') else path + '/'

        links = []
        for anchor in soup.find_all("a"):
            link = (anchor.attrs.get("href") or '').strip()
            # Anchors without a target and non-web schemes cannot be crawled.
            if not link or link.lower().startswith(('mailto:', 'tel:', 'javascript:')):
                continue

            if link.startswith('//'):
                # Protocol-relative link: reuse the scheme of the current page.
                link = urlsplit(base_url).scheme + ':' + link
            elif link.startswith('/'):
                link = base_url + link
            elif not link.startswith('http'):
                link = prefix + link
            links.append(link)

        # Remove duplicates while keeping the order of first appearance.
        links = list(dict.fromkeys(links))
        if len(links) > max_links:
            # replace=False: each link is drawn at most once.
            links = numpy.random.choice(links, size=max_links, replace=False).tolist()
        return links
    
# Load the list of websites to crawl; malformed CSV lines are skipped.
ndd = pd.read_csv(url_link, on_bad_lines='skip', header=0)
# Reduce every entry of the URL column to its "scheme://host" root and keep
# each root once. Blank cells are dropped first because urlsplit cannot
# handle the NaN that pandas puts in their place.
url = (
    ndd['URL']
    .dropna()
    .astype(str)
    .apply(lambda x: "{0.scheme}://{0.netloc}".format(urlsplit(x.strip())))
    .unique()
    .tolist()
)
# An entry written without "http(s)://" has neither scheme nor host and
# collapses to "://", which cannot be crawled.
url = [root for root in url if root != '://']

scraper = ScrapingEmails(url, scope)
scraper.pre_run()   # fills the queue and starts the worker threads
scraper.post_run()  # waits for all workers to finish

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m

Queue Size: 3318
Thread 9 scraping : https://www.monde-diplomatique.fr/2019/04/BRISSAUD/59741
0 emails found
50 links found
Queue Size: 3318
Thread 5 scraping : https://www.monde-diplomatique.fr/publications
0 emails found
50 links found
Queue Size: 3342
Thread 4 scraping : https://www.monde-diplomatique.fr/int#Macédoine
50 links found
Queue Size: 3341
Thread 1 scraping : https://www.monde-diplomatique.fr/index.php/contact/ecouter/Podcast_diplo/2022-04-Endeweld_diplo.mp3
50 links found
Queue Size: 3354
Thread 0 scraping : https://www.monde-diplomatique.fr/diplo/apropos
0 emails found
50 links found
Queue Size: 3367
Thread 2 scraping : https://www.monde-diplomatique.fr/revues/foreignaffairs
3 emails found
0 emails found
50 links found
Queue Size: 3379
Thread 8 scraping : https://www.monde-diplomatique.fr/revues/newleftreview
0 emails found
50 links found0 emails found

Queue Size: 3391
Thread 3 

KeyboardInterrupt: ignored