# Email Validation
- Please use this notebook to complete assignment two of the data engineering take home challenge.
- The goal is to validate email addresses for all companies with `HIGH` buying intent.
- Please sign up for a free account at AbstractAPI and use their API for email validation (https://docs.abstractapi.com/email-validation, the free account covers 100 email verifications, which should be plenty).
- Your final output should be 2-column table (`email`,`is_email_valid`) and it should not take you longer than 1h to get there.

In [26]:
import duckdb
con = duckdb.connect("../data/db_src.duckdb")
import requests
import json
import time


In [4]:
validation_candidates = con.sql(
    """
    select
        co.email
    from crm.contacts as co
    left join crm.companies as c
        on co.company_id = c.id
    left join crm.customer_buying_intent as bi
        on bi.company_id = c.id
    where bi.buying_intent = 'HIGH'
    """
).to_df()


In [16]:
email_list = validation_candidates.email.to_list()
email_list

['alvarado.alvarado@becker-moore.net',
 'love.love@becker-moore.net',
 'holloway.holloway@crawford.com',
 'gillespie.gillespie@pena-schmidt.com',
 'buck.buck@malone.com',
 'lopez.lopez@malone.com',
 'tucker.tucker@waters.com',
 'lamb.lamb@waters.com',
 'white.white@cummings.net',
 'rodriguez.rodriguez@collins.com',
 'williams.williams@brock.com']

In [28]:
# Please use the space below to express your email validation logic in python and write it back into the CRM database.
url = "https://emailvalidation.abstractapi.com/v1/?api_key=bc50abf0b3d7474fa781b22befb11a20&email=edin.kciku94@gmail.com"
# print(url.split("email="))
base_url = url.split("email=")[0] + "email="

def get_valid_email(email, base_url):
    whole_url = base_url + email
    response = requests.get(whole_url)
    assert response.status_code == 200, f"{response.status_code}"
    time.sleep(5)
    json_str = response.content.decode('utf-8')
    data = json.loads(json_str)
    if data["deliverability"] == "DELIVERABLE": 
        return True
    else: 
        return False
   

is_email_valid = [get_valid_email(email, base_url) for email in email_list]



In [33]:
validation_candidates["is_email_valid"] = is_email_valid

In [35]:
con.sql("create table crm.valid_emails as from validation_candidates")


In [37]:
con.close()

In [None]:
# Backup: In case you get stuck above, you can run this cell to import a list of validated email addresses from pre-made csv file
con.sql("create table crm.valid_emails as from read_csv('data/valid_emails.csv')")
