In [28]:
import json
from io import StringIO
from urllib.parse import quote

import pandas as pd
import pymysql.cursors
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

# CPA Scraper

Scrape CPA data from https://mystaticwebsite-3.s3.amazonaws.com/index.html, clean it, and load the resulting table into a MySQL database.

In [29]:
data_url = "https://mystaticwebsite-3.s3.amazonaws.com/index.html"

# Bound the wait and surface HTTP errors, so that a hung connection or an
# error page is reported here instead of being parsed as if it were the data.
response = requests.get(data_url, timeout=30)
response.raise_for_status()
data = response.text

soup = BeautifulSoup(data, "html.parser")
if soup.h1 is None:
    raise ValueError(f"No <h1> heading found at {data_url}; cannot derive the table name")
web_title = soup.h1.text

# The last two words of the heading are taken as the month and the year,
# producing a table name such as "oct_21_cpa_avgs".
title_words = web_title.split()
if len(title_words) < 2:
    raise ValueError(f"Expected the heading to end with '<month> <year>', got {web_title!r}")
month, year = title_words[-2:]
table_title = f"{month[:3].lower()}_{year[-2:]}_cpa_avgs"

# read_html expects a path, URL or file-like object; passing the raw HTML
# string directly is deprecated in recent pandas, hence the StringIO wrapper.
df = pd.read_html(StringIO(data))[0]
print(table_title)
df


oct_21_cpa_avgs


Unnamed: 0,Region,Average CPA by Region
0,South East,£10.00
1,North East,£10.00
2,North,£8.00
3,South,£10.00


In [30]:
# Clean up columns for consistency with the database table.
#
# NOTE: this cell overwrites `df` and renames its columns, so it is meant to
# run once after the scrape cell. Running it a second time raises KeyError
# because the "Region" column no longer exists; re-run the scrape cell first.

# Prefix every region name, e.g. "South East" -> "UK South East".
df["Region"] = "UK " + df["Region"]

# Convert the price text to a float. Everything that is not a digit, a decimal
# point or a minus sign is removed, so the currency symbol and any thousands
# separators are dropped without assuming the symbol is exactly one character
# (blindly slicing off the first character would eat a digit if it were absent).
df["Average CPA by Region"] = (
    df["Average CPA by Region"]
    .str.replace(r"[^0-9.\-]", "", regex=True)
    .astype("float64")
)

# Use the column names expected by the CREATE TABLE statement further down.
df = df.rename(columns={"Region": "uk_region", "Average CPA by Region": "cpa_average"})

In [32]:
# Load the database credentials from a local JSON file (keys read below:
# host, user, pass, db_name) so they are not hardcoded in the notebook.
with open("db_login.json", "r") as f:
    login = json.load(f)

# table_title is derived from scraped page content and has to be interpolated
# into the DDL below (identifiers cannot be bound as query parameters), so
# reject anything that is not plain ASCII letters, digits and underscores.
if not (table_title.isascii() and table_title.replace("_", "").isalnum()):
    raise ValueError(f"Unsafe table name derived from the page title: {table_title!r}")

# A non-empty ssl dict is what switches TLS on for PyMySQL; the key itself is
# a placeholder.
# see link https://stackoverflow.com/questions/55617520/unable-to-make-tls-tcp-connection-to-remote-mysql-server-with-pymysql-other-too
tls_args = {"fake_flag_to_enable_tls": True}

conn = pymysql.connect(
    host=login['host'],
    user=login['user'],
    password=login['pass'],
    database=login['db_name'],
    cursorclass=pymysql.cursors.DictCursor,
    ssl=tls_args,
)

# Percent-encode the credentials: characters such as "@", ":" or "/" in a
# user name or password would otherwise corrupt the connection URL.
url_user = quote(login['user'], safe="")
url_pass = quote(login['pass'], safe="")
engine = create_engine(
    f"mysql+pymysql://{url_user}:{url_pass}@{login['host']}/{login['db_name']}",
    connect_args={"ssl": tls_args},
)

try:
    # Create the monthly table if it does not exist yet.
    with conn:
        with conn.cursor() as cur:
            cur.execute(f"""
                        CREATE TABLE IF NOT EXISTS `{table_title}`(
                        uk_region VARCHAR(20) PRIMARY KEY,
                        cpa_average DOUBLE
            );""")
        conn.commit()

    # Append the scraped rows through the SQLAlchemy engine.
    # NOTE(review): uk_region is the primary key, so appending the same
    # regions to an existing table a second time is expected to fail with a
    # duplicate-key error - confirm whether re-runs for the same month should
    # be supported.
    df.to_sql(table_title, engine, if_exists='append', index=False)
finally:
    # Release the engine's pooled connections once the load is finished.
    engine.dispose()