[Reference](https://towardsdev.com/data-engineering-project-web-scraping-into-a-mysql-database-e93efaffde73)

In [1]:
!pip install pandasql

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandasql
  Downloading pandasql-0.7.3.tar.gz (26 kB)
Building wheels for collected packages: pandasql
  Building wheel for pandasql (setup.py) ... [?25l[?25hdone
  Created wheel for pandasql: filename=pandasql-0.7.3-py3-none-any.whl size=26784 sha256=e3e08e1a7c383d48fccb31d73ac8641f9e6a6ec1bdabdd83e848f3c67f57e88b
  Stored in directory: /root/.cache/pip/wheels/5c/4b/ec/41f4e116c8053c3654e2c2a47c62b4fca34cc67ef7b55deb7f
Successfully built pandasql
Installing collected packages: pandasql
Successfully installed pandasql-0.7.3


In [2]:
!pip install pymysql

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymysql
  Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 1.9 MB/s 
[?25hInstalling collected packages: pymysql
Successfully installed pymysql-1.0.2


In [3]:
import json
import os
from datetime import datetime

import matplotlib.dates as md
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandasql as ps
import pymysql
import requests
import seaborn as sns
from bs4 import BeautifulSoup

sns.set_style('white')

In [4]:
# One datetime per calendar day, from 1 Jan 2018 through 1 Jan 2021 (both ends included)
dates = list(
    pd.date_range(start='1/1/2018', end='1/1/2021').to_pydatetime()
)

In [5]:
# Echo the generated list of daily datetimes as the cell output
dates

[datetime.datetime(2018, 1, 1, 0, 0),
 datetime.datetime(2018, 1, 2, 0, 0),
 datetime.datetime(2018, 1, 3, 0, 0),
 datetime.datetime(2018, 1, 4, 0, 0),
 datetime.datetime(2018, 1, 5, 0, 0),
 datetime.datetime(2018, 1, 6, 0, 0),
 datetime.datetime(2018, 1, 7, 0, 0),
 datetime.datetime(2018, 1, 8, 0, 0),
 datetime.datetime(2018, 1, 9, 0, 0),
 datetime.datetime(2018, 1, 10, 0, 0),
 datetime.datetime(2018, 1, 11, 0, 0),
 datetime.datetime(2018, 1, 12, 0, 0),
 datetime.datetime(2018, 1, 13, 0, 0),
 datetime.datetime(2018, 1, 14, 0, 0),
 datetime.datetime(2018, 1, 15, 0, 0),
 datetime.datetime(2018, 1, 16, 0, 0),
 datetime.datetime(2018, 1, 17, 0, 0),
 datetime.datetime(2018, 1, 18, 0, 0),
 datetime.datetime(2018, 1, 19, 0, 0),
 datetime.datetime(2018, 1, 20, 0, 0),
 datetime.datetime(2018, 1, 21, 0, 0),
 datetime.datetime(2018, 1, 22, 0, 0),
 datetime.datetime(2018, 1, 23, 0, 0),
 datetime.datetime(2018, 1, 24, 0, 0),
 datetime.datetime(2018, 1, 25, 0, 0),
 datetime.datetime(2018, 1, 26, 0,

In [6]:
# Download the EUR-based rate table for every day in `dates`.
# `data` maps each datetime to the parsed response; the value is kept as a BeautifulSoup
# object because the extraction step reads the raw body back through its `.text` attribute.

# The access key is read from the environment so it is never saved inside the notebook.
API_KEY = os.environ['EXCHANGERATES_API_KEY']

data = {}
# A single Session reuses the HTTP connection across the ~1,100 requests.
with requests.Session() as session:
    for day in dates:
        url = 'http://api.exchangeratesapi.io/v1/' + day.strftime("%Y-%m-%d")
        r = session.get(url, params={'access_key': API_KEY}, timeout=30)
        # Stop on an HTTP error (bad key, quota exceeded, ...) instead of storing the error body.
        r.raise_for_status()
        # Naming the parser avoids bs4's "no parser was explicitly specified" warning and
        # makes the result independent of which optional parsers are installed.
        data[day] = BeautifulSoup(r.text, 'html.parser')

In [7]:
# Shallow copy of the downloaded responses, kept as a safety net
backupData = dict(data)

In [8]:
# Build {date string: EUR/USD rate} from the downloaded responses.
EUR_USD_rates = {}

for day in dates:
    # The response body is JSON, so parse it properly rather than slicing the text between
    # hard-coded neighbours such as '"USD":' and ',"UYU"', which breaks as soon as the
    # set or order of currencies changes.
    payload = json.loads(data[day].text)

    # A failed call has no usable rate; fail loudly with the day and the API's own error.
    if 'USD' not in payload.get('rates', {}):
        raise ValueError(
            "No USD rate in the API response for {}: {}".format(
                day.strftime("%Y-%m-%d"), payload.get('error')
            )
        )

    # Key by the date reported by the API, value is the EUR/USD rate as a float
    EUR_USD_rates[payload['date']] = float(payload['rates']['USD'])

In [9]:
# Turn the {date: rate} mapping into a one-column DataFrame indexed by date
df = pd.DataFrame(
    list(EUR_USD_rates.values()),
    index=list(EUR_USD_rates.keys()),
    columns=['Rates'],
)

# Persist the frame to disk as CSV
df.to_csv('EUR_USD_Rates.csv')

In [10]:
# Open the MySQL connection used by the following cells
# (pymysql is already imported in the import cell at the top of the notebook).
connection = pymysql.connect(host='localhost',
                             user='root',
                             # Read the password from the environment so it is never stored in the notebook.
                             password=os.environ['MYSQL_PASSWORD'],
                             # `database` is the current keyword; `db` is only a deprecated alias.
                             database='forexwarehouse')

# Cursor shared by the table-creation, insert and select cells
cursor = connection.cursor()

In [11]:
# Create the target table. IF NOT EXISTS keeps this cell re-runnable: without it a second
# run fails because the table is already there.
sql = "CREATE TABLE IF NOT EXISTS rates(day VARCHAR(50) NOT NULL, rate FLOAT NOT NULL)"

# Executing Query
cursor.execute(sql)

# Commit the Query
connection.commit()

In [12]:
# Pair every day with its rate for insertion into MySQL.
# `df` has a single 'Rates' column and keeps the dates in its index, so positional column
# access (iloc[:, 0] / iloc[:, 1]) would return the rates as "dates" and then raise IndexError.
# tolist() also converts NumPy scalars to plain Python values for the database driver.
dates = df.index.tolist()
rates = df['Rates'].tolist()
data_tuple = tuple(zip(dates, rates))

In [13]:
# Insert every (day, rate) pair into the rates table.
sql = "INSERT INTO rates (`day`, `rate`) VALUES (%s, %s)"

try:
    # executemany sends all rows through one call and, unlike a `for data in ...` loop,
    # does not overwrite the `data` dictionary that holds the downloaded responses.
    cursor.executemany(sql, data_tuple)

    # Commit the transaction
    connection.commit()
except Exception:
    # Leave the table untouched if any row fails, then surface the original error.
    connection.rollback()
    raise

In [14]:
# Fetch the complete contents of the rates table
sql = "select * from rates"
cursor.execute(sql)
result = cursor.fetchall()

# Show a ten-row preview
for row in result[:10]:
    print(row)

In [15]:
# Read the saved rates back; the unnamed first CSV column holds the dates, so label it 'day'
df = pd.read_csv('EUR_USD_Rates.csv').rename(columns={'Unnamed: 0': 'day'})

# Summary statistics of the numeric columns
df.describe()

In [16]:
# Line chart of the daily EUR/USD rate.
fig, ax = plt.subplots(figsize = [22,8])

# After the CSV round trip the dates live in the 'day' column as plain strings (there is
# no `date` attribute or 'index' column), so convert them to datetimes before plotting;
# the date locators and formatter below need a real date axis.
rates_by_day = df.assign(day=pd.to_datetime(df['day']))
sns.lineplot(data=rates_by_day, x='day', y='Rates', ax=ax)

# Major ticks on the Monday of every week (an integer weekday of 1 would mean Tuesday),
# labelled as 'year-month-day'
ax.xaxis.set_major_locator(md.WeekdayLocator(byweekday = md.MO))
ax.xaxis.set_major_formatter(md.DateFormatter('%Y-%m-%d'))

# Minor ticks every 100 days
ax.xaxis.set_minor_locator(md.DayLocator(interval = 100))

# labelrotation turns the date labels vertical so they do not overlap; setting it through
# tick_params also applies to ticks created after this call.
ax.tick_params(axis = 'x', which = 'major', length = 10, labelrotation = 90)
ax.tick_params(axis = 'x', which = 'minor', length = 5)

ax.set_xlabel('Date', size = 16)
ax.set_ylabel('Rates', size = 16)
ax.set_title('Trending Rates - EUR/USD - By Day', size = 22)
plt.show()

In [17]:
# Derive calendar columns from the 'day' column. The frame was reloaded from CSV, so its
# index is just 0..N-1; building a DatetimeIndex from it would read those integers as
# timestamps and put every row in January 1970.
day_as_datetime = pd.to_datetime(df['day'])

# Creating a Month column
df['month'] = day_as_datetime.dt.month

# Creating a Year column
df['year'] = day_as_datetime.dt.year

# Sample Data
df.head()

In [18]:
# Box plot of the rate distribution for each calendar month
sns.catplot(kind = 'box', x = 'month', y = 'Rates', data=df.reset_index())

# Label the axes that catplot has just drawn on
month_box_ax = plt.gca()
month_box_ax.set_xlabel('Month', size = 16)
month_box_ax.set_ylabel('Rates', size = 16)
month_box_ax.set_title('Trending Rates - EUR/USD - By Month', size = 22)
plt.show()

In [19]:
# Point plot of the rate for each calendar month
sns.catplot(kind = 'point', x = 'month', y = 'Rates', data=df.reset_index())

# Label the axes that catplot has just drawn on
month_point_ax = plt.gca()
month_point_ax.set_xlabel('Month', size = 16)
month_point_ax.set_ylabel('Rates', size = 16)
month_point_ax.set_title('Trending Rates - EUR/USD - By Month', size = 22)
plt.show()

In [20]:
# Same monthly point plot, drawn as one panel per year
sns.catplot(
    kind = 'point',
    col = 'year',
    x = 'month',
    y = 'Rates',
    data=df.reset_index(),
)
plt.show()

In [21]:
# Two-dimensional density of the rate against the calendar month
fig, ax = plt.subplots(figsize = [18,8])
sns.kdeplot(x = 'month', y = 'Rates', data = df, ax = ax)
plt.show()

In [22]:
def sql(query):
    """Run a SQL query with pandasql and return the result as a DataFrame.

    The notebook's global namespace is passed explicitly, so table names in the
    query (such as ``df``) refer to the DataFrames defined in earlier cells.
    """
    return ps.sqldf(query, globals())


def rates_query(year):
    """Return the SQL text that selects the Rates of ``df`` for a single year."""
    # int() ensures only a plain number is ever interpolated into the SQL text.
    return '''
select Rates
from df
where year = {}
'''.format(int(year))


# One query per year, built from the same template instead of three pasted copies
query1 = rates_query(2018)
query2 = rates_query(2019)
query3 = rates_query(2020)

df_2018 = sql(query1)
df_2019 = sql(query2)
df_2020 = sql(query3)

In [23]:
# Overlay the rate distribution of each year on one set of axes.
fig, ax = plt.subplots(figsize = [18,8])

for year_label, yearly_rates in (('2018', df_2018), ('2019', df_2019), ('2020', df_2020)):
    # `fill` replaces the deprecated `shade` argument of kdeplot
    sns.kdeplot(data = yearly_rates, x = 'Rates', fill = True, label = year_label, ax = ax)

ax.set_xlabel('Rates', size = 16)
ax.set_title('Distribution of EUR/USD Rates - By Year', size = 22)
ax.legend()
plt.show()