# Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import datetime
import logging
import pandas as pd
import csv
# from sqlalchemy import create_engine
# import sqlite3

# Getting economic news data from forexfactory.com

- https://www.forexfactory.com/

In [2]:
forcal = []

def setLogger():
    logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    filename='logs_file',
                    filemode='w')
    console = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

def getEconomicCalendar(startlink,endlink):

    # write to console current status
    logging.info("Scraping data for link: {}".format(startlink))

    # get the page and make the soup
    baseURL = "https://www.forexfactory.com/"
    r = requests.get(baseURL + startlink)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")

    # get and parse table data, ignoring details and graph
    table = soup.find("table", class_="calendar__table")

    # do not use the ".calendar__row--grey" css selector (reserved for historical data)
    trs = table.select("tr.calendar__row.calendar_row")
    fields = ["date","time","currency","impact","event","actual","forecast","previous"]

    # some rows do not have a date (cells merged)
    curr_year = startlink[-4:]
    curr_date = ""
    curr_time = ""
    for tr in trs:
        dict = {}

        # fields may mess up sometimes, see Tue Sep 25 2:45AM French Consumer Spending
        # in that case we append to errors.csv the date time where the error is
        try:
            for field in fields:
                data = tr.select("td.calendar__cell.calendar__{}.{}".format(field,field))[0]
                # print(data)
                if field=="date" and data.text.strip()!="":
                    curr_date = data.text.strip()
                elif field=="time" and data.text.strip()!="":
                    # time is sometimes "All Day" or "Day X" (eg. WEF Annual Meetings)
                    if data.text.strip().find("Day")!=-1:
                        curr_time = "12:00am"
                    else:
                        curr_time = data.text.strip()
                elif field=="currency":
                    currency = data.text.strip()
                elif field=="impact":
                    # when impact says "Non-Economic" on mouseover, the relevant
                    # class name is "Holiday", thus we do not use the classname
                    impact = data.find("span")["title"]
                elif field=="event":
                    event = data.text.strip()
                elif field=="actual":
                    actual = data.text.strip()
                elif field=="forecast":
                    forecast = data.text.strip()
                elif field=="previous":
                    previous = data.text.strip()

            date = datetime.datetime.strptime(",".join([curr_year,curr_date,curr_time]),"%Y,%a%b %d,%I:%M%p")
            # date = datetime.datetime.strptime(",".join([curr_year,curr_date,curr_time]),"%Y,%a%b %d,%I:%M%p")
            # date = datetime.datetime.strptime(",".join([curr_year,curr_date]),"%Y,%a%b")
            # time = datetime.datetime.strptime(curr_time, "%d,%I:%M%p")

            dict["Date"] = date.strftime("%Y-%m-%d %H:%M:%S")
            dict["Currency"] = currency
            dict["Impact"] = impact
            dict["Event"] = event
            dict["Actual"] = actual
            dict["Forecast"] = forecast
            dict["Previous"] = previous


            forcal.append(dict)
            # forcal.append(",".join([str(dt),currency,impact,event,actual,forecast,previous]))


        except:
            with open("errors.csv","a") as f:
                csv.writer(f).writerow([curr_year,curr_date,curr_time])

    # exit recursion when last available link has reached
    if startlink==endlink:
        logging.info("Successfully retrieved data")
        return

    # get the link for the next week and follow
    follow = soup.select("a.calendar__pagination.calendar__pagination--next.next")
    follow = follow[0]["href"]
    getEconomicCalendar(follow,endlink)

Original idea
 - https://gist.github.com/pohzipohzi/ad7942fc5545675022c1f31123e64c0c

# Initialize function

In [3]:
setLogger()
getEconomicCalendar("calendar?day=mar01.2020","calendar?day=apr25.2020")

2020-04-27 19:56:16,658 - INFO - Scraping data for link: calendar?day=mar01.2020
2020-04-27 19:56:17,722 - INFO - Scraping data for link: calendar?day=mar2.2020
2020-04-27 19:56:18,669 - INFO - Scraping data for link: calendar?day=mar3.2020
2020-04-27 19:56:19,690 - INFO - Scraping data for link: calendar?day=mar4.2020
2020-04-27 19:56:20,661 - INFO - Scraping data for link: calendar?day=mar5.2020
2020-04-27 19:56:21,636 - INFO - Scraping data for link: calendar?day=mar6.2020
2020-04-27 19:56:22,655 - INFO - Scraping data for link: calendar?day=mar7.2020
2020-04-27 19:56:23,569 - INFO - Scraping data for link: calendar?day=mar8.2020
2020-04-27 19:56:24,460 - INFO - Scraping data for link: calendar?day=mar9.2020
2020-04-27 19:56:25,423 - INFO - Scraping data for link: calendar?day=mar10.2020
2020-04-27 19:56:26,338 - INFO - Scraping data for link: calendar?day=mar11.2020
2020-04-27 19:56:27,396 - INFO - Scraping data for link: calendar?day=mar12.2020
2020-04-27 19:56:28,526 - INFO - Scr

# Creating dataframe

In [4]:
df = pd.DataFrame(forcal)
df

Unnamed: 0,Date,Currency,Impact,Event,Actual,Forecast,Previous
0,2020-03-01 17:30:00,AUD,Low Impact Expected,AIG Manufacturing Index,44.3,,45.4
1,2020-03-01 17:45:00,NZD,Low Impact Expected,Overseas Trade Index q/q,2.6%,0.8%,1.7%
2,2020-03-01 19:50:00,JPY,Low Impact Expected,Capital Spending q/y,-3.5%,-2.5%,7.1%
3,2020-03-01 20:00:00,AUD,Low Impact Expected,MI Inflation Gauge m/m,-0.1%,,0.3%
4,2020-03-01 20:30:00,AUD,Medium Impact Expected,Company Operating Profits q/q,-3.5%,-1.2%,-0.6%
...,...,...,...,...,...,...,...
746,2020-04-24 08:30:00,USD,Medium Impact Expected,Durable Goods Orders m/m,-14.4%,-12.0%,1.1%
747,2020-04-24 08:57:00,EUR,Low Impact Expected,Belgian NBB Business Climate,-36.1,-21.0,-10.9
748,2020-04-24 10:00:00,USD,Low Impact Expected,Revised UoM Consumer Sentiment,71.8,67.8,71.0
749,2020-04-24 10:00:00,USD,Low Impact Expected,Revised UoM Inflation Expectations,2.1%,,2.1%


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 751 entries, 0 to 750
Data columns (total 7 columns):
Date        751 non-null object
Currency    751 non-null object
Impact      751 non-null object
Event       751 non-null object
Actual      751 non-null object
Forecast    751 non-null object
Previous    751 non-null object
dtypes: object(7)
memory usage: 41.2+ KB


In [6]:
import numpy as np

# Creating surprises column

- Dropping missing values rows

In [7]:
df.isnull().sum()

Date        0
Currency    0
Impact      0
Event       0
Actual      0
Forecast    0
Previous    0
dtype: int64

In [8]:
def missing_values_to_nan(column_name):
    return df[column_name].replace('', np.nan, inplace=True)

missing_values_to_nan('Forecast')
missing_values_to_nan('Actual')
missing_values_to_nan('Previous')

In [9]:
df.isnull().sum()

Date          0
Currency      0
Impact        0
Event         0
Actual      170
Forecast    296
Previous    170
dtype: int64

In [10]:
def drop_nan(column_name):
    return df.dropna(subset=[column_name], inplace=True)

drop_nan('Forecast')

In [11]:
df.isnull().sum()


Date        0
Currency    0
Impact      0
Event       0
Actual      0
Forecast    0
Previous    0
dtype: int64

- Checking data types

In [12]:
df.dtypes

Date        object
Currency    object
Impact      object
Event       object
Actual      object
Forecast    object
Previous    object
dtype: object

- Converting from object to float

In [13]:
df['Forecast'].head(20)

1      0.8%
2     -2.5%
4     -1.2%
6      47.6
7      46.1
9      48.9
10     47.9
11     48.0
12     49.7
13     47.8
14     49.1
15     51.9
16     0.2%
17      68K
18     5.9B
20     50.8
21     50.5
22     0.6%
23     51.2
24     3.0%
Name: Forecast, dtype: object

In [14]:
df['Forecast'] = df.Forecast.str.replace('%', '').str.replace('K', '').str.replace('B', '').str.replace('M', '').str.replace('T', '').str.replace('-', '').astype(float)
df['Actual'] = df.Actual.str.replace('%', '').str.replace('K', '').str.replace('B', '').str.replace('M', '').str.replace('T', '').str.replace('-', '').astype(float)
df['Previous'] = df.Previous.str.replace('%', '').str.replace('K', '').str.replace('B', '').str.replace('M', '').str.replace('T', '').str.replace('-', '').astype(float)

In [15]:
df['Forecast'].head(20)

1      0.8
2      2.5
4      1.2
6     47.6
7     46.1
9     48.9
10    47.9
11    48.0
12    49.7
13    47.8
14    49.1
15    51.9
16     0.2
17    68.0
18     5.9
20    50.8
21    50.5
22     0.6
23    51.2
24     3.0
Name: Forecast, dtype: float64

In [17]:
df['Date'] =  pd.to_datetime(df['Date'], format='%Y-%m-%d %H:%M:%S.%f')

In [28]:
# df['Day'] = pd.DatetimeIndex(df['Date']).day
# df

Unnamed: 0,Date,Currency,Impact,Event,Actual,Forecast,Previous,Day
1,2020-03-01 17:45:00,NZD,Low Impact Expected,Overseas Trade Index q/q,2.6,0.8,1.7,1
2,2020-03-01 19:50:00,JPY,Low Impact Expected,Capital Spending q/y,3.5,2.5,7.1,1
4,2020-03-01 20:30:00,AUD,Medium Impact Expected,Company Operating Profits q/q,3.5,1.2,0.6,1
6,2020-03-01 20:30:00,JPY,Low Impact Expected,Final Manufacturing PMI,47.8,47.6,47.6,1
7,2020-03-01 21:45:00,CNY,High Impact Expected,Caixin Manufacturing PMI,40.3,46.1,51.1,1
...,...,...,...,...,...,...,...,...
744,2020-04-24 04:00:00,EUR,High Impact Expected,German ifo Business Climate,74.3,79.8,85.9,24
745,2020-04-24 08:30:00,USD,High Impact Expected,Core Durable Goods Orders m/m,0.2,6.1,0.7,24
746,2020-04-24 08:30:00,USD,Medium Impact Expected,Durable Goods Orders m/m,14.4,12.0,1.1,24
747,2020-04-24 08:57:00,EUR,Low Impact Expected,Belgian NBB Business Climate,36.1,21.0,10.9,24


In [29]:
df.dtypes

Date        datetime64[ns]
Currency            object
Impact              object
Event               object
Actual             float64
Forecast           float64
Previous           float64
Day                  int64
dtype: object

In [39]:
def filtration(column_name1, criteria1, column_name2, criteria2):
    return df[df[column_name1].str.contains(criteria1) & df[column_name2].str.contains(criteria2)]

filtration("Event", "Final Manufacturing PMI", "Currency", "EUR")

Unnamed: 0,Date,Currency,Impact,Event,Actual,Forecast,Previous,Day
12,2020-03-02 04:50:00,EUR,Low Impact Expected,French Final Manufacturing PMI,49.8,49.7,49.7,2
13,2020-03-02 04:55:00,EUR,Low Impact Expected,German Final Manufacturing PMI,48.0,47.8,47.8,2
14,2020-03-02 05:00:00,EUR,Low Impact Expected,Final Manufacturing PMI,49.2,49.1,49.1,2
447,2020-04-01 03:50:00,EUR,Low Impact Expected,French Final Manufacturing PMI,43.2,42.9,42.9,1
448,2020-04-01 03:55:00,EUR,Low Impact Expected,German Final Manufacturing PMI,45.4,45.6,45.7,1
449,2020-04-01 04:00:00,EUR,Low Impact Expected,Final Manufacturing PMI,44.5,44.7,44.8,1


# Next step import stock index prices...