# **Purpose: Collect news article titles about a currency pair and get the associated price movement (up or down)**

****

In [None]:
!pip install beautifulsoup4 requests

In [None]:
!pip install investpy

**Get the titles of relevant news articles from www.investing.com**

In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime

def get_forex_news_five_years(currency_pair, max_page=10, max_age_years=5, timeout=30):
    """Scrape news headlines for a currency pair from investing.com and save them to CSV.

    Pages ``<pair>-news``, ``<pair>-news/1``, ``<pair>-news/2`` ... are fetched in
    order. Paging stops after the first page containing an article whose
    publication year is at least ``max_age_years`` before the current year, or
    after page index ``max_page``, whichever comes first.

    The result is written to ``currency_pair_info<c1>_<c2>.csv`` in the working
    directory, with columns ``Title`` and ``Date`` (``YYYY-MM-DD`` strings).

    Args:
        currency_pair: Pair slug as used in the site URL, e.g. ``'gbp-usd'``.
        max_page: Highest page index to request (inclusive).
        max_age_years: Difference in calendar years at which an article is
            considered too old; such articles are skipped and paging stops.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        RuntimeError: If a page cannot be fetched or its HTML does not have the
            expected structure (news list, title link, ``<time>`` tag).
    """
    # Imported locally on purpose: in this notebook pandas is only imported by a
    # later cell, and that cell also rebinds the global name `datetime` to the
    # module, which would break `datetime.strptime` on a re-run.
    from datetime import datetime
    import pandas as pd

    headers = {"User-Agent": "Mozilla/5.0"}
    base_url = 'https://www.investing.com/currencies/' + currency_pair + '-news'
    current_year = datetime.now().year

    article_titles = []
    article_dates = []
    stop_flag = False
    page_count = 0
    # NOTE(review): page index 1 may serve the same content as the unnumbered
    # first page, which would duplicate rows -- confirm against the site.
    while not stop_flag and page_count <= max_page:
        url = base_url if page_count == 0 else base_url + '/' + str(page_count)
        response = requests.get(url, headers=headers, timeout=timeout)
        print('url:', url)
        if response.status_code != 200:
            raise RuntimeError(
                f"couldn't get page info: {url} (HTTP {response.status_code})"
            )

        soup = BeautifulSoup(response.text, "html.parser")
        news_list = soup.find('ul', attrs={'data-test': 'news-list'})
        if news_list is None:
            raise RuntimeError(f"couldn't find the news list on {url}")

        for article in news_list.find_all('article'):
            div_tag = article.find('div', class_='block w-full sm:flex-1')
            if div_tag is None:
                raise RuntimeError(
                    f"couldn't find div tag of class 'block w-full sm:flex-1' on {url}"
                )

            # The first link inside the block carries the headline text.
            a_tag = div_tag.find('a', href=True)
            if a_tag is None:
                raise RuntimeError(f"couldn't find an <a> tag with href on {url}")
            article_title = a_tag.get_text(strip=True)

            time_tag = div_tag.find('time', datetime=True)
            if time_tag is None:
                raise RuntimeError(
                    f"no <time> tag with datetime attribute found on {url}"
                )

            # Keep only the date part of the attribute (drops the time of day).
            datetime_value = time_tag['datetime'].split()[0]
            published = datetime.strptime(datetime_value, '%Y-%m-%d').date()

            # `>=` rather than `==`: if the feed skips a calendar year the
            # cutoff must still trigger, and older articles must not be kept.
            if current_year - published.year >= max_age_years:
                stop_flag = True
            else:
                article_titles.append(article_title)
                article_dates.append(datetime_value)

        page_count += 1

    df = pd.DataFrame({
        'Title': article_titles,
        'Date': article_dates,
    })
    pair_parts = currency_pair.split('-')
    str_ = pair_parts[0] + '_' + pair_parts[1]
    df.to_csv('currency_pair_info' + str_ + '.csv', index=False)
    print("saved csv")
    print("done")

# Scrape the GBP/USD headlines; the pair slug is inserted verbatim into the
# investing.com URL, and the result lands in 'currency_pair_infogbp_usd.csv'.
currency_pair = 'gbp-usd'
get_forex_news_five_years(currency_pair)


In [73]:
import pandas as pd
import datetime
import matplotlib.pyplot as plt

def build_dataset(price_csv, titles_csv, pred_horizon, dest_file):
    """Join news headlines with next-trading-day price direction and save as CSV.

    For every date in the price file that has a later date available, one
    output row is produced:

    * ``Titles`` -- all headlines dated within the ``pred_horizon`` days ending
      on that date (inclusive), joined with newlines in titles-file order.
    * ``Label`` -- ``1`` if the close on the next date present in the price
      file is strictly higher than that date's close, otherwise ``0``.

    "Next date" is the next date actually present in the price data, so
    weekends and holidays of any length are skipped uniformly.

    Args:
        price_csv: Path to a CSV with ``Date`` and ``Close`` columns.
        titles_csv: Path to a CSV with ``Date`` and ``Title`` columns.
        pred_horizon: Size of the look-back window for headlines, in days.
        dest_file: Path of the CSV to write (columns ``Titles``, ``Label``).
    """
    df_price = pd.read_csv(price_csv)
    df_titles = pd.read_csv(titles_csv)

    # Reduce both date columns to calendar days (time of day dropped) and keep
    # them as datetime64 so the window test below is a vectorised comparison.
    df_titles['Date'] = pd.to_datetime(pd.to_datetime(df_titles['Date']).dt.date)
    df_price['Date'] = pd.to_datetime(pd.to_datetime(df_price['Date']).dt.date)
    df_price = df_price.dropna(subset=['Date'])

    # Close per trading day; on duplicate dates the first row wins.
    close_by_day = {}
    for day, close in zip(df_price['Date'], df_price['Close']):
        close_by_day.setdefault(day, close)

    # Map each trading day to the next one present in the data. The latest
    # day has no successor and therefore gets no label.
    trading_days = sorted(close_by_day)
    next_trading_day = dict(zip(trading_days, trading_days[1:]))

    horizon = pd.Timedelta(days=pred_horizon)
    title_days = df_titles['Date']
    titles_col = []
    label_col = []
    for today in df_price['Date']:
        following = next_trading_day.get(today)
        if following is None:
            continue

        # Headlines with 0 <= today - title_date < pred_horizon days.
        in_window = (title_days <= today) & (title_days > today - horizon)
        # Empty titles come back from CSV as NaN; drop them so join() works.
        relevant_titles = df_titles.loc[in_window, 'Title'].dropna().astype(str)

        titles_col.append("\n".join(relevant_titles))
        label_col.append(int(close_by_day[following] > close_by_day[today]))

    df = pd.DataFrame({
        'Titles': titles_col,
        'Label': label_col,
    })

    df.to_csv(dest_file, index=False)

# Build the GBP/USD dataset from the price history and the scraped headlines.
price_csv = 'gbp_usd.csv'  # price history; build_dataset reads its 'Date' and 'Close' columns
titles_csv = 'currency_pair_infogbp_usd.csv'  # same name get_forex_news_five_years('gbp-usd') writes
pred_horizon = 5  # look-back window, in days, for headlines attached to each price date
dest_file = "gbp_usd_dataset.csv"  # output CSV with 'Titles' and 'Label' columns

build_dataset(price_csv, titles_csv, pred_horizon, dest_file)