In [1]:
import numpy as np
from utils import *
import pandas as pd
import seaborn as sns
import requests
import re
from bs4 import BeautifulSoup

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from word2number import w2n

from datetime import datetime, timedelta


In [2]:
# Raw inputs live under data_dir; cleaned outputs are written to a sub-folder of it.
data_dir = "./lithium_dataset"
cleaned_data_dir = f"{data_dir}/cleaned_data"

## Price Data Ingestion + Processing
- Consider lithium future, lithium metal spot, Li2CO3, and LiOH prices as potential regressands
- Clean each by calculating all available log returns with lags 1, 7, and 14 days
- According to Professor Geard, we'll only use data from 2021 onward, which gives on average 500-600 datapoints per series
- Store them in jerry/lithium_dataset/cleaned_data

In [3]:
def convert_string_to_numeric(s):
    """
    Convert strings like '5.74M' or '532.50K' to numeric values.

    Thousands separators (commas) are removed first. A magnitude marker of
    'M' (million), 'K' (thousand) or 'B' (billion) scales the number; text
    without a marker is parsed as a plain float.

    Parameters
    ----------
    s : Any
        Value to convert; it is passed through ``str`` before parsing, so
        numbers are accepted as well as strings.

    Returns
    -------
    float
        The parsed value, scaled by the marker if one is present.

    Raises
    ------
    ValueError
        If the text left after removing commas and the marker is not a
        valid float literal.
    """
    text = str(s).replace(',', '')  # Remove commas
    # Markers are checked in this order; the first one found decides the scale.
    for marker, scale in (('M', 1e6), ('K', 1e3), ('B', 1e9)):
        if marker in text:
            return float(text.replace(marker, '')) * scale
    return float(text)


def process_price_dataframe(df, price_cols, date_format, lags):
    """
    Clean a raw price table and append calendar-day lagged prices and log returns.

    Parameters
    ----------
    df : pd.DataFrame
        Raw table containing a 'Date' column and a 'Close' column. The
        caller's frame is not modified; a copy is processed.
    price_cols : list of str
        Columns to coerce to float. Non-numeric columns are parsed value by
        value with ``convert_string_to_numeric``.
    date_format : str
        ``strptime`` format of the 'Date' strings once literal periods have
        been removed (so 'Jan. 02, 2021' can be parsed with '%b %d, %Y').
    lags : iterable of int
        Lags in calendar days.

    Returns
    -------
    pd.DataFrame
        Frame indexed by 'Date' in ascending order. For every lag ``k`` it
        gains ``lag_{k}_price`` (the 'Close' exactly ``k`` calendar days
        earlier, NaN when that date has no row) and ``lag_{k}_log_return``
        (``log(Close / lag_{k}_price)``).

    Raises
    ------
    ValueError
        If a price string cannot be parsed, or if the dates are not unique
        (pandas refuses to reindex on duplicate labels).
    """
    # Work on a copy so the frame passed in by the caller stays as it was.
    df = df.copy()

    # Convert columns in price_cols to float. The decision is made on the
    # column dtype rather than on the first cell, so it does not depend on a
    # row labelled 0 existing or on the first value being representative.
    for col in price_cols:
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].astype(float)
        else:
            df[col] = df[col].apply(convert_string_to_numeric)

    # Processing Dates
    if not pd.api.types.is_datetime64_any_dtype(df['Date']):
        # regex=False: the period must be removed literally, never treated as
        # the regex wildcard (the default for `regex` differs across pandas versions).
        date_text = df['Date'].str.replace('.', '', regex=False)
        df['Date'] = pd.to_datetime(date_text, format=date_format)
    df = df.set_index('Date').sort_index(ascending=True)

    close = df['Close']
    for lag in lags:
        # Align each row with the close observed exactly `lag` calendar days
        # earlier; dates without a row come back as NaN from reindex.
        lagged_dates = df.index - pd.Timedelta(days=lag)
        df[f'lag_{lag}_price'] = close.reindex(lagged_dates).to_numpy()
        df[f'lag_{lag}_log_return'] = np.log(df['Close'] / df[f'lag_{lag}_price'])

    return df

In [4]:
regressands = ['60C-LTCX', '99C-LTCB', '995C-LTCB', '99MIN-LTMT', '999MIN-LTMT']
lags = [1,7,14]

# strptime pattern of the 'Date' column per source; any source not listed
# here (the per-regressand CSVs) uses month/day/year.
date_formats = {
    'li_future': '%b %d, %Y',
    'li2co3': '%Y-%m-%d',
    'lioh': '%Y-%m-%d',
}

# Load every raw series, each reduced or renamed so its price column is 'Close'.
all_data = dict.fromkeys(regressands)
all_data['li_future'] = pd.read_excel(f"{data_dir}/Historical Prices (9).xlsx")[['Date', 'Close']]
all_data['li2co3'] = pd.read_csv(f"{data_dir}/Lithium Carbonate (wind database).csv").rename(columns={'Li2CO3 99%': 'Close'})
all_data['lioh'] = pd.read_csv(f"{data_dir}/Lithium Hydroxide (wind database).csv").rename(columns={'LiOH 56.5%': 'Close'})
for reg in regressands:
    raw_prices = pd.read_csv(f"{data_dir}/{reg}.csv")
    all_data[reg] = raw_prices[['Date', 'Price']].rename(columns={'Price':'Close'})

# Clean each series, add lagged prices / log returns, and write it out.
for reg in all_data:
    all_data[reg] = process_price_dataframe(
        all_data[reg],
        price_cols=['Close'],
        date_format=date_formats.get(reg, '%m/%d/%Y'),
        lags=lags,
    )
    all_data[reg].to_csv(f"{cleaned_data_dir}/{reg}_cleaned.csv")

all_data['li2co3']

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,Close,lag_1_price,lag_1_log_return,lag_7_price,lag_7_log_return,lag_14_price,lag_14_log_return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2005-01-04,23200.0,,,,,,
2005-01-05,23200.0,23200.0,0.000000,,,,
2005-01-06,23200.0,23200.0,0.000000,,,,
2005-01-07,23200.0,23200.0,0.000000,,,,
2005-01-10,23200.0,,,,,,
...,...,...,...,...,...,...,...
2023-05-16,265000.0,258000.0,0.026770,195500.0,0.304169,,
2023-05-17,274000.0,265000.0,0.033398,208500.0,0.273189,,
2023-05-18,290000.0,274000.0,0.056753,225000.0,0.253781,179500.0,0.479706
2023-05-19,292000.0,290000.0,0.006873,242000.0,0.187816,180500.0,0.481023


## News Data Ingestion
- Use lithium_merged.csv as the source of all Lithium-related news
- Keep only each article's URL and tone; the URL is later used to fetch the article text for NLP

In [5]:
# Load the merged news table, index it by its parsed DATE timestamp in
# chronological order, keep rows from 2017-05-02 onward, and retain only the
# article URL and tone under shorter column names.
news = (
    pd.read_csv(f'{data_dir}/lithium_merged.csv')
    .assign(DATE=lambda frame: pd.to_datetime(frame['DATE'], format = '%Y%m%d%H%M%S'))
    .set_index('DATE')
    .sort_index(ascending = True)
    .loc['2017-05-02':, ['DocumentIdentifier', 'V2Tone']]
    .rename(columns={'DocumentIdentifier': 'url', 'V2Tone': 'tone'})
)

display(news)

Unnamed: 0_level_0,url,tone
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-05-02 13:30:00,https://www.insiderfinancial.com/lithium-x-ene...,0.818554
2017-05-02 15:30:00,http://www.prnewswire.com/news-releases/hotter...,-1.374570
2017-05-03 06:00:00,http://www.einnews.com/pr_news/379071017/power...,0.000000
2017-05-03 11:00:00,http://www.einnews.com/pr_news/379118179/nemas...,-0.092593
2017-05-03 13:30:00,http://www.finanznachrichten.de/nachrichten-20...,0.207469
...,...,...
2023-05-30 22:00:00,https://www.sandiegoreader.com/news/2023/may/3...,-4.336043
2023-05-31 00:30:00,https://www.havasunews.com/nation/could-the-ru...,-3.505911
2023-05-31 12:00:00,https://www.finanznachrichten.de/nachrichten-2...,-0.213447
2023-05-31 13:00:00,https://www.finanznachrichten.de/nachrichten-2...,3.892028


## Combine News & Price Data to Generate Labels

In [6]:
### Combine news and corresponding price data
# Calendar day of every article as a 'YYYY-MM-DD' string; used below as the
# key for looking up that day's price row.
news['date'] = news.index.strftime('%Y-%m-%d').tolist()
def get_log_return(price_df, lag, timestamp):
    """
    Look up a single value in a price table by row label and column label.

    Parameters
    ----------
    price_df : pd.DataFrame
        Table to read from.
    lag : str
        Column label to read. Despite the name it is not restricted to lag
        columns: the caller in this cell also passes 'Close'.
    timestamp : Any
        Row label to look up in ``price_df.index``.

    Returns
    -------
    The value at ``price_df.loc[timestamp, lag]``, or NaN when ``timestamp``
    is not in the index.
    """
    if timestamp not in price_df.index:
        # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
        return np.nan
    return price_df.loc[timestamp, lag]

# Attach the lithium-future close and its lagged log returns for each
# article's day; days without a price row get NaN.
li_future_prices = all_data['li_future']
for col in ['Close', 'lag_1_log_return', 'lag_7_log_return', 'lag_14_log_return']:
    news[col] = news['date'].apply(
        lambda day, column=col: get_log_return(li_future_prices, column, day)
    )


### Generate labels using the given column & thresholds
# thresholds[0]: values strictly below it are 'SELL';
# thresholds[1]: values strictly above it are 'BUY'.
thresholds = [0,0]
def num_to_label(num):
    """
    Map a numeric value to a trading label using the module-level ``thresholds``.

    Parameters
    ----------
    num : float
        Value to classify (the cell applies this to log-return columns).

    Returns
    -------
    'SELL' if ``num < thresholds[0]``, 'BUY' if ``num > thresholds[1]``,
    'HOLD' otherwise. A missing value (NaN/None) is returned as NaN instead
    of a label: NaN fails both comparisons, so without this guard it would
    be classified as 'HOLD'.
    """
    if pd.isna(num):
        return np.nan
    if num < thresholds[0]:
        return 'SELL'
    if num > thresholds[1]:
        return 'BUY'
    return 'HOLD'

# One label column per lag, derived from the matching log-return column.
for lag in [1,7,14]:
    news[f'lag_{lag}_label'] = news[f'lag_{lag}_log_return'].apply(num_to_label)


### Final cleaning & saving data
# Drop the helper 'date' column, keep rows from 2021-01-01 onward, discard
# rows with any missing value, then write the result.
news = (
    news
    .drop(columns=['date'])
    .loc['2021-01-01':]
    .dropna()
)
news.to_csv(f"{cleaned_data_dir}/news_price_cleaned.csv")

## News Text Preprocessing: 
- get news content using newspaper library
- clean content using spaCy, NLTK, etc. Steps include:
    - Remove noise including urls, accented characters, mentions & hashtags, punctuations, and extra whitespaces
    - Convert all content to lowercase
    - Remove stopwords defined by spacy & manual input
    - Remove numbers
    - Lemmatize the text
- Store all text content corresponding to each news article in a large df, removing unfetchable news --> save csv

In [7]:
# NOTE(review): `spacy_nlp` and `url_to_clean_text` are not defined in this
# notebook; presumably they come from `from utils import *` — confirm.
sw_nltk = nltk.corpus.stopwords.words("english")
sw_spacy = spacy_nlp.Defaults.stop_words

# Words removed from the stop-word list (number words and 'not') so they stay in the text.
exclude_stopwords = ['one', 'two', 'three', 'four', 'five', 'six', 'eight', 'nine', 'ten', 'twelve', 'fifteen', 'twenty', 'forty', 'fifty', 'sixty', 'hundred', 'not', ]
# Extra words added to the stop-word list.
include_stopwords = ['email', 'phone', 'contact', 'information', 'link', 'tel']
sw_spacy = [word for word in sw_spacy if word not in exclude_stopwords]
sw_spacy = sw_spacy + include_stopwords
wordnet_lemmatizer = WordNetLemmatizer()
porter_stemmer = PorterStemmer()

# Fetch and clean the text behind every article URL (one call per row).
news_text = news.copy()
news_text['text'] = news_text.loc[:,'url'].apply(lambda x: url_to_clean_text(url=x, stopwords=sw_spacy, lemmatizer=wordnet_lemmatizer, remove_num_or_not=True, language=None))
# Keep rows whose text is neither 1 nor the empty string.
# NOTE(review): 1 is presumably the failure sentinel of url_to_clean_text — confirm in utils.
news_text = news_text[(news_text['text'] != 1) & (news_text['text'] != '')]

display(news_text)
# DataFrame.info() prints its summary and returns None, so it is called
# directly; wrapping it in display() would render an extra "None".
news_text.info()

In [None]:
# Persist the cleaned article text; the relative path resolves against the
# current working directory.
news_text.to_csv(path_or_buf='news_text.csv')