In [1]:
import pandas as pd
import numpy as np
import ast
import re

### Data Cleaning and Preprocessing
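- Selects only the required columns
- Drops records containing NaN values and duplicate records
- Extracts the relevance and sentiment scores for the ticker in question only
- Extracts each news record's topics and their relevance scores into one column per topic
- Cleans the title and summary text of each news record (lowercased, punctuation removed, whitespace collapsed)
- Sets the timestamp as the index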

In [2]:
# Registry of every topic name seen so far. clean_data appends to it and
# add_topic_relevance reads it, so it keeps growing across clean_data calls
# until this cell is re-run.
all_topics = []

# Raw input columns kept by clean_data; every other input column is discarded.
_REQUIRED_COLUMNS = [
    'title', 'time_published', 'summary', 'topics',
    'overall_sentiment_score', 'ticker_sentiment',
]


def _parse_literal(value):
    """Parse `value` with ast.literal_eval if it is a string; otherwise return it unchanged."""
    if isinstance(value, str):
        return ast.literal_eval(value)
    return value


def _ticker_scores(ticker_row, ticker):
    """Return (relevance, sentiment) of the first entry for `ticker`, or (0.0, 0.0) if absent."""
    for ticker_info in _parse_literal(ticker_row):
        if ticker_info['ticker'] == ticker:
            relevance = float(ticker_info['relevance_score'])
            sentiment = float(ticker_info['ticker_sentiment_score'])
            return relevance, sentiment
    return 0.0, 0.0


def _topic_weights(topics_row):
    """Return {topic: relevance} for every topic in all_topics (0.0 when the row lacks it)."""
    topic_weights = {topic: 0.0 for topic in all_topics}
    for topic_info in _parse_literal(topics_row):
        topic_name = topic_info['topic']
        relevance = float(topic_info['relevance_score'])
        # Topics that were never registered in all_topics are ignored.
        if topic_name in topic_weights:
            topic_weights[topic_name] = relevance
    return topic_weights


def clean_data(df, ticker='AAPL'):
    """Clean and preprocess one raw news DataFrame.

    Steps: keep the required columns, register the topics seen, drop rows
    containing NaN, extract the relevance/sentiment of `ticker`, expand the
    topics into one `topic_<name>` column each, reformat the publication time
    as 'YYYY-MM-DD HH:MM:SS', drop duplicates, clean the title/summary text
    and index the result by `timestamp`.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw news data containing at least the columns 'title',
        'time_published', 'summary', 'topics', 'overall_sentiment_score' and
        'ticker_sentiment'. `df` itself is not modified.
    ticker : str, default 'AAPL'
        Symbol whose entry is extracted from each `ticker_sentiment` cell. It
        is compared for exact equality with the entry's 'ticker' value, so it
        must be spelled exactly as in the data. The default is the value that
        used to be hard-coded in add_ticker_info.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'timestamp' (string), with columns 'title', 'summary',
        'overall_sentiment_score', 'ticker_sentiment', 'ticker_relevance' and
        one 'topic_<name>' column per entry of `all_topics`.

    Raises
    ------
    KeyError
        If `df` lacks any of the required raw columns.

    Notes
    -----
    Topics are appended to the module-level `all_topics` list, so the set of
    topic columns depends on which frames were cleaned earlier in the session.
    """
    missing = [column for column in _REQUIRED_COLUMNS if column not in df.columns]
    if missing:
        raise KeyError(
            f"clean_data: input is missing required column(s) {missing}; "
            "has this dataset already been cleaned?"
        )

    copy_df = df[_REQUIRED_COLUMNS].copy()

    # Register every topic present in the raw data (NaN cells are skipped so
    # they cannot break the parsing).
    for topics_cell in copy_df['topics'].dropna():
        for topic_dict in _parse_literal(topics_cell):
            if topic_dict['topic'] not in all_topics:
                all_topics.append(topic_dict['topic'])

    # Drop incomplete records *before* parsing: a NaN in 'topics' or
    # 'ticker_sentiment' cannot be parsed or iterated.
    copy_df = copy_df.dropna()

    # Build each derived frame once from plain Python rows instead of creating
    # one pandas Series per record.
    ticker_df = pd.DataFrame(
        [_ticker_scores(cell, ticker) for cell in copy_df['ticker_sentiment']],
        columns=['ticker_relevance', 'ticker_sentiment'],
        index=copy_df.index,
        dtype=float,
    )
    # 'ticker_relevance' is appended; 'ticker_sentiment' is overwritten in place.
    copy_df['ticker_relevance'] = ticker_df['ticker_relevance'].to_numpy()
    copy_df['ticker_sentiment'] = ticker_df['ticker_sentiment'].to_numpy()

    weighted_df = pd.DataFrame(
        [_topic_weights(cell) for cell in copy_df['topics']],
        index=copy_df.index,
        columns=list(all_topics),
        dtype=float,
    ).add_prefix('topic_')
    copy_df = pd.concat([copy_df, weighted_df], axis=1)
    copy_df = copy_df.drop(columns=['topics'])

    published = pd.to_datetime(copy_df['time_published'], format='%Y%m%dT%H%M%S')
    copy_df['time_published'] = published.dt.strftime('%Y-%m-%d %H:%M:%S')
    copy_df = copy_df.rename(columns={'time_published': 'timestamp'})
    copy_df = copy_df.drop_duplicates()
    copy_df = copy_df.dropna()
    copy_df['title'] = copy_df['title'].apply(clean_text)
    copy_df['summary'] = copy_df['summary'].apply(clean_text)
    copy_df = copy_df.set_index('timestamp')
    return copy_df


def add_topic_relevance(topics_row):
    """Return a Series mapping every topic in `all_topics` to its relevance in this record.

    `topics_row` is either a list of dicts with 'topic' and 'relevance_score'
    keys or the string form of such a list. Topics in `all_topics` that the
    record does not mention get 0.0; topics not in `all_topics` are ignored.
    """
    return pd.Series(_topic_weights(topics_row), dtype=float)


def add_ticker_info(ticker_row, ticker='AAPL'):
    """Return Series([relevance, sentiment]) for `ticker` in one `ticker_sentiment` cell.

    `ticker_row` is either a list of dicts with 'ticker', 'relevance_score'
    and 'ticker_sentiment_score' keys or the string form of such a list. The
    first entry whose 'ticker' equals `ticker` is used; if there is none the
    result is Series([0.0, 0.0]).
    """
    return pd.Series(list(_ticker_scores(ticker_row, ticker)))


def clean_text(text):
    """Normalise free text: lowercase, strip punctuation, collapse whitespace.

    Null values (as judged by pd.isnull) become the empty string.
    """
    if pd.isnull(text):
        return ""
    text = text.lower()
    # Remove everything that is neither a word character nor whitespace.
    text = re.sub(r"[^\w\s]", "", text)
    # Collapse whitespace runs to a single space and trim both ends.
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [None]:
# Load the raw news CSV of each of the five assets (three under stocks/, two under crypto/)
aapl_df, tsla_df, amzn_df, btc_df, eth_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './datasets/stocks/news/AAPL_news.csv',
        './datasets/stocks/news/TSLA_news.csv',
        './datasets/stocks/news/AMZN_news.csv',
        './datasets/crypto/news/BTC_news.csv',
        './datasets/crypto/news/ETH_news.csv',
    )
)

In [None]:
# Clean all five datasets, replacing each raw frame with its cleaned version.
# The order is significant: clean_data appends newly seen topics to the shared
# all_topics list, so later frames also get columns for topics found earlier.
# NOTE(review): every frame is cleaned with the same call, so the ticker
# relevance/sentiment columns are taken from 'AAPL' entries for all five
# assets — confirm this is intended for the non-AAPL datasets.
aapl_df, tsla_df, amzn_df, btc_df, eth_df = [
    clean_data(raw_news)
    for raw_news in (aapl_df, tsla_df, amzn_df, btc_df, eth_df)
]

### Store Cleaned Datasets

The original news CSV files in the news directories are overwritten in place with the cleaned and preprocessed data.

In [5]:
import os

# Make sure both output directories exist before writing.
# The loop variable is deliberately not called `dir`: at notebook top level
# that name would shadow the builtin dir() for the rest of the session.
directories = ["./datasets/stocks/news", "./datasets/crypto/news"]
for news_dir in directories:
    os.makedirs(news_dir, exist_ok=True)

# Write each cleaned frame (indexed by timestamp) over the CSV it was read from.
# NOTE: after this cell has run the files no longer hold the raw columns
# ('time_published', 'topics') that clean_data selects, so re-running the
# load and clean cells requires restoring the raw files first.
aapl_df.to_csv("datasets/stocks/news/AAPL_news.csv")
tsla_df.to_csv("datasets/stocks/news/TSLA_news.csv")
amzn_df.to_csv("datasets/stocks/news/AMZN_news.csv")
eth_df.to_csv("datasets/crypto/news/ETH_news.csv")
btc_df.to_csv("datasets/crypto/news/BTC_news.csv")