In [1]:
# Importing Dependencies
import csv
import os
from functools import reduce
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import LSTM, Dense, Dropout
from keras.models import Sequential
from sklearn.preprocessing import StandardScaler

In [3]:
# Datasets shared by every company, read in the order listed below.
(
    historical_sector_performance,
    sector_pe_ratio,
    inflation_rates_data,
    treasury_rates_data,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "market_performance/historical_sector_performance.csv",
        "market_performance/sector_pe_ratio.csv",
        "inflation_rates_data.csv",
        "treasury_rates_data.csv",
    )
)

# Commodity names; each maps to "commodity_data/<name>_commodity_data.csv"
commodity_names = ['Palladium', 'Copper', 'Lithium', 'Silver', 'Gold']
# Forex names; each maps to "forex_data/<name>_forex_data.csv"
forex_names = ['CADUSD', 'CHFUSD', 'CNHUSD', 'KRWUSD', 'EURUSD', 'GBPUSD', 'JPYUSD']

# Loaded frames, keyed "<name>_commodity_data" and "<name>_forex_data"
commodity_data = {}
forex_data = {}

# Best-effort load: a file that cannot be read is reported and skipped,
# so the dictionaries may end up with fewer entries than the name lists.
for commodity in commodity_names:
    try:
        commodity_frame = pd.read_csv(f"commodity_data/{commodity}_commodity_data.csv")
    except Exception as load_error:
        print(f"Error loading {commodity}: {load_error}")
    else:
        commodity_data[f"{commodity}_commodity_data"] = commodity_frame

for forex in forex_names:
    try:
        forex_frame = pd.read_csv(f"forex_data/{forex}_forex_data.csv")
    except Exception as load_error:
        print(f"Error loading {forex}: {load_error}")
    else:
        forex_data[f"{forex}_forex_data"] = forex_frame



# Per-company inputs: stock prices, merged technical indicators and news data.
company_list = ['AAPL', 'AMZN', 'GOOG', 'META', 'MSFT', 'NVDA']

# Each dictionary is keyed "<ticker>_<dataset name>".
company_stock_prices = {}
company_technical_indicators = {}
company_complete_news_data = {}

for company in company_list:
    try:
        prices = pd.read_csv(f"Stock_Price_Data/{company}_stock_prices.csv")
        company_stock_prices[f"{company}_stock_prices"] = prices

        indicators = pd.read_csv(f"technical_indicators/Merged_Technical_Indicators/{company}_Technical_Indicators.csv")
        # Give the indicators' 'volume' column a distinct name before the frames are joined later.
        company_technical_indicators[f"{company}_technical_indicators"] = indicators.rename(
            columns={'volume' : 'tech_ind_traded_volume'}
        )

        news = pd.read_csv(f"complete_news_data/{company}_complete_news_data.csv")
        company_complete_news_data[f"{company}_complete_news_data"] = news
    except Exception as e:
        # Best-effort: report the failure and move on. Frames stored before the
        # failing read stay in their dictionaries.
        print(f"Error loading {company}: {e}")

# Preview the loaded AAPL news frame; as the cell's last expression it is rendered as the output.
# Raises KeyError if the AAPL load in the loop above failed, because that loop only prints errors.
company_complete_news_data['AAPL_complete_news_data']

Unnamed: 0,date,AAPL_stock_news_sentiment,AAPL_press_release_sentiment,AAPL_twitter_social_sentiment
0,01-01-2020,weakly_positive,,
1,01-01-2021,weakly_positive,,
2,01-01-2022,weakly_positive,,
3,01-01-2023,weakly_positive,,
4,01-01-2024,weakly_positive,,
...,...,...,...,...
2428,31-12-2020,weakly_positive,,
2429,31-12-2021,neutral,,
2430,31-12-2022,weakly_positive,,
2431,31-12-2023,weakly_positive,,


In [4]:
# Boolean mask: True for rows that exactly repeat an earlier row of the sector-performance frame.
duplicates = historical_sector_performance.duplicated()

# Header line followed by the True/False tallies.
print("Duplicate rows in training data:", duplicates.value_counts(), sep="\n")

Duplicate rows in training data:
False    5131
Name: count, dtype: int64


In [5]:
# Remove exact duplicate rows from the sector-performance frame (modifies the frame in place).
historical_sector_performance.drop_duplicates(inplace=True)

remaining_rows = len(historical_sector_performance)
print('Duplicates on historical sector data removed. Number of rows remaining:', remaining_rows)

Duplicates on historical sector data removed. Number of rows remaining: 5131


In [6]:
# Join every shared and per-company dataset into one frame per company.
company_symbol_list = ["AAPL", "AMZN", "GOOG", "MSFT", "META", "NVDA"]

datasets_lists = {}   # "<ticker>_full_dataset" -> ordered list of frames to join
full_datasets = {}    # "<ticker>_raw_complete_data" -> joined frame


def _left_join_on_date(left, right):
    """Left-join ``right`` onto ``left`` using their shared 'date' column."""
    return pd.merge(left, right, on='date', how='left')


for company_symbol in company_symbol_list:
    # The stock-price frame comes first, so the left joins keep all of its rows.
    frames_to_join = [
        company_stock_prices[f'{company_symbol}_stock_prices'],
        sector_pe_ratio,
        historical_sector_performance,
        treasury_rates_data,
        inflation_rates_data,
        *commodity_data.values(),
        *forex_data.values(),
        company_technical_indicators[f'{company_symbol}_technical_indicators'],
        company_complete_news_data[f'{company_symbol}_complete_news_data'],
    ]
    datasets_lists[f"{company_symbol}_full_dataset"] = frames_to_join

    # Fold the list left to right into a single frame.
    full_datasets[f"{company_symbol}_raw_complete_data"] = reduce(_left_join_on_date, frames_to_join)


# Nvidia stock prices left-joined with Nvidia news data on 'date'. This frame is
# added as extra features to every other company's dataset.
nvidia_data_to_add = company_stock_prices['NVDA_stock_prices'].merge(
    company_complete_news_data['NVDA_complete_news_data'], on='date', how='left'
)

# DataFrame.to_csv does not create missing parent directories, so make sure the
# export folder exists before writing (needed on a fresh checkout).
os.makedirs("full_complete_datasets", exist_ok=True)

# Add the Nvidia features to every dataset except Nvidia's own, then export each one.
for company_symbol in company_symbol_list:
    dataset_key = f"{company_symbol}_raw_complete_data"

    # Nvidia's dataset was built from these same two frames, so it is exported unchanged.
    if company_symbol != "NVDA":
        # NOTE(review): any non-'date' column name present in both frames gets
        # pandas' default '_x'/'_y' suffixes. Confirm downstream code expects that.
        full_datasets[dataset_key] = full_datasets[dataset_key].merge(
            nvidia_data_to_add, on='date', how='left'
        )

    # Writing all company datasets to csv after organising data
    full_datasets[dataset_key].to_csv(f"full_complete_datasets/{dataset_key}.csv", index=False)
    print(f"Finished collecting and organising data for {company_symbol}.")

Finished collecting and organising data for AAPL.
Finished collecting and organising data for AMZN.
Finished collecting and organising data for GOOG.
Finished collecting and organising data for MSFT.
Finished collecting and organising data for META.
Finished collecting and organising data for NVDA.
