# Assignment 1: Data Collection & Preprocessing

## 1. Setup: Import Libraries and Load Files

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import os
import json

## 2. Define directories for stock prices and tweets

In [2]:
# Directory paths for stock prices and tweets
stock_directory = 'data/price/raw'
tweet_directory = 'data/tweet/raw'

stock_data = {}  # Dictionary to store stock data
tweets_data = {}  # Dictionary to store tweet data

## 3. Load and Preprocess Stock Data

In [3]:
csv_files = [file for file in os.listdir(stock_directory) if file.endswith('.csv')]

for file in csv_files:
    symbol = file.split('.')[0]  # Extract stock symbol from filename
    df = pd.read_csv(os.path.join(stock_directory, file))
    df['Date'] = pd.to_datetime(df['Date'])
    df.fillna(method='ffill', inplace=True)  # Forward fill missing values
    stock_data[symbol] = df

In [4]:
stock_data['GOOG'].head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2012-09-04,340.996857,341.221008,335.492493,339.248413,339.248413,3793200
1,2012-09-05,338.730347,341.968231,338.301971,339.08902,339.08902,3429100
2,2012-09-06,341.699219,348.638214,341.086517,348.394135,348.394135,6109700
3,2012-09-07,348.693024,354.795135,347.532349,351.756531,351.756531,6490100
4,2012-09-10,353.554779,355.074097,347.891022,349.076569,349.076569,5139100


## 4. Load and Preprocess Tweets Data

In [5]:
tweet_threshold = 635  # Minimum tweets required for a company

for stock_folder in os.listdir(tweet_directory):
    stock_path = os.path.join(tweet_directory, stock_folder)
    
    if os.path.isdir(stock_path):
        all_tweets = []
        count = 0  # Count of tweet files for each stock

        for tweet_file in os.listdir(stock_path):
            file_path = os.path.join(stock_path, tweet_file)

            if os.path.isfile(file_path):
                count += 1
                with open(file_path, 'r', encoding='utf-8') as f:
                    for line in f:
                        try:
                            tweet_json = json.loads(line.strip())
                            tweet_data = {
                                'Date': pd.to_datetime(tweet_json['created_at']),
                                'Text': tweet_json['text'],
                                'User': tweet_json['user']['screen_name'],
                                'Followers': tweet_json['user']['followers_count'],
                                'Friends': tweet_json['user']['friends_count']
                            }
                            all_tweets.append(tweet_data)
                        except json.JSONDecodeError:
                            print(f"Error decoding JSON in file {file_path}")

        if count >= tweet_threshold:  # Store data only if threshold met
            tweets_data[stock_folder] = pd.DataFrame(all_tweets)

In [6]:
len(tweets_data), list(tweets_data.keys())

(10, ['FB', 'GOOG', 'D', 'BAC', 'AMZN', 'INTC', 'T', 'MSFT', 'AAPL', 'C'])

## 5. Align Dates in Stock and Tweet Data

* Grouping tweets by date and restructure them

In [7]:
for stock, df in tweets_data.items():
    df['Date'] = pd.to_datetime(df['Date']).dt.date  # Convert to date only
    grouped = df.groupby('Date').agg({
        'Text': list,
        'User': list,
        'Followers': list,
        'Friends': list
    }).reset_index()
    tweets_data[stock] = grouped

* Filtering stock data to match tweet data companies

In [8]:
stock_data = {symbol: df for symbol, df in stock_data.items() if symbol in tweets_data}
print(f"Number of companies with both stock and tweet data: {len(stock_data)}")

Number of companies with both stock and tweet data: 10


## Merge Stock and Tweet Data for a Company

* Below is the example of merging stock and tweet data for one company (GOOG)

In [9]:
stock_data['FB']['Date'] = pd.to_datetime(stock_data['FB']['Date'])
tweets_data['FB']['Date'] = pd.to_datetime(tweets_data['FB']['Date'])

merged_data = stock_data['FB'].merge(tweets_data['FB'], on='Date', how='left')
merged_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Text,User,Followers,Friends
0,2012-09-04,18.08,18.27,17.549999,17.73,17.73,46622400,,,,
1,2012-09-05,18.27,18.75,18.18,18.58,18.58,60781800,,,,
2,2012-09-06,18.74,19.26,18.719999,18.959999,18.959999,46066500,,,,
3,2012-09-07,19.1,19.42,18.780001,18.98,18.98,36371700,,,,
4,2012-09-10,19.059999,19.200001,18.549999,18.809999,18.809999,24797800,,,,


## Apply the Same Merging Process for All Companies

In [10]:
for stock in stock_data.keys():
    if stock in tweets_data:
        stock_data[stock]['Date'] = pd.to_datetime(stock_data[stock]['Date'])
        tweets_data[stock]['Date'] = pd.to_datetime(tweets_data[stock]['Date'])

        stock_data[stock] = stock_data[stock].merge(tweets_data[stock], on='Date', how='left')

* Displaying filtered merged data for GOOG (if it has tweet data)

In [11]:
filtered_data = stock_data['AMZN'][stock_data['AMZN']['Text'].notna()]
filtered_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Text,User,Followers,Friends
333,2014-01-02,398.799988,399.359985,394.019989,397.970001,397.970001,2137800,[RT @MadKindlePromos: Several incredible novel...,"[JohnRosePutnam, waheedfaizi]","[881, 89]","[860, 210]"
334,2014-01-03,398.290009,402.709991,396.220001,396.440002,396.440002,2210200,"[The first employees at Google, Amazon and Sub...","[AlvaroConnell, CNBCJosh]","[1, 2668]","[0, 279]"
335,2014-01-06,395.850006,397.0,388.420013,393.630005,393.630005,3170600,[RT @SteveTappin: 9 Bar Charts: Apple vs. Amaz...,"[WorldOfCEOs, GavinGreenberg, InvestEdInc]","[250614, 66, 261]","[39119, 0, 328]"
336,2014-01-07,395.040009,398.470001,394.290009,398.029999,398.029999,1916000,"[If $AMZN starts accepting #bitcoin, #bitcoin ...","[insidemarkets, mjwmonty, QP_Service, QP_Service]","[68, 50, 23, 24]","[135, 90, 1, 1]"
337,2014-01-08,398.470001,403.0,396.040009,401.920013,401.920013,2316500,[RT @WSJ: A look inside Amazon's rigorous hiri...,"[FAnjum1, GavinGreenberg, fin_vestor, fuz1on, ...","[83, 70, 249, 402, 485, 58, 140, 9]","[118, 0, 0, 2001, 169, 536, 222, 0]"


## Displaying Basic Info of filtered_data

In [12]:
filtered_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 529 entries, 333 to 897
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       529 non-null    datetime64[ns]
 1   Open       529 non-null    float64       
 2   High       529 non-null    float64       
 3   Low        529 non-null    float64       
 4   Close      529 non-null    float64       
 5   Adj Close  529 non-null    float64       
 6   Volume     529 non-null    int64         
 7   Text       529 non-null    object        
 8   User       529 non-null    object        
 9   Followers  529 non-null    object        
 10  Friends    529 non-null    object        
dtypes: datetime64[ns](1), float64(5), int64(1), object(4)
memory usage: 49.6+ KB


In [13]:
filtered_data.describe()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
count,529,529.0,529.0,529.0,529.0,529.0,529.0
mean,2015-02-15 22:43:46.843100160,424.825047,429.493422,419.34949,424.561891,424.561891,4157928.0
min,2014-01-02 00:00:00,284.399994,290.420013,284.0,286.950012,286.950012,1091200.0
25%,2014-07-18 00:00:00,327.799988,331.720001,323.269989,327.23999,327.23999,2679500.0
50%,2015-02-24 00:00:00,378.410004,383.0,375.839996,379.0,379.0,3600100.0
75%,2015-09-10 00:00:00,527.650024,532.599976,519.219971,526.030029,526.030029,4674500.0
max,2016-03-31 00:00:00,691.890015,696.440002,686.380005,693.969971,693.969971,23856100.0
std,,114.944392,116.202614,113.041267,114.735843,114.735843,2575615.0


In [14]:
filtered_data.isnull().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
Text         0
User         0
Followers    0
Friends      0
dtype: int64

## Summary

In [15]:
print("Total companies with merged stock and tweet data:", len(stock_data))
print("Companies:", list(stock_data.keys()))

Total companies with merged stock and tweet data: 10
Companies: ['GOOG', 'BAC', 'AMZN', 'MSFT', 'T', 'D', 'FB', 'AAPL', 'C', 'INTC']


# Assignment 2: Further Data Preprocessing and Initial Model Deployment

* We will implement LSTM, and as of now, we will ignore tweets and some other columns.

In [16]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
# import matplotlib.pyplot as plt

# Step 1: Preprocessing
# Drop unnecessary columns
data = filtered_data.drop(['Text', 'User', 'Followers', 'Friends'], axis=1)

# Sort by date and set 'Date' as index (optional)
data = data.sort_values('Date').set_index('Date')

# Feature scaling using MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data)

In [17]:
# Step 2: Prepare the data for LSTM
def create_sequences(data, time_step=60):
    X, y = [], []
    for i in range(time_step, len(data)):
        X.append(data[i-time_step:i, :])  # 60 timesteps of input
        y.append(data[i, 3])  # Target: 'Close' column at index 3
    return np.array(X), np.array(y)

# Create sequences with a window size of 60
time_step = 60
X, y = create_sequences(scaled_data, time_step)

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Convert data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
y_test = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

In [18]:
# Step 3: Define the LSTM Model
class StockLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(StockLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h_0, c_0))
        out = self.fc(out[:, -1, :])  # Get output from the last time step
        return out

# Initialize model, loss function, and optimizer
input_size = X_train.shape[2]  # Number of features (7 in our case)
hidden_size = 50
num_layers = 2
output_size = 1

model = StockLSTM(input_size, hidden_size, num_layers, output_size)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [19]:
# Step 4: Train the model
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [10/100], Loss: 0.0336
Epoch [20/100], Loss: 0.0196
Epoch [30/100], Loss: 0.0038
Epoch [40/100], Loss: 0.0036
Epoch [50/100], Loss: 0.0018
Epoch [60/100], Loss: 0.0018
Epoch [70/100], Loss: 0.0015
Epoch [80/100], Loss: 0.0014
Epoch [90/100], Loss: 0.0014
Epoch [100/100], Loss: 0.0013


In [20]:
# Step 5: Evaluate the model
model.eval()
with torch.no_grad():
    predictions = model(X_test).cpu().numpy()

# Create a placeholder array with the same number of columns as the original data (7)
# Populate only the 'Close' column (index 3) with predictions
predictions_padded = np.zeros((predictions.shape[0], scaled_data.shape[1]))
predictions_padded[:, 3] = predictions.flatten()  # Index 3 is 'Close'

y_test_padded = np.zeros((y_test.shape[0], scaled_data.shape[1]))
y_test_padded[:, 3] = y_test.numpy().flatten()

# Inverse transform to get back to the original scale
predictions_rescaled = scaler.inverse_transform(predictions_padded)[:, 3]
y_test_rescaled = scaler.inverse_transform(y_test_padded)[:, 3]

In [21]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 6: Calculate accuracy metrics
mae = mean_absolute_error(y_test_rescaled, predictions_rescaled)
mse = mean_squared_error(y_test_rescaled, predictions_rescaled)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_rescaled, predictions_rescaled)

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

Mean Absolute Error (MAE): 21.1455
Mean Squared Error (MSE): 715.7555
Root Mean Squared Error (RMSE): 26.7536
R² Score: 0.7725


* As we have ten companies data we will work on, but as of now, we had implemented model on only one company data (AMZN)
* Also, as of now we have just used stock data for analysis, we havn't used tweet sentiments, we'll also make use of tweet sentiments in our model later on
* Also, we have just implemented the model, the hyperparameter tuning and other validation and testing steps we'll do later.