## Sentiment Analysis in Finance

### Logistic Regression Classification Model: Alpha Vantage Dataset

##### Team 103: Anna Brunkhorst, Nader Lobandi, Ashish Magadum

Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
import re
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



Check for necessary folders & files

In [2]:
import os
import sys

# Input files the rest of the notebook reads from the ./data directory
required_files = [
    './data/alpha_vantage_train.csv',
    './data/alpha_vantage_test.csv',
    './data/glove.6B.100d.txt'
]

# Without the ./data folder nothing below can run, so stop immediately
if not os.path.exists('./data'):
    print("STOP: ./data folder missing from this directory.")
    sys.exit(1)

# Report every absent file (not only the first one) before stopping
absent_paths = [path for path in required_files if not os.path.exists(path)]
for path in absent_paths:
    print(f"STOP: {os.path.basename(path)} data file missing from ./data folder.")

missing_files = len(absent_paths) > 0
if missing_files:
    sys.exit(1)

print("Necessary data files found in ./data.")

# Make sure the ./models output folder exists, creating it when absent
models_path = './models'
if os.path.exists(models_path):
    print("./models folder found.")
else:
    os.makedirs(models_path)
    print("./models folder created!")

Necessary data files found in ./data.
./models folder found.


Define data preprocessing function

In [3]:
def preprocess_data(file_path):
    """Read a CSV with `text` and `label` columns and add lemmatized text.

    Every row of the `text` column is stringified, stripped of everything
    except ASCII letters and apostrophes, lower-cased, and run through the
    spaCy `en_core_web_sm` pipeline (NER and parser disabled). Alphabetic,
    non-stop-word tokens are replaced by their lemmas and joined with spaces.

    Returns a DataFrame holding only the `cleaned_text` and `label` columns.
    """
    frame = pd.read_csv(file_path)
    pipeline = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

    def lemmatize(parsed_doc):
        # Keep the lemma of each alphabetic token that is not a stop word
        kept = []
        for tok in parsed_doc:
            if tok.is_alpha and not tok.is_stop:
                kept.append(tok.lemma_)
        return ' '.join(kept)

    def normalised_rows():
        # Lazily yield each raw text with non-letter runs collapsed to one space
        for raw in frame['text']:
            yield re.sub("[^A-Za-z']+", ' ', str(raw)).lower()

    parsed_docs = pipeline.pipe(normalised_rows(), batch_size=2500, n_process=-1)
    frame['cleaned_text'] = [lemmatize(parsed) for parsed in parsed_docs]

    return frame[['cleaned_text', 'label']]

Preprocess and save data

In [4]:
# Preprocess training and testing data separately
train_data_processed = preprocess_data('./data/alpha_vantage_train.csv')
print("Training data preprocessed!")
test_data_processed = preprocess_data('./data/alpha_vantage_test.csv')
print("Testing data preprocessed!")

# Save preprocessed data to new CSV files
train_data_processed.to_csv('./data/alpha_vantage_train_processed.csv', index=False)
test_data_processed.to_csv('./data/alpha_vantage_test_processed.csv', index=False)
print("Preprocessed data saved.")

Training data preprocessed!
Testing data preprocessed!
Preprocessed data saved.


Function to load the GloVe model from a file of embeddings

In [5]:
def load_glove_model(glove_file):
    """Load word embeddings from a GloVe-style text file.

    Each non-blank line is split on whitespace: the first field is the word
    and the remaining fields are parsed as the float components of its vector.

    Parameters:
        glove_file: path to a UTF-8 encoded embeddings text file.

    Returns:
        dict mapping each word to a 1-D numpy array of floats.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    model = {}
    with open(glove_file, 'r', encoding='utf8') as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # Blank or whitespace-only line (e.g. a trailing newline at the
                # end of the file): skip it instead of failing on split_line[0]
                continue
            word = split_line[0]
            model[word] = np.array([float(val) for val in split_line[1:]])
    return model

Function to process a text document to produce a single vector representation

In [6]:
def document_vector(doc, glove_model, dim=None):
    """Average the word vectors of a document into a single vector.

    Parameters:
        doc: whitespace-separated text. Any non-string value (for example the
            NaN that pandas produces for an empty CSV cell) is treated as an
            empty document.
        glove_model: dict mapping words to equal-length 1-D vectors.
        dim: vector width used for unknown words and empty documents. When
            omitted it is taken from the first vector in `glove_model`,
            falling back to 100 if the model is empty.

    Returns:
        1-D numpy array of length `dim`: the mean of the word vectors, with
        out-of-vocabulary words counted as zero vectors; all zeros when the
        document has no words.
    """
    if dim is None:
        # Infer the width from the model so unknown-word zero vectors always
        # match the real embeddings (a fixed 100 breaks for other widths)
        sample = next(iter(glove_model.values()), None)
        dim = len(sample) if sample is not None else 100

    zero_vector = np.zeros(dim)
    if not isinstance(doc, str):
        return zero_vector

    words = doc.split()
    if not words:
        return zero_vector

    word_vectors = [glove_model.get(word, zero_vector) for word in words]
    return np.mean(word_vectors, axis=0)

Function to load and vectorize data from a given file

In [7]:
def load_and_vectorize_data(file_path, embeddings=None):
    """Load a preprocessed CSV and turn each document into a feature vector.

    Parameters:
        file_path: CSV with `cleaned_text` and `label` columns.
        embeddings: word-to-vector dict passed to `document_vector`. When
            omitted, the notebook-level `glove_model` is used, so existing
            single-argument calls keep working.

    Returns:
        (X, y): X is an array with one document vector per row; y holds the
        values of the `label` column.
    """
    if embeddings is None:
        # Fallback to the module-level model; it must be defined before the call
        embeddings = glove_model

    data = pd.read_csv(file_path)
    # An empty cleaned text is written to CSV as an empty cell, which pandas
    # reads back as NaN; restore it to '' so it is vectorised as an empty doc
    cleaned = data['cleaned_text'].fillna('').astype(str)
    X = np.array([document_vector(doc, embeddings) for doc in cleaned])
    y = data['label'].values
    return X, y

Load the GloVe model, then the training and test data

In [8]:
# Load the GloVe embeddings first: load_and_vectorize_data reads the
# module-level `glove_model` defined here.
glove_model = load_glove_model('./data/glove.6B.100d.txt')

# Turn the preprocessed train/test CSVs into feature matrices and label arrays
X_train, y_train = load_and_vectorize_data('./data/alpha_vantage_train_processed.csv')
X_test, y_test = load_and_vectorize_data('./data/alpha_vantage_test_processed.csv')

Running model

In [9]:
# Fit a logistic regression classifier on the training document vectors
# (max_iter=1000 sets the solver's iteration limit)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict labels for the test documents
y_pred = model.predict(X_test)

Print classification report

In [10]:
# Compare test-set predictions with the true labels and print the text report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.33      0.01      0.02        94
           1       0.57      0.35      0.44      1089
           2       0.55      0.66      0.60      3004
           3       0.58      0.63      0.60      2492
           4       0.54      0.19      0.28       473

    accuracy                           0.56      7152
   macro avg       0.51      0.37      0.39      7152
weighted avg       0.56      0.56      0.55      7152



Save report and model in ./models

In [11]:
# Recompute the report as a dict so it can be tabulated and written to CSV
report = classification_report(y_test, y_pred, output_dict=True)
# Transpose so that each report entry becomes a row
df_report = pd.DataFrame(report).transpose()
df_report.to_csv('./models/alpha_vantage_log_reg_report.csv', index=True)

# Persist the fitted classifier next to the report in ./models
joblib.dump(model, './models/alpha_vantage_log_reg.pkl')

['./models/alpha_vantage_log_reg.pkl']

### Unit Tests

In [12]:
import unittest
import os

class TestSentimentAnalysis(unittest.TestCase):
    """Checks for preprocess_data, load_glove_model and document_vector.

    Temporary fixture files are removed through addCleanup, so they are
    deleted even when the function under test raises or an assertion fails.
    """

    def test_preprocess_data(self):
        """ Test the preprocessing function to ensure it outputs the expected format """
        sample_path = 'sample_data.csv'
        sample_data = pd.DataFrame({
            'text': ['This is a sample!'],
            'label': [1]
        })
        sample_data.to_csv(sample_path, index=False)
        # Registered right after the file exists so it never leaks on failure
        self.addCleanup(os.remove, sample_path)

        processed_data = preprocess_data(sample_path)

        self.assertIn('cleaned_text', processed_data.columns)
        self.assertIn('label', processed_data.columns)
        self.assertEqual(processed_data['label'].iloc[0], 1)
        self.assertIsInstance(processed_data['cleaned_text'].iloc[0], str)

    def test_load_glove_model(self):
        """ Test that the GloVe model loads correctly and vector dimensions are right """
        glove_sample = './data/glove_sample.txt'
        # Write with the same encoding that load_glove_model reads with
        with open(glove_sample, 'w', encoding='utf8') as f:
            f.write('hello 0.1 0.2 0.3 0.4\nworld 0.5 0.6 0.7 0.8\n')
        self.addCleanup(os.remove, glove_sample)

        glove = load_glove_model(glove_sample)

        self.assertIn('hello', glove)
        self.assertIn('world', glove)
        self.assertEqual(len(glove['hello']), 4)
        self.assertTrue(np.array_equal(glove['hello'], np.array([0.1, 0.2, 0.3, 0.4])))

    def test_document_vector(self):
        """ Test the document vector function produces correct output """
        glove_sample = {'test': np.array([1, 1, 1, 1])}
        doc = 'test test test'
        result_vector = document_vector(doc, glove_sample)

        # Every word is in the sample model, so the mean equals that one vector
        self.assertEqual(result_vector.shape[0], 4)
        self.assertTrue(np.array_equal(result_vector, np.array([1, 1, 1, 1])))

# Run the tests
# argv=[''] stops unittest from parsing the kernel's own command-line
# arguments; exit=False keeps it from calling sys.exit() when the run ends.
unittest.main(argv=[''], verbosity=2, exit=False)

test_document_vector (__main__.TestSentimentAnalysis.test_document_vector)
Test the document vector function produces correct output ... ok
test_load_glove_model (__main__.TestSentimentAnalysis.test_load_glove_model)
Test that the GloVe model loads correctly and vector dimensions are right ... ok
test_preprocess_data (__main__.TestSentimentAnalysis.test_preprocess_data)
Test the preprocessing function to ensure it outputs the expected format ... ok

----------------------------------------------------------------------
Ran 3 tests in 33.875s

OK


<unittest.main.TestProgram at 0x15718c8e7e0>