## Sentiment Analysis in Finance

### Logistic Regression Classification Model: Yahoo Dataset

##### Team 103: Anna Brunkhorst, Nader Lobandi, Ashish Magadum

#### Importing Necessary Libraries

In [1]:
# Importing necessary libraries
!pip install datasets
from datasets import load_dataset
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

!pip install gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# Download NLTK resources
import nltk

# 'punkt_tab' is the tokenizer data that word_tokenize looks up on recent NLTK
# releases (3.9+); 'punkt' is kept for older releases. Downloading both keeps
# the notebook runnable regardless of which NLTK version is installed.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)




[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abrun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abrun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abrun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Check for Necessary Folders & Files

In [2]:
import os
import sys

# The pre-trained Word2Vec embeddings must already be on disk; without them
# nothing further in the notebook can run, so bail out early.
word2vec_path = './data/GoogleNews-vectors-negative300.bin.gz'
if os.path.exists(word2vec_path):
    print("Necessary data found in ./data.")
else:
    print("STOP: Please ensure existence of ./data folder in this directory, and place necessary Word2Vec embedding file 'GoogleNews-vectors-negative300.bin.gz' into the data folder before running.")
    sys.exit(1)  # Halt here instead of failing later in the notebook

# Trained models and reports are written to ./models; create it on first run
models_path = './models'
if os.path.exists(models_path):
    print("./models folder found.")
else:
    os.makedirs(models_path)
    print("./models folder created!")

Necessary data found in ./data.
./models folder found.


#### Load Datasets

In [3]:
# Load Yahoo-Finance-News-Sentences dataset from Huggingface
# (load_dataset returns a mapping of splits; only 'train' is used below)
yahoo_ds = load_dataset("ugursa/Yahoo-Finance-News-Sentences")

# Convert the 'train' split to a Pandas DataFrame for easier manipulation
yahoo_df = pd.DataFrame(yahoo_ds['train'])

# Preview the first rows (label and raw sentence text)
yahoo_df.head()

Unnamed: 0,label,text
0,0,Chinese-owned companies are aggressively expan...
1,0,Chinese cobalt producers have seemed unfazed b...
2,0,"China's CMOC Group, which boosted its cobalt o..."
3,0,CMOC is due to lift its market share of the gl...
4,1,Its Kisanfu mine in Democratic Republic of Con...


#### Preprocess Data

In [4]:
# Resources shared by every preprocess_text call. They are built lazily on the
# first call and then reused, instead of being rebuilt per token / per sentence.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached stop-word set, lemmatizer and punctuation table."""
    if not _PREPROCESS_RESOURCES:
        # A frozenset gives constant-time membership tests; the corpus reader
        # would otherwise be asked for the full word list once per token.
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['punct_table'] = str.maketrans('', '', string.punctuation)
    return _PREPROCESS_RESOURCES


# Function to preprocess text
def preprocess_text(text):
    """Normalize one sentence for vectorization.

    Steps, in order: lowercase, delete every character in
    ``string.punctuation`` (so hyphenated words are joined, e.g.
    "one-two-three" -> "onetwothree"), tokenize with ``word_tokenize``,
    drop English stop words, and lemmatize each remaining token.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    resources = _get_preprocess_resources()
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(resources['punct_table'])
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = resources['stop_words']
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = resources['lemmatizer']
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(tokens)

# Build the processed frame without touching yahoo_df: assign() returns a new
# DataFrame whose 'text' column holds the cleaned sentences
yahoo_df_processed = yahoo_df.assign(text=yahoo_df['text'].apply(preprocess_text))

# Preview the cleaned text
yahoo_df_processed.head()

Unnamed: 0,label,text
0,0,chineseowned company aggressively expanding co...
1,0,chinese cobalt producer seemed unfazed oversup...
2,0,china cmoc group boosted cobalt output 144 fir...
3,0,cmoc due lift market share global mined cobalt...
4,1,kisanfu mine democratic republic congo drc par...


#### Vectorize Data with Pre-Trained Word2Vec Model

Convert data into lists of words compatible with Word2Vec

In [5]:
# Split each cleaned sentence on whitespace into a list of word tokens
yahoo_df_processed['tokenized_text'] = yahoo_df_processed['text'].apply(str.split)

Load the pre-trained Word2Vec model (the embeddings file `GoogleNews-vectors-negative300.bin.gz` must already be present in the `./data` folder)

In [6]:
# Load from word2vec_path, the same path whose existence was verified in the
# "Check for Necessary Folders & Files" cell, so the check and the load cannot
# drift apart. binary=True because the file is in the binary word2vec format.
word_vectors = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

Function to vectorize a sentence by averaging the Word2Vec vectors of its in-vocabulary tokens

In [7]:
def vectorize_sentence(tokens, model=word_vectors, vector_size=300):
    """Turn a token list into one fixed-length sentence vector.

    Args:
        tokens: Iterable of word strings.
        model: Embedding lookup supporting ``word in model`` and ``model[word]``;
            defaults to the notebook's loaded ``word_vectors``.
        vector_size: Length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean of the vectors of all tokens found in ``model``,
        or ``np.zeros(vector_size)`` if none of the tokens is in the model.
    """
    known_vectors = []
    for token in tokens:
        # Out-of-vocabulary tokens are skipped rather than raising KeyError
        if token in model:
            known_vectors.append(model[token])

    if not known_vectors:
        return np.zeros(vector_size)
    return np.mean(known_vectors, axis=0)

Apply vectorization to each entry

In [8]:
# Each row's token list is passed to vectorize_sentence (with its default model
# and vector_size) and the resulting vector is stored in a new 'wordvec' column
yahoo_df_processed['wordvec'] = yahoo_df_processed['tokenized_text'].apply(vectorize_sentence)

Print head again to check data

In [9]:
# Preview the first rows, now including the tokenized_text and wordvec columns
yahoo_df_processed.head()

Unnamed: 0,label,text,tokenized_text,wordvec
0,0,chineseowned company aggressively expanding co...,"[chineseowned, company, aggressively, expandin...","[-0.024353027, 0.045820758, 0.016967773, 0.040..."
1,0,chinese cobalt producer seemed unfazed oversup...,"[chinese, cobalt, producer, seemed, unfazed, o...","[-0.054423742, 0.0140860425, -0.028483799, 0.0..."
2,0,china cmoc group boosted cobalt output 144 fir...,"[china, cmoc, group, boosted, cobalt, output, ...","[-0.072459504, 0.04049862, 0.0203642, 0.070950..."
3,0,cmoc due lift market share global mined cobalt...,"[cmoc, due, lift, market, share, global, mined...","[-0.020656586, -0.015205383, 0.00045776367, 0...."
4,1,kisanfu mine democratic republic congo drc par...,"[kisanfu, mine, democratic, republic, congo, d...","[-0.057576496, -0.025512695, 0.17758179, 0.144..."


#### Save to CSV

In [10]:
# Persist the processed frame under ./data; index=False leaves the row index out of the file.
# NOTE(review): the list/array-valued columns (tokenized_text, wordvec) are
# presumably written as their text representation — confirm that whatever
# reads this file parses them back as intended.
yahoo_df_processed.to_csv('./data/yahoo_processed.csv', index=False)

#### Run Logistic Regression model

Load the preprocessed data

In [11]:
# Read back the processed CSV from ./data (same path the "Save to CSV" step writes to)
data = pd.read_csv('./data/yahoo_processed.csv')

Prepare the data

In [12]:
# Each 'wordvec' cell is text: drop the surrounding brackets, parse the
# whitespace-separated numbers, then stack the rows into one 2-D feature matrix
X = np.stack([
    np.fromstring(vector_text.strip('[]'), sep=' ')
    for vector_text in data['wordvec']
])
# Target labels, in the same row order as X
y = data['label'].values

Split the data

In [13]:
# Hold out 20% of the rows for testing (test_size=0.2); random_state=42 fixes
# the shuffle so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Train the logistic regression model

In [14]:
# Fit a logistic regression classifier on the training split.
# max_iter=1000 sets the solver's iteration cap — presumably raised so the
# optimizer converges on these features; TODO confirm.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

Evaluate the model

In [15]:
# Predict labels for the held-out split, then print the per-class
# precision / recall / F1 table comparing them with the true labels
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.62      0.66      1444
           1       0.77      0.84      0.80      2185
           2       0.68      0.66      0.67      1378

    accuracy                           0.73      5007
   macro avg       0.72      0.71      0.71      5007
weighted avg       0.72      0.73      0.72      5007



Save report and model in ./models

In [16]:
# Same metrics as the printed report, but as a dict so they can be tabulated
report = classification_report(y_test, y_pred, output_dict=True)
# Transpose so each class / average becomes a row and each metric a column
df_report = pd.DataFrame(report).T
df_report.to_csv('./models/yahoo_log_reg_report.csv', index=True)

# Serialize the fitted classifier alongside its report
joblib.dump(model, './models/yahoo_log_reg.pkl')

['./models/yahoo_log_reg.pkl']

#### Unit Tests

In [17]:
import unittest

class TestYahooSentimentAnalysis(unittest.TestCase):
    def setUp(self):
        # Load a sample from the dataset for testing
        self.dataset = load_dataset("ugursa/Yahoo-Finance-News-Sentences", split='train[:1%]')

    def test_data_loading(self):
        """Ensure the dataset is loaded correctly."""
        self.assertIsNotNone(self.dataset)
        self.assertIn('text', self.dataset.column_names)

    def test_text_preprocessing(self):
        """Check that text preprocessing removes punctuation and lowercases correctly."""
        sample_text = "Hello WORLD! Testing, one-two-three."
        expected_output = "hello world testing onetwothree"
        processed_text = preprocess_text(sample_text)
        self.assertEqual(processed_text, expected_output)

    def test_vectorization(self):
        """Ensure that vectorization returns vectors of expected length."""
        sample_tokens = ["hello", "world", "unknownword"]
        vector = vectorize_sentence(sample_tokens, model=word_vectors, vector_size=300)
        self.assertEqual(len(vector), 300)
        self.assertFalse(np.all(vector == 0))  # Not all zeros

    def test_model_predictions(self):
        """Check if the model can predict on a small set of data."""
        sample_text = ["hello world", "test sentence"]
        tokenized_text = [text.split() for text in sample_text]
        vectors = np.array([vectorize_sentence(tokens) for tokens in tokenized_text])
        
        # Assuming 'model' is already trained and loaded
        predictions = model.predict(vectors)
        self.assertEqual(len(predictions), 2)

# Running the tests
# argv=[''] gives unittest an explicit argument list instead of the process's
# sys.argv, and exit=False keeps unittest.main from calling sys.exit() once the
# run finishes; verbosity=2 prints one line per test.
if __name__ == "__main__":
    unittest.main(argv=[''], exit=False, verbosity=2)

test_data_loading (__main__.TestYahooSentimentAnalysis.test_data_loading)
Ensure the dataset is loaded correctly. ... ok
test_model_predictions (__main__.TestYahooSentimentAnalysis.test_model_predictions)
Check if the model can predict on a small set of data. ... ok
test_text_preprocessing (__main__.TestYahooSentimentAnalysis.test_text_preprocessing)
Check that text preprocessing removes punctuation and lowercases correctly. ... ok
test_vectorization (__main__.TestYahooSentimentAnalysis.test_vectorization)
Ensure that vectorization returns vectors of expected length. ... ok

----------------------------------------------------------------------
Ran 4 tests in 3.917s

OK
