## Financial Sentiment Analysis

##### CS6120 - Natural Language Processing

##### Anna Brunkhorst, Nader Lobandi, Ashish Magadum

#### Importing Necessary Libraries

In [1]:
# Importing necessary libraries
!pip install datasets
from datasets import load_dataset
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Download NLTK resources
# Each call fetches one named NLTK data package; the names presumably map to
# the tokenizer, stopword list and lemmatizer imported above — confirm against
# the installed NLTK version.
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')




[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abrun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abrun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abrun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Load Datasets

In [2]:
# Load Yahoo-Finance-News-Sentences dataset from Huggingface
# (identified by the "ugursa/Yahoo-Finance-News-Sentences" dataset ID)
ugursa_ds = load_dataset("ugursa/Yahoo-Finance-News-Sentences")

# Convert to Pandas DF for easier manipulation
# Only the 'train' entry of the loaded dataset is used.
ugursa_df = pd.DataFrame(ugursa_ds['train'])

# Check some data
# (last expression of the cell, so the notebook displays the first rows)
ugursa_df.head()

Unnamed: 0,label,text
0,0,Chinese-owned companies are aggressively expan...
1,0,Chinese cobalt producers have seemed unfazed b...
2,0,"China's CMOC Group, which boosted its cobalt o..."
3,0,CMOC is due to lift its market share of the gl...
4,1,Its Kisanfu mine in Democratic Republic of Con...


#### Preprocess Data

In [3]:
# Function to preprocess text
def preprocess_text(text):
    """Normalize a raw sentence into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenize with ``word_tokenize``, drop English
    stopwords, and lemmatize each remaining token with ``WordNetLemmatizer``.

    Args:
        text: The sentence to clean; must be a ``str``.

    Returns:
        The surviving lemmas joined by single spaces. An empty string is
        returned when no token survives.
    """
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation (characters are deleted, not replaced by a space,
    # so "chinese-owned" becomes "chineseowned")
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords. The stopword collection is built once per call and
    # stored as a set: evaluating stopwords.words('english') inside the
    # comprehension condition repeated the lookup for every single token.
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(word) for word in tokens)

# Build the preprocessed frame as a new object: every column of the raw frame
# is carried over and 'text' is replaced by its cleaned version, leaving
# ugursa_df itself untouched.
ugursa_df_pre = ugursa_df.assign(
    text=ugursa_df['text'].apply(preprocess_text)
)

# Show the first rows of the cleaned frame
ugursa_df_pre.head()

Unnamed: 0,label,text
0,0,chineseowned company aggressively expanding co...
1,0,chinese cobalt producer seemed unfazed oversup...
2,0,china cmoc group boosted cobalt output 144 fir...
3,0,cmoc due lift market share global mined cobalt...
4,1,kisanfu mine democratic republic congo drc par...


#### Vectorize Data with Pre-Trained Word2Vec Model

Install and import the gensim library

In [4]:
!pip install gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors


[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip




Convert data into lists of words compatible with Word2Vec

In [5]:
# Split each cleaned sentence on whitespace into a list of word tokens
ugursa_df_pre['tokenized_text'] = ugursa_df_pre['text'].apply(str.split)

Load the pre-trained Word2Vec model (the `GoogleNews-vectors-negative300.bin.gz` file must already be downloaded into the same folder as this notebook)

In [7]:
# Load the pre-trained vectors from a relative path, so the file is looked up
# in the current working directory; binary=True reads it as the binary
# word2vec format rather than the text format.
word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

Function to vectorize a sentence

In [8]:
def vectorize_sentence(tokens, model=word_vectors, vector_size=300):
    """Average the embeddings of the tokens that the model knows.

    Args:
        tokens: Iterable of word strings for one sentence.
        model: Lookup object supporting ``word in model`` and ``model[word]``;
            defaults to the ``word_vectors`` loaded earlier in the notebook.
        vector_size: Length of the zero vector returned as a fallback.

    Returns:
        The element-wise mean of the vectors of all tokens found in ``model``,
        or ``np.zeros(vector_size)`` when none of the tokens is found.
    """
    known_vectors = [model[token] for token in tokens if token in model]
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    # No token is in the vocabulary: fall back to an all-zero vector
    return np.zeros(vector_size)

Apply vectorization to each entry

In [9]:
# Compute one sentence vector per row from its token list; vectorize_sentence
# is called with its default model and vector_size.
ugursa_df_pre['wordvec'] = ugursa_df_pre['tokenized_text'].apply(vectorize_sentence)

Print head again to check data

In [10]:
# Display the first rows to confirm the tokenized_text and wordvec columns exist
ugursa_df_pre.head()

Unnamed: 0,label,text,tokenized_text,wordvec
0,0,chineseowned company aggressively expanding co...,"[chineseowned, company, aggressively, expandin...","[-0.024353027, 0.045820758, 0.016967773, 0.040..."
1,0,chinese cobalt producer seemed unfazed oversup...,"[chinese, cobalt, producer, seemed, unfazed, o...","[-0.054423742, 0.0140860425, -0.028483799, 0.0..."
2,0,china cmoc group boosted cobalt output 144 fir...,"[china, cmoc, group, boosted, cobalt, output, ...","[-0.072459504, 0.04049862, 0.0203642, 0.070950..."
3,0,cmoc due lift market share global mined cobalt...,"[cmoc, due, lift, market, share, global, mined...","[-0.020656586, -0.015205383, 0.00045776367, 0...."
4,1,kisanfu mine democratic republic congo drc par...,"[kisanfu, mine, democratic, republic, congo, d...","[-0.057576496, -0.025512695, 0.17758179, 0.144..."
