In [85]:
import nltk
from nltk.corpus import stopwords

import pandas as pd
import numpy as np
from gensim.models import Word2Vec

from sklearn import metrics
from sklearn.svm import SVC
from sklearn.utils import resample
from sklearn.pipeline import make_pipeline
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [86]:
# Download NLTK resources (if not already downloaded)
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [87]:
df = pd.read_csv('data/financial sentiment.csv', header=0)
df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


In [88]:
# Create a count table for the 'Sentiment' column
sentiment_counts = df['Sentiment'].value_counts()
sentiment_counts

Sentiment
neutral     3130
positive    1852
negative     860
Name: count, dtype: int64

In [89]:
# Preprocessing: Text cleaning and lemmatization
stop_words = set(stopwords.words('english'))

In [90]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and non-alphabetic characters
    text = ' '.join([word for word in nltk.word_tokenize(text) if word.isalpha()])
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Lemmatization
    lemmatizer = nltk.WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

df['processed_text'] = df['Sentence'].apply(preprocess_text)
df['processed_text']

0       geosolutions technology leverage benefon gps s...
1                             esi low bk real possibility
2       last quarter componenta net sale doubled perio...
3       according chamber commerce major construction ...
4       swedish buyout firm sold remaining percent sta...
                              ...                        
5837    rising cost forced packaging producer huhtamak...
5838    nordic walking first used summer training meth...
5839    according shipping company viking line eu deci...
5840    building home improvement trade sale decreased...
5841    helsinki afx kci konecranes said order four ho...
Name: processed_text, Length: 5842, dtype: object

In [91]:
df

Unnamed: 0,Sentence,Sentiment,processed_text
0,The GeoSolutions technology will leverage Bene...,positive,geosolutions technology leverage benefon gps s...
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative,esi low bk real possibility
2,"For the last quarter of 2010 , Componenta 's n...",positive,last quarter componenta net sale doubled perio...
3,According to the Finnish-Russian Chamber of Co...,neutral,according chamber commerce major construction ...
4,The Swedish buyout firm has sold its remaining...,neutral,swedish buyout firm sold remaining percent sta...
...,...,...,...
5837,RISING costs have forced packaging producer Hu...,negative,rising cost forced packaging producer huhtamak...
5838,Nordic Walking was first used as a summer trai...,neutral,nordic walking first used summer training meth...
5839,"According shipping company Viking Line , the E...",neutral,according shipping company viking line eu deci...
5840,"In the building and home improvement trade , s...",neutral,building home improvement trade sale decreased...


In [92]:
# Split the data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [93]:
# Handling imbalanced datasets: Upsample the minority class
df_neutral = train_df[train_df['Sentiment'] == 'neutral']
df_positive = train_df[train_df['Sentiment'] == 'positive']
df_negative = train_df[train_df['Sentiment'] == 'negative']

df_positive_upsampled = resample(df_positive, replace=True, n_samples=len(df_neutral), random_state=42)  # create additional copies of negative samples to balance the positive.
df_negative_upsampled = resample(df_negative, replace=True, n_samples=len(df_neutral), random_state=42)  
train_df_upsampled = pd.concat([df_neutral, df_positive_upsampled, df_negative_upsampled])

In [94]:
len(df_neutral)

2508

In [95]:
len(df_positive_upsampled)

2508

In [96]:
len(df_negative_upsampled)

2508

In [97]:
train_df_upsampled

Unnamed: 0,Sentence,Sentiment,processed_text
1647,The floor area of the Yliopistonrinne project ...,neutral,floor area yliopistonrinne project sq sq ft bu...
1669,"no compensation for its news , opinions or dis...",neutral,compensation news opinion distribution
4577,This includes a EUR 39.5 mn change in the fair...,neutral,includes eur mn change fair value investment p...
3116,Product coverage : baked goods ; biscuits ; br...,neutral,product coverage baked good biscuit breakfast ...
2764,The investment will be worth approximately EUR...,neutral,investment worth approximately eur
...,...,...,...
153,"In January-June 2010 , diluted loss per share ...",negative,diluted loss per share stood versus first half
1165,"During the strike , Finnair estimates to incur...",negative,strike finnair estimate incur net loss per day
3214,$QCOR a little pullback is fine but if this er...,negative,qcor little pullback fine era today gain belie...
1987,"ADPnews - Jul 17 , 2009 - Finland-based steel ...",negative,adpnews jul steel maker rautaruukki oyj ruukki...


In [98]:
word2vec_model = Word2Vec.load('word2vec_model.model')

In [99]:
class MeanEmbeddingTransformer(TransformerMixin):
    def __init__(self, word2vec):
        if isinstance(word2vec, Word2Vec):
            self.word2vec = word2vec.wv
        else:
            self.word2vec = word2vec
        self.dim = self.word2vec.vector_size

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[word] for word in words if word in self.word2vec.key_to_index]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [100]:
def train_and_evaluate(model):
    # Train the model
    model.fit(train_df_upsampled['processed_text'], train_df_upsampled['Sentiment'])

    # Make predictions on the test set
    predictions = model.predict(test_df['processed_text'])

    # Evaluate the performance
    accuracy = metrics.accuracy_score(test_df['Sentiment'], predictions)
    precision = metrics.precision_score(test_df['Sentiment'], predictions, average='weighted')
    recall = metrics.recall_score(test_df['Sentiment'], predictions, average='weighted')
    f1_score = metrics.f1_score(test_df['Sentiment'], predictions, average='weighted')

    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1_score:.2f}")

    classification_report = metrics.classification_report(test_df['Sentiment'], predictions)
    print("Classification Report:\n", classification_report)

Logistic Regression

In [120]:
# Create a text classification pipeline with both TfidfVectorizer and MeanEmbeddingTransformer
model = Pipeline([
    ('features', FeatureUnion([
        ('tfidf', TfidfVectorizer()),
    ])),
    ('classifier', LogisticRegression(max_iter=1000)),
])
train_and_evaluate(model)

Accuracy: 0.69
Precision: 0.71
Recall: 0.69
F1 Score: 0.70
Classification Report:
               precision    recall  f1-score   support

    negative       0.40      0.51      0.45       175
     neutral       0.77      0.75      0.76       622
    positive       0.75      0.67      0.71       372

    accuracy                           0.69      1169
   macro avg       0.64      0.65      0.64      1169
weighted avg       0.71      0.69      0.70      1169



In [101]:
# Create a text classification pipeline with both TfidfVectorizer and MeanEmbeddingTransformer
model = Pipeline([
    ('features', FeatureUnion([
        ('tfidf', TfidfVectorizer()),
        ('word_embedding', MeanEmbeddingTransformer(word2vec_model)),
    ])),
    ('classifier', LogisticRegression(max_iter=1000)),
])

In [102]:
train_and_evaluate(model)

Accuracy: 0.69
Precision: 0.70
Recall: 0.69
F1 Score: 0.69
Classification Report:
               precision    recall  f1-score   support

    negative       0.38      0.47      0.42       175
     neutral       0.77      0.75      0.76       622
    positive       0.74      0.69      0.71       372

    accuracy                           0.69      1169
   macro avg       0.63      0.64      0.63      1169
weighted avg       0.70      0.69      0.69      1169



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Forest

In [103]:
model = Pipeline([
    ('features', FeatureUnion([
        ('tfidf', TfidfVectorizer()),
        ('word_embedding', MeanEmbeddingTransformer(word2vec_model)),
    ])),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [104]:
train_and_evaluate(model)

Accuracy: 0.54
Precision: 0.51
Recall: 0.54
F1 Score: 0.50
Classification Report:
               precision    recall  f1-score   support

    negative       0.11      0.06      0.08       175
     neutral       0.57      0.81      0.67       622
    positive       0.60      0.31      0.41       372

    accuracy                           0.54      1169
   macro avg       0.43      0.39      0.39      1169
weighted avg       0.51      0.54      0.50      1169



Gradient Boosting

In [115]:
model = Pipeline([
    ('features', FeatureUnion([
        ('tfidf', TfidfVectorizer()),
    ])),
    ('classifier', GradientBoostingClassifier(random_state=42)),
])

In [116]:
train_and_evaluate(model)

Accuracy: 0.66
Precision: 0.67
Recall: 0.66
F1 Score: 0.66
Classification Report:
               precision    recall  f1-score   support

    negative       0.37      0.46      0.41       175
     neutral       0.72      0.78      0.75       622
    positive       0.74      0.55      0.63       372

    accuracy                           0.66      1169
   macro avg       0.61      0.60      0.60      1169
weighted avg       0.67      0.66      0.66      1169



MultinomialNB

In [106]:
# Create a text classification pipeline
model = make_pipeline(CountVectorizer(), MultinomialNB())  # CountVectorizer for feature extraction and MultinomialNB (Naive Bayes) as the classifier.

In [107]:
train_and_evaluate(model)

Accuracy: 0.65
Precision: 0.70
Recall: 0.65
F1 Score: 0.67
Classification Report:
               precision    recall  f1-score   support

    negative       0.34      0.61      0.43       175
     neutral       0.80      0.70      0.75       622
    positive       0.70      0.58      0.64       372

    accuracy                           0.65      1169
   macro avg       0.61      0.63      0.61      1169
weighted avg       0.70      0.65      0.67      1169



SVM

In [108]:
# Create a text classification pipeline with both TfidfVectorizer and MeanEmbeddingTransformer
model = Pipeline([
    ('features', FeatureUnion([
        ('tfidf', TfidfVectorizer()),
        ('word_embedding', MeanEmbeddingTransformer(word2vec_model)),
    ])),
    ('classifier', SVC(kernel='linear')),
])

CountVectorizer simply counts the occurrences of terms in a document.  
TfidfVectorizer takes into account both the term frequency in the document and the inverse document frequency in the entire corpus. 

In [109]:
train_and_evaluate(model)

Accuracy: 0.68
Precision: 0.70
Recall: 0.68
F1 Score: 0.69
Classification Report:
               precision    recall  f1-score   support

    negative       0.37      0.49      0.42       175
     neutral       0.76      0.74      0.75       622
    positive       0.75      0.67      0.71       372

    accuracy                           0.68      1169
   macro avg       0.63      0.63      0.63      1169
weighted avg       0.70      0.68      0.69      1169



In [110]:
predictions

array(['neutral', 'positive', 'negative', ..., 'positive', 'positive',
       'neutral'], dtype='<U8')

In [111]:
# Test with a new example
new_example = ["This feels positive."]
new_example_processed = preprocess_text(new_example[0])
predicted_label = model.predict([new_example_processed])
print(f"Predicted Label: {predicted_label[0]}")

Predicted Label: positive
