In [59]:
import nltk
from nltk.corpus import stopwords

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.utils import resample

In [60]:
# Download NLTK resources (if not already downloaded)
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [61]:
df = pd.read_csv('data/financial sentiment.csv', header=0)
df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


In [62]:
# Create a count table for the 'Sentiment' column
sentiment_counts = df['Sentiment'].value_counts()
sentiment_counts

Sentiment
neutral     3130
positive    1852
negative     860
Name: count, dtype: int64

In [63]:
# Preprocessing: Text cleaning and lemmatization
stop_words = set(stopwords.words('english'))

In [64]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and non-alphabetic characters
    text = ' '.join([word for word in nltk.word_tokenize(text) if word.isalpha()])
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Lemmatization
    lemmatizer = nltk.WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

df['processed_text'] = df['Sentence'].apply(preprocess_text)
df['processed_text']

0       geosolutions technology leverage benefon gps s...
1                             esi low bk real possibility
2       last quarter componenta net sale doubled perio...
3       according chamber commerce major construction ...
4       swedish buyout firm sold remaining percent sta...
                              ...                        
5837    rising cost forced packaging producer huhtamak...
5838    nordic walking first used summer training meth...
5839    according shipping company viking line eu deci...
5840    building home improvement trade sale decreased...
5841    helsinki afx kci konecranes said order four ho...
Name: processed_text, Length: 5842, dtype: object

In [65]:
df

Unnamed: 0,Sentence,Sentiment,processed_text
0,The GeoSolutions technology will leverage Bene...,positive,geosolutions technology leverage benefon gps s...
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative,esi low bk real possibility
2,"For the last quarter of 2010 , Componenta 's n...",positive,last quarter componenta net sale doubled perio...
3,According to the Finnish-Russian Chamber of Co...,neutral,according chamber commerce major construction ...
4,The Swedish buyout firm has sold its remaining...,neutral,swedish buyout firm sold remaining percent sta...
...,...,...,...
5837,RISING costs have forced packaging producer Hu...,negative,rising cost forced packaging producer huhtamak...
5838,Nordic Walking was first used as a summer trai...,neutral,nordic walking first used summer training meth...
5839,"According shipping company Viking Line , the E...",neutral,according shipping company viking line eu deci...
5840,"In the building and home improvement trade , s...",neutral,building home improvement trade sale decreased...


In [66]:
# Split the data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [67]:
# Handling imbalanced datasets: Upsample the minority class
df_neutral = train_df[train_df['Sentiment'] == 'neutral']
df_positive = train_df[train_df['Sentiment'] == 'positive']
df_negative = train_df[train_df['Sentiment'] == 'negative']

df_positive_upsampled = resample(df_positive, replace=True, n_samples=len(df_neutral), random_state=42)  # create additional copies of negative samples to balance the positive.
df_negative_upsampled = resample(df_negative, replace=True, n_samples=len(df_neutral), random_state=42)  
train_df_upsampled = pd.concat([df_neutral, df_positive_upsampled, df_negative_upsampled])

In [68]:
df_negative_upsampled

Unnamed: 0,Sentence,Sentiment,processed_text
1263,"$AAPL AAPL: Gundlach Slams iPad mini, Sees Dow...",negative,aapl aapl gundlach slam ipad mini see downside...
316,"ADP News - Feb 13 , 2009 - Finnish retailer Ke...",negative,adp news feb finnish retailer kesko oyj hel ke...
1251,Finnish retail software developer Aldata Solut...,negative,finnish retail software developer aldata solut...
3418,Earnings per share ( EPS ) amounted to EUR1 .3...,negative,earnings per share eps amounted
3456,"The contracts of the employees , 96 of whom ar...",negative,contract employee worker ended march august
...,...,...,...
153,"In January-June 2010 , diluted loss per share ...",negative,diluted loss per share stood versus first half
1165,"During the strike , Finnair estimates to incur...",negative,strike finnair estimate incur net loss per day
3214,$QCOR a little pullback is fine but if this er...,negative,qcor little pullback fine era today gain belie...
1987,"ADPnews - Jul 17 , 2009 - Finland-based steel ...",negative,adpnews jul steel maker rautaruukki oyj ruukki...


In [70]:
# Create a text classification pipeline
model = make_pipeline(CountVectorizer(), MultinomialNB())  # CountVectorizer for feature extraction and MultinomialNB (Naive Bayes) as the classifier.

# Train the model
model.fit(train_df_upsampled['processed_text'], train_df_upsampled['Sentiment'])

# Make predictions on the test set
predictions = model.predict(test_df['processed_text'])

# Evaluate the performance
accuracy = metrics.accuracy_score(test_df['Sentiment'], predictions)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.65


In [71]:
predictions

array(['neutral', 'positive', 'negative', ..., 'positive', 'positive',
       'neutral'], dtype='<U8')

In [72]:
# Test with a new example
new_example = ["This feels positive."]
new_example_processed = preprocess_text(new_example[0])
predicted_label = model.predict([new_example_processed])
print(f"Predicted Label: {predicted_label[0]}")

Predicted Label: positive
