# **Financial Text Sentiment Analysis**

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')

# **Importing Dataset**

In [2]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CSE3038/Project/Sentiment_Analysis.csv')
data.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


# **Exploratory Data Analysis**

In [3]:
data.shape

(5842, 2)

In [4]:
data.columns

Index(['Sentence', 'Sentiment'], dtype='object')

In [5]:
data.dtypes

Sentence     object
Sentiment    object
dtype: object

In [6]:
data.count()

Sentence     5842
Sentiment    5842
dtype: int64

In [7]:
data = data.drop_duplicates()
data.count()

Sentence     5836
Sentiment    5836
dtype: int64

In [8]:
data.isnull().sum()

Sentence     0
Sentiment    0
dtype: int64

In [9]:
data.describe()

Unnamed: 0,Sentence,Sentiment
count,5836,5836
unique,5322,3
top,Managing Director 's comments : `` Net sales f...,neutral
freq,2,3124


In [10]:
print(data.Sentiment.nunique())
print(data.Sentiment.unique())

3
['positive' 'negative' 'neutral']


In [11]:
data.Sentiment.value_counts()

neutral     3124
positive    1852
negative     860
Name: Sentiment, dtype: int64

# **Data Preprocessing**

In [12]:
le = preprocessing.LabelEncoder()
data['Sentiment'] = le.fit_transform(data['Sentiment'])
data['Sentiment']

0       2
1       0
2       2
3       1
4       1
       ..
5837    0
5838    1
5839    1
5840    1
5841    2
Name: Sentiment, Length: 5836, dtype: int64

In [None]:
#!pip install gensim

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
#from sklearn.feature_extraction.text import TfidfVectorizer
#import gensim

In [14]:
#token = RegexpTokenizer(r'[a-zA-Z0-9]+')
#cv = CountVectorizer(stop_words='english',ngram_range =(1,1),tokenizer = token.tokenize)
#X = cv.fit_transform(data['Sentence'])
#y = data['Sentiment']
#bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=2000,stop_words='english')
#X  = bow_vectorizer.fit_transform(data['Sentence'])
#X.shape

In [15]:
from string import punctuation, digits
import re
replacements = {"they're": "they are","she's":"she is","he's":"he is","doesn't":"does not","it's": "it is","that's": "that is",
                "we're": "we are","you're":"you are","i'm":"i am","don't":"do not","what's":"what is"}
def preprocess(text):
    text = re.compile(r'https?://\S+|www\.\S+').sub('', text)
    text = text.lower()
    text = re.sub(r'\d','',text)
    text = re.compile(r'\s+').sub(' ',text)
    for old, new in replacements.items():
        text = text.replace(old,new)
    text=re.compile(r'[^\w\s]').sub('',text)
    return text

In [16]:
data['Sentence'] = data['Sentence'].apply(preprocess)
data['Sentence']

0       the geosolutions technology will leverage bene...
1             esi on lows down  to  bk a real possibility
2       for the last quarter of  componenta s net sale...
3       according to the finnishrussian chamber of com...
4       the swedish buyout firm has sold its remaining...
                              ...                        
5837    rising costs have forced packaging producer hu...
5838    nordic walking was first used as a summer trai...
5839    according shipping company viking line  the eu...
5840    in the building and home improvement trade  sa...
5841    helsinki afx  kci konecranes said it has won a...
Name: Sentence, Length: 5836, dtype: object

In [17]:
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
bow = CountVectorizer(stop_words='english',ngram_range =(1,1),tokenizer = token.tokenize)
bow_data = pd.DataFrame(bow.fit_transform(data['Sentence']).todense())
bow_data.columns = sorted(bow.vocabulary_)
bow_data

Unnamed: 0,aa,aal,aaland,aalto,aaltonen,aapl,aaron,aava,aazhang,ab,...,znga,zoltan,zone,zoo,zs,zsl,zte,zu,zurich,zxx
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5832,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5833,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5834,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
data.columns

Index(['Sentence', 'Sentiment'], dtype='object')

In [19]:
X = bow_data
y = data['Sentiment']

In [20]:
#Creating a function for printing Accuracy Score, Confusion Matrix, Classification Report

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


def print_score(clf,X_train,y_train,X_test,y_test,train=True):
  if train:
    pred = clf.predict(X_train)
    clf_report = pd.DataFrame(classification_report(y_train,pred,output_dict=True))
    print("Train Result:\n=======================================================")
    print(f"Accuracy Score: {accuracy_score(y_train,pred)*100:.2f}%")
    print("_______________________________________________________")
    print(f"Classification Report:\n{clf_report}")
    print("_______________________________________________________")
    print(f"Confusion Matrix: \n{confusion_matrix(y_train,pred)}\n")

  elif train==False:
    pred = clf.predict(X_test)
    clf_report = pd.DataFrame(classification_report(y_test,pred,output_dict=True))
    print("Test result:\n========================================================")
    print(f"Accuracy Score: {accuracy_score(y_test,pred)*100:.2f}%")
    print("_______________________________________________________")
    print(f"Classification Report:\n {clf_report}")
    print("_______________________________________________________")
    print(f"Confusion Matrix:\n {confusion_matrix(y_test,pred)}\n")

# **Splitting Dataset into Train and Test Datasets**

In [21]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)

In [22]:
X_train.shape,X_test.shape

((4668, 10282), (1168, 10282))

# **Multinomial Naive Bayes**

In [None]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train,y_train)

print_score(mnb, X_train, y_train, X_test, y_test, train=True)
print_score(mnb, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 84.77%
_______________________________________________________
Classification Report:
                    0            1            2  accuracy    macro avg  \
precision    0.655667     0.871681     0.897643  0.847686     0.808331   
recall       0.651926     0.881611     0.883366  0.847686     0.805634   
f1-score     0.653791     0.876618     0.890448  0.847686     0.806952   
support    701.000000  2458.000000  1509.000000  0.847686  4668.000000   

           weighted avg  
precision      0.847635  
recall         0.847686  
f1-score       0.847626  
support     4668.000000  
_______________________________________________________
Confusion Matrix: 
[[ 457  194   50]
 [ 189 2167  102]
 [  51  125 1333]]

Test result:
Accuracy Score: 68.84%
_______________________________________________________
Classification Report:
                     0           1           2  accuracy    macro avg  \
precision    0.338983    0.770393    0.711246  0.688356     0.60

In [None]:
#def MNB_predictor():
  #text = input("Please enter any text related to Finance Market: ")
  #sentence = np.array([text])
  #sentence_token  = cv.transform(sentence)
  #print(MNB.predict(sentence_token))
#MNB_predictor()

# **Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train,y_train)
print_score(lr, X_train, y_train, X_test, y_test, train=True)
print_score(lr, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 90.06%
_______________________________________________________
Classification Report:
                    0            1            2  accuracy    macro avg  \
precision    0.798535     0.882620     0.970290    0.9006     0.883815   
recall       0.621969     0.948332     0.952286    0.9006     0.840862   
f1-score     0.699278     0.914297     0.961204    0.9006     0.858260   
support    701.000000  2458.000000  1509.000000    0.9006  4668.000000   

           weighted avg  
precision      0.898334  
recall         0.900600  
f1-score       0.897171  
support     4668.000000  
_______________________________________________________
Confusion Matrix: 
[[ 436  247   18]
 [ 101 2331   26]
 [   9   63 1437]]

Test result:
Accuracy Score: 67.81%
_______________________________________________________
Classification Report:
                     0           1           2  accuracy    macro avg  \
precision    0.230159    0.730874    0.735484  0.678082     0.56

# **Decision Tree Classification**

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)
print_score(dt, X_train, y_train, X_test, y_test, train=True)
print_score(dt, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 92.67%
_______________________________________________________
Classification Report:
                    0            1            2  accuracy    macro avg  \
precision    0.675024     0.999528     0.997353  0.926735     0.890635   
recall       0.998573     0.862083     0.998675  0.926735     0.953110   
f1-score     0.805524     0.925732     0.998013  0.926735     0.909756   
support    701.000000  2458.000000  1509.000000  0.926735  4668.000000   

           weighted avg  
precision      0.950094  
recall         0.926735  
f1-score       0.931046  
support     4668.000000  
_______________________________________________________
Confusion Matrix: 
[[ 700    0    1]
 [ 336 2119    3]
 [   1    1 1507]]

Test result:
Accuracy Score: 60.27%
_______________________________________________________
Classification Report:
                     0           1           2  accuracy    macro avg  \
precision    0.161290    0.708202    0.646552   0.60274     0.50

# **Support Vector Classification**

In [None]:
from sklearn.svm import SVC
svc = SVC(probability=True,random_state=42)
svc.fit(X_train,y_train)
print_score(svc, X_train, y_train, X_test, y_test, train=True)
print_score(svc, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 86.72%
_______________________________________________________
Classification Report:
                    0            1            2  accuracy    macro avg  \
precision    0.914498     0.816779     0.964059  0.867181     0.898445   
recall       0.350927     0.990236     0.906561  0.867181     0.749241   
f1-score     0.507216     0.895182     0.934426  0.867181     0.778942   
support    701.000000  2458.000000  1509.000000  0.867181  4668.000000   

           weighted avg  
precision      0.879064  
recall         0.867181  
f1-score       0.849607  
support     4668.000000  
_______________________________________________________
Confusion Matrix: 
[[ 246  416   39]
 [  12 2434   12]
 [  11  130 1368]]

Test result:
Accuracy Score: 67.38%
_______________________________________________________
Classification Report:
                     0           1           2  accuracy    macro avg  \
precision    0.229730    0.684028    0.778261  0.673801     0.56

# **Random Forest Classification**

In [23]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000, random_state=42,)
rf.fit(X_train,y_train)
print_score(rf, X_train, y_train, X_test, y_test, train=True)
print_score(rf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 92.67%
_______________________________________________________
Classification Report:
                    0            1            2  accuracy    macro avg  \
precision    0.854086     0.900795     0.996695  0.926735     0.917192   
recall       0.626248     0.967860     0.999337  0.926735     0.864482   
f1-score     0.722634     0.933124     0.998015  0.926735     0.884591   
support    701.000000  2458.000000  1509.000000  0.926735  4668.000000   

           weighted avg  
precision      0.924782  
recall         0.926735  
f1-score       0.922491  
support     4668.000000  
_______________________________________________________
Confusion Matrix: 
[[ 439  261    1]
 [  75 2379    4]
 [   0    1 1508]]

Test result:
Accuracy Score: 65.15%
_______________________________________________________
Classification Report:
                     0           1           2  accuracy    macro avg  \
precision    0.166667    0.694301    0.755556  0.651541     0.53

In [None]:
def RF_predictor():
  text = input("Please enter any text related to Finance Market: ")
  sentence = np.array([text])
  sentence_token  = cv.transform(sentence)
  print(rf.predict(sentence_token))
RF_predictor()