<a href="https://colab.research.google.com/github/Emdya/Sentiment-Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt') #Downloading necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
text = "I love this movie!"
tokens = word_tokenize(text)
print(tokens)

['I', 'love', 'this', 'movie', '!']


In [4]:
lowercase_tokens = [token.lower() for token in tokens]
print(lowercase_tokens)

['i', 'love', 'this', 'movie', '!']


In [5]:
stopwords = set(stopwords.words('english'))
filtered_tokens = [token for token in lowercase_tokens if token not in stopwords]
print(filtered_tokens)

['love', 'movie', '!']


In [7]:
import re
cleaned_tokens = [re.sub(r'[^\w\s]','',token)for token in filtered_tokens]
print(cleaned_tokens)

['love', 'movie', '']


In [8]:
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token)for token in cleaned_tokens]
print(stemmed_tokens)

['love', 'movi', '']


In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [10]:
corpus = ["I love this movie!", "This movie is great.", "I don't like this movie."]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(X.toarray())

['don' 'great' 'is' 'like' 'love' 'movie' 'this']
[[0 0 0 0 1 1 1]
 [0 1 1 0 0 1 1]
 [1 0 0 1 0 1 1]]


In [11]:
corpus = ["I love this movie!","This movie is great.","I don't like this movie."]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(X.toarray())

['don' 'great' 'is' 'like' 'love' 'movie' 'this']
[[0.         0.         0.         0.         0.76749457 0.45329466
  0.45329466]
 [0.         0.6088451  0.6088451  0.         0.         0.35959372
  0.35959372]
 [0.6088451  0.         0.         0.6088451  0.         0.35959372
  0.35959372]]


# Financial Sentiment Analysis

In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score

#Load the dataset from CSV
data = pd.read_csv('data.csv')
X = data['Sentence']
y = data['Sentiment']

In [13]:
#Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
#Feature extraction
vectorizer = CountVectorizer()
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

In [15]:
#Train the SVM Classifier
clf = svm.SVC()
clf.fit(X_train_features, y_train)

In [16]:
#Make predictions
y_pred = clf.predict(X_test_features)

In [17]:
#Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6920444824636441


In [18]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd

In [19]:
#Split into the features into features (X) and labels (y)
sentences = data['Sentence'].values
labels = data['Sentiment'].apply(lambda x: 1 if x == 'positive' else 0).values

In [20]:
#Tokenize the sentences
tokenizer = Tokenizer(num_words = 5000, oov_token = '<OOV>')
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

In [22]:
#Pad the Sequences
padded_sequences = pad_sequences(sequences, padding = 'post')
#Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels,test_size = 0.2, random_state = 42)
#Define the neural network architecture
vocab_size = len(tokenizer.word_index) + 1 #Added +1 because of reserved 0 index for padding
embedding_dim = 100 #You can choose any size for the embedding_dim
max_length = len(max(sequences,key=len))

In [29]:
model = keras.Sequential()
model.add(Embedding(input_dim = vocab_size, output_dim = embedding_dim, input_length = max_length))
model.add(LSTM(units = 128))
model.add(Dense(units = 1, activation = 'sigmoid'))



In [26]:
#Compile the model
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [27]:
#Train the model
model.fit(X_train, y_train, epochs = 10, batch_size = 32)

Epoch 1/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 138ms/step - accuracy: 0.6925 - loss: 0.6281
Epoch 2/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 139ms/step - accuracy: 0.6854 - loss: 0.6261
Epoch 3/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 136ms/step - accuracy: 0.6810 - loss: 0.6273
Epoch 4/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 127ms/step - accuracy: 0.6907 - loss: 0.6191
Epoch 5/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 136ms/step - accuracy: 0.6817 - loss: 0.6277
Epoch 6/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 127ms/step - accuracy: 0.6761 - loss: 0.6303
Epoch 7/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 128ms/step - accuracy: 0.6756 - loss: 0.6309
Epoch 8/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 134ms/step - accuracy: 0.6920 - loss: 0.6206
Epoch 9/10
[1m1

<keras.src.callbacks.history.History at 0x79a430e59650>

In [28]:
#Evaluate the Model
loss,accuracy = model.evaluate(X_test, y_test)
print("Loss:", loss)
print("Accuracy: ", accuracy)


[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.6775 - loss: 0.6289
Loss: 0.625778079032898
Accuracy:  0.6817793250083923


# Financial Sentiment Analysis Part 2:

In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [33]:
#Load the dataset from CSV
data = pd.read_csv('data.csv')

In [34]:
#Split the dataset into features (X) and labels (y)
x = data['Sentence']
y = data['Sentiment']

In [35]:
#Convert labels to numerical values
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [36]:
#Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [37]:
#Feature extraction
vectorizer = CountVectorizer()
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

In [38]:
#Train the model
model = SVC()
model.fit(X_train_features, y_train)

In [39]:
#Make Predictions
y_pred = model.predict(X_test_features)

In [40]:
#Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred,average='weighted')
recall = recall_score(y_test, y_pred,average='weighted')
f1 = f1_score(y_test, y_pred,average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

In [41]:
print("Accuracy:",accuracy)
print("Precision:",precision)
print("Recall:",recall)
print("F1 Score:",f1)
print("Confusion Matrix:")
print(confusion_matrix)

Accuracy: 0.6920444824636441
Precision: 0.6579416519780992
Recall: 0.6920444824636441
F1 Score: 0.6506884603237336
Confusion Matrix:
<function confusion_matrix at 0x79a4978fd8a0>


In [44]:
#Hyperparameter tuning using GridSearchCV
parameters = {'C': [0.1,1,10],'kernel': ['linear','rbf']}
grid_search = GridSearchCV(model,parameters,cv=5)
grid_search.fit(X_train_features,y_train)

In [45]:
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

In [46]:
print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'C': 0.1, 'kernel': 'linear'}
Best Score: 0.6839242405157508
