#### Feature extraction: BERT
#### Sentiment analysis model: SVM

In [2]:
import tkinter as tk
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
from transformers import BertTokenizer, BertModel

In [3]:

# Read the already-preprocessed posts from disk
df = pd.read_csv('preprocessed_output.csv')

# The tokenizer and the encoder are both loaded from the same pretrained checkpoint
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = BertModel.from_pretrained(bert_checkpoint)

# Register '[PAD]' as the tokenizer's padding token.
# NOTE(review): this checkpoint presumably ships with '[PAD]' already defined - confirm whether the call is needed
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def extract_bert_embeddings(text):
    """Embed ``text`` with BERT and return the mean-pooled vector as a NumPy array.

    The text is tokenized (truncated to the tokenizer's limit), passed through
    ``bert_model`` with gradient tracking disabled, and the last hidden state is
    averaged over dimension 1. Singleton dimensions are then squeezed away, so
    a single input string yields a 1-D array.

    Args:
        text: The text to embed; forwarded unchanged to ``tokenizer``.

    Returns:
        NumPy array holding the pooled embedding.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only: skip building the autograd graph
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state

    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()


In [4]:
# Embed every post body with BERT, then wrap each resulting NumPy vector as a PyTorch tensor
body_vectors = df['body'].apply(extract_bert_embeddings).tolist()
bert_embeddings = list(map(torch.from_numpy, body_vectors))

In [5]:
# Pile the per-post vectors into a single tensor, one row per post
bert_embeddings_tensor = torch.stack(bert_embeddings, dim=0)

In [6]:
# Scale the whole matrix by its single largest entry (one global factor, not per-feature)
global_max = bert_embeddings_tensor.max()
bert_matrix_normalized = bert_embeddings_tensor.div(global_max)

# Report the dimensions of the scaled matrix
print("BERT Matrix Shape:", bert_matrix_normalized.shape)

BERT Matrix Shape: torch.Size([6908, 768])


In [7]:
# Labels come from the 'predicted_sentiment' column; 20% of the rows are held out for testing
y = df['predicted_sentiment']
split_parts = train_test_split(
    bert_matrix_normalized,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Build a linear-kernel SVM and fit it on the training portion
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

In [8]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on both partitions first, then score them
y_train_pred = svm_model.predict(X_train)
y_test_pred = svm_model.predict(X_test)

# Accuracy on the data the model was fitted on vs. the held-out data
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Text report of the remaining metrics for the held-out data
classification_rep = classification_report(y_test, y_test_pred)

# Summary: configuration, both accuracies, then the full report
print("Feature Extraction: BERT")
print("Model: SVM")
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)
print("\nClassification Report:\n", classification_rep)

Feature Extraction: BERT
Model: SVM
Training Accuracy: 0.8519724936663048
Testing Accuracy: 0.7337192474674384

Classification Report:
               precision    recall  f1-score   support

         NEG       0.77      0.78      0.78       663
         NEU       0.72      0.73      0.73       585
         POS       0.58      0.49      0.53       134

    accuracy                           0.73      1382
   macro avg       0.69      0.67      0.68      1382
weighted avg       0.73      0.73      0.73      1382



In [9]:
# Top-level application window
root = tk.Tk()
root.title("Project Group-10- Sentiment Analysis Tool")

# Fonts shared by the widgets below
body_font = ('Arial', 12)
result_font = ('Arial', 14)

# Three-line, word-wrapped input box for the post to classify
post = tk.Text(root, wrap="word", width=50, height=3, font=body_font)
post.grid(row=0, column=0, padx=10, pady=10)

# Static caption naming the feature extractor and the classifier
feature_info_label = tk.Label(
    root,
    text="Feature Extractor: BERT\nModel: SVM",
    font=body_font,
)
feature_info_label.grid(row=0, column=1, padx=10, pady=10)

# Trigger button; the lambda looks up analyze_sentiment only when clicked,
# so the function can be defined further down in this cell
analyze_button = tk.Button(
    root,
    text="Analyze Sentiment",
    command=lambda: analyze_sentiment(svm_model, post),
    font=body_font,
)
analyze_button.grid(row=1, column=0, pady=10)

# Initially empty label that analyze_sentiment fills with the prediction
result_label = tk.Label(root, text="", font=result_font)
result_label.grid(row=2, column=0, pady=10)

def analyze_sentiment(model, post):
    """Classify the text currently in ``post`` and show the label in ``result_label``.

    The text is embedded with ``extract_bert_embeddings`` and then divided by
    the same global maximum that was used to build ``bert_matrix_normalized``,
    so the classifier sees features on the scale it was trained on.

    Args:
        model: Fitted classifier exposing ``predict``; the button passes ``svm_model``.
        post: ``tk.Text`` widget holding the text to analyse.

    Returns:
        None. The outcome is written to ``result_label``.
    """
    # "end-1c" stops one character before the end, leaving out the trailing newline
    post_text = post.get("1.0", "end-1c")

    # Nothing meaningful to classify: prompt the user instead of predicting on blank input
    if not post_text.strip():
        result_label.config(text="Please enter some text to analyze.")
        return

    # Reuse the extractor used for the training data so tokenization and pooling match
    post_embedding = extract_bert_embeddings(post_text)

    # The SVM was fitted on embeddings divided by the training matrix's global max;
    # apply the identical scale here, otherwise the features are on a different scale
    training_scale = bert_embeddings_tensor.max().item()
    post_embedding = post_embedding / training_scale

    # predict expects a 2-D batch, so wrap the single vector in a list
    prediction = model.predict([post_embedding])

    result_label.config(text=f"Sentiment: {prediction[0]}")

# Start the Tkinter event loop so the window reacts to button clicks;
# this call does not return until the window is closed
root.mainloop()