#### Feature extraction: Bag-of-Words
#### Sentiment analysis model: SVM

In [2]:
import tkinter as tk
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
# Load the preprocessed DataFrame. The steps below read the 'body' column
# (comment text) and the 'predicted_sentiment' column (class label).
df = pd.read_csv('preprocessed_output.csv')

# pandas parses empty CSV cells as NaN (a float), which word_tokenize cannot
# handle. Treat a missing body as an empty document instead of crashing.
body_text = df['body'].fillna('').astype(str)

# Tokenization and lemmatization
lemmatizer = WordNetLemmatizer()
df['lemmatized_tokens'] = body_text.apply(
    lambda text: [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
)

# Extract features using Bag-of-Words. CountVectorizer expects one string per
# document, so the token lists are joined back together with spaces.
# NOTE(review): the vocabulary is fit on all rows before the train/test split;
# fit on the training rows only if a strictly held-out evaluation is needed.
vectorizer_bow = CountVectorizer()
bow_matrix = vectorizer_bow.fit_transform(df['lemmatized_tokens'].apply(' '.join))

if bow_matrix.getnnz() == 0:
    raise ValueError("Bag-of-Words matrix is empty; check your preprocessing steps.")

# Scale every document vector to unit norm so long and short comments are
# comparable. Anything passed to the model at prediction time must be scaled
# the same way.
bow_matrix_normalized = normalize(bow_matrix)  # Normalize for consistent scale

# Display the shape of the Bag-of-Words matrix
print("Bag-of-Words Matrix Shape:", bow_matrix_normalized.shape)

# Split the data into training and testing sets
# NOTE(review): the recorded classification report shows imbalanced classes
# (POS support is 134 of 1382); consider stratify=y for a representative split.
y = df['predicted_sentiment']
X_train, X_test, y_train, y_test = train_test_split(bow_matrix_normalized, y, test_size=0.2, random_state=42)

# Initialize and train the SVM model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

Bag-of-Words Matrix Shape: (6908, 7602)


In [4]:
from sklearn.metrics import accuracy_score, classification_report

# Run the fitted SVM over both splits first, then score them together.
y_train_pred = svm_model.predict(X_train)
y_test_pred = svm_model.predict(X_test)

# Accuracy on the data the model was fitted on vs. the held-out data.
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Per-class precision / recall / F1 for the held-out split.
classification_rep = classification_report(y_test, y_test_pred)

# Summary: configuration header, both accuracies, then the full report.
print("Feature Extraction: BOW", "Model: SVM", sep="\n")
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)
print("\nClassification Report:\n", classification_rep)

Feature Extraction: BOW
Model: SVM
Training Accuracy: 0.814694173000362
Testing Accuracy: 0.6490593342981187

Classification Report:
               precision    recall  f1-score   support

         NEG       0.66      0.74      0.70       663
         NEU       0.64      0.64      0.64       585
         POS       0.66      0.26      0.37       134

    accuracy                           0.65      1382
   macro avg       0.65      0.55      0.57      1382
weighted avg       0.65      0.65      0.64      1382



In [8]:
import tkinter as tk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
import emoji
import re

# One-time setup: download the NLTK 'wordnet' resource used by the lemmatizer.
# Uncomment and run the two lines below once per environment.
# import nltk
# nltk.download('wordnet')

# Lemmatizer used by tokenize_and_lemmatize() further down in this cell.
# This rebinds the `lemmatizer` name that the training cell also creates.
lemmatizer = WordNetLemmatizer()


# Emoticon to emoji mapping.
# Each ASCII emoticon maps to the Unicode emoji that stands in for it.
# Every key appears exactly once (a repeated key in a dict literal is silently
# overwritten, which hides typos), and keys are listed in a fixed order because
# dict iteration order is insertion order.
emoticon_to_emoji = {
    ':)': '😊', ':D': '😃', ':]': '😃', ':(': '😞', ':/': '😕', ':|': '😐',
    ':-)': '😊', ':-D': '😃', ';)': '😉', ":'(": '😢', ':-/': '😕', ':-|': '😐',
    ':-P': '😜', ':-O': '😲', ':O': '😲', ':*': '😘', '<3': '❤️', ':-$': '🤑',
    ':-!': '😤', ':-(': '😞', ':-[': '😟', ':-@': '😠', ':-#': '🤐', ':-*': '😗',
    ':^)': '😊', '8-)': '😎', '>:(': '😡', ':-\\': '😕', ':-&': '😤',
    'O:-)': '😇', ':-X': '🤐', '=)': '😊', '=D': '😃',
    'XD': '😆', ':-]': '😃', ':->': '😃', ':-o': '😲',
    ';-)': '😉', '(-:': '😃', '(-_-)': '😑',
    '=]': '😃', ':3': '😺', ':c)': '😺', ':>': '😃', ':}': '😃',
    'B-)': '😎', '8-D': '😃', '>:D': '😡', 'X-D': '😆', 'x-D': '😆',
    'X)': '😆', 'x)': '😆', 'X3': '😺', 'x3': '😺', ':-Q': '😖', '=p': '😛',
    ':-j': '😒', ':-L': '😒',
    '=\\': '😕',
}

# Slang / idiom -> plain-English expansion.
# Keys are lowercase and may span several words or include punctuation.
sentiment_slang_dict = {
    # exclamation
    'omg': 'Oh my God',

    # positive
    'beauty, eh': 'excellent, right?',
    'all smiles': 'very happy',
    'pumped': 'excited',
    'over the moon': 'extremely happy',

    # anger
    'hot under the collar': 'angry',
    'pissed off': 'very angry',
    'bent out of shape': 'upset',
    'seeing red': 'becoming very angry',

    # disappointment / sadness / frustration
    'rough day, eh?': 'difficult day, right?',
    'not impressed': 'unimpressed',
    'down in the dumps': 'feeling sad',
    'going through a rough patch': 'experiencing a difficult time',
    'fed up': 'frustrated or annoyed',
}

def replace_emoticons_with_emojis(text, mapping=None):
    """Replace every known ASCII emoticon in ``text`` with its emoji.

    All emoticons are matched in a single left-to-right pass, trying longer
    emoticons before shorter ones. This matters because several emoticons end
    with another, shorter emoticon ('>:(' ends with ':(', 'O:-)' ends with
    ':-)'); replacing them one after another would rewrite the short form
    first and the long form would never be found.

    Args:
        text: The string to process.
        mapping: Optional emoticon -> replacement dict. Defaults to the
            module-level ``emoticon_to_emoji`` table.

    Returns:
        A new string with the emoticons substituted. ``text`` is returned
        unchanged when the mapping is empty.
    """
    if mapping is None:
        mapping = emoticon_to_emoji
    if not mapping:
        return text

    # sorted() is stable, so emoticons of equal length keep the mapping's order.
    longest_first = sorted(mapping, key=len, reverse=True)
    # re.escape is required: emoticons are made of regex metacharacters.
    pattern = re.compile('|'.join(re.escape(emoticon) for emoticon in longest_first))
    return pattern.sub(lambda match: mapping[match.group(0)], text)

def _expand_slang(text, slang):
    """Replace each slang phrase found in ``text`` with its expansion.

    Matching is phrase-level and case-insensitive, so multi-word keys such as
    'over the moon' and capitalised input such as 'OMG' are both recognised.
    A phrase only matches when it is not glued to a neighbouring word
    character, so 'pumped' does not match inside a longer word.
    """
    if not slang:
        return text
    expansions = {phrase.lower(): plain for phrase, plain in slang.items()}
    # Longest phrases first so a short key cannot pre-empt a longer one.
    alternatives = '|'.join(
        re.escape(phrase) for phrase in sorted(expansions, key=len, reverse=True)
    )
    pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)', re.IGNORECASE)
    # .get() fallback: leave the text untouched if case-folding of an unusual
    # character yields a key that is not in the table.
    return pattern.sub(
        lambda match: expansions.get(match.group(0).lower(), match.group(0)), text
    )


def analyze_sentiment(model, vectorizer, post, result_label, tokens_label):
    """Classify the text in the ``post`` widget and show the result.

    Pipeline: emoticons -> emojis -> emoji names -> punctuation clean-up ->
    slang expansion -> lowercase -> lemmatize -> vectorize -> normalize ->
    predict.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        post: Tk ``Text`` widget holding the user's input.
        result_label: Widget whose text is set to the predicted sentiment.
        tokens_label: Widget whose text is set to the processed tokens.

    Returns:
        None. The two labels are updated in place.
    """
    # Local import keeps this function usable even if the training cell's
    # imports are not in scope.
    from sklearn.preprocessing import normalize

    post_text = post.get("1.0", "end-1c")  # Get text from the Text widget

    # Nothing to classify: do not show a prediction for an empty document.
    if not post_text.strip():
        result_label.config(text="Sentiment: (no text entered)")
        tokens_label.config(text="Tokens: ")
        return

    # Replace emoticons with emojis (case-sensitive, so do this before lowering)
    post_text = replace_emoticons_with_emojis(post_text)

    # Turn emojis into their ':name:' codes
    post_text = emoji.demojize(post_text)

    # Add a space between adjacent ':name:' codes. This has to run after
    # demojize, because before it there are no ':name:' codes to separate.
    post_text = re.sub(r'(:[^\s:]+:)', r'\1 ', post_text)

    post_text = post_text.replace(':', ' ').replace('_', ' ').replace('!', '').replace('#', ' ')

    # Collapse whitespace runs so multi-word slang phrases can match
    post_text = ' '.join(post_text.split())

    # Apply sentiment slang dictionary at phrase level, then lowercase.
    # NOTE(review): lowercasing before lemmatization assumes the training text
    # was lowercase, as the displayed DataFrame rows suggest; confirm.
    post_text = _expand_slang(post_text, sentiment_slang_dict).lower()

    # Tokenize and lemmatize the text (including emoji names)
    processed_text, tokens = tokenize_and_lemmatize(post_text)

    # Use the provided vectorizer to transform the processed text
    post_matrix = vectorizer.transform([processed_text])

    # The model was fitted on row-normalized vectors, so the input must be
    # scaled the same way or the decision values are on a different scale.
    post_matrix = normalize(post_matrix)

    # Use the SVM model for prediction
    prediction = model.predict(post_matrix)

    result_label.config(text=f"Sentiment: {prediction[0]}")
    tokens_label.config(text=f"Tokens: {', '.join(tokens)}")

def tokenize_and_lemmatize(text):
    """Split ``text`` on whitespace, expand emojis, and lemmatize each piece.

    Every whitespace-separated chunk is passed through ``emoji.demojize`` and
    split again, then each resulting piece is lemmatized with the module-level
    ``lemmatizer``.

    Returns:
        A ``(processed_text, lemmatized_tokens)`` tuple: the lemmas joined by
        single spaces, and the same lemmas as a list.
    """
    lemmas = [
        lemmatizer.lemmatize(piece)
        for chunk in text.split()
        for piece in emoji.demojize(chunk).split()
    ]
    return ' '.join(lemmas), lemmas

# ---- Main window --------------------------------------------------------
root = tk.Tk()
root.title("Project Group-10- Sentiment Analysis Tool")

# ---- Widgets ------------------------------------------------------------
# Multi-line input box for the text to classify.
post = tk.Text(root, wrap="word", width=50, height=3, font=('Arial', 12))

# Static description of the pipeline behind the button.
feature_info_label = tk.Label(
    root,
    text="Feature Extractor: Bag-of-Words\nModel: SVM",
    font=('Arial', 12),
)

# Output labels; both start empty and are filled in by analyze_sentiment.
result_label = tk.Label(root, text="", font=('Arial', 14))
tokens_label = tk.Label(root, text="", font=('Arial', 12))

# The button runs the trained model and fitted vectorizer on the input box.
analyze_button = tk.Button(
    root,
    text="Analyze Sentiment",
    command=lambda: analyze_sentiment(
        svm_model, vectorizer_bow, post, result_label, tokens_label
    ),
    font=('Arial', 12),
)

# ---- Layout (grid, listed top to bottom) --------------------------------
post.grid(row=0, column=0, padx=10, pady=10)
feature_info_label.grid(row=0, column=1, padx=10, pady=10)
analyze_button.grid(row=1, column=0, pady=10)
result_label.grid(row=2, column=0, pady=10)
tokens_label.grid(row=3, column=0, pady=10)

# Blocks this cell until the window is closed.
root.mainloop()


In [6]:
# Inspect the DataFrame; 'lemmatized_tokens' is the column added by the
# feature-extraction cell.
df

Unnamed: 0,body,score,comment_length,predicted_sentiment,tokens,stemmed_tokens,lemmatized_tokens
0,well there are thousands of international stud...,1,89,NEG,"['well', 'thousands', 'international', 'studen...","['well', 'thousand', 'intern', 'student', 'ill...","[well, there, are, thousand, of, international..."
1,the article said dude needed a translator lol ...,1,114,NEG,"['article', 'said', 'dude', 'needed', 'transla...","['articl', 'said', 'dude', 'need', 'translat',...","[the, article, said, dude, needed, a, translat..."
2,for those convicted of crimes thats good,1,42,POS,"['convicted', 'crimes', 'thats', 'good']","['convict', 'crime', 'that', 'good']","[for, those, convicted, of, crime, thats, good]"
3,good gotta bump up those rookie numbers,1,41,POS,"['good', 'got', 'ta', 'bump', 'rookie', 'numbe...","['good', 'got', 'ta', 'bump', 'rooki', 'number']","[good, got, ta, bump, up, those, rookie, number]"
4,good,1,5,POS,['good'],['good'],[good]
...,...,...,...,...,...,...,...
6903,i like you big fan,1,20,POS,"['like', 'big', 'fan']","['like', 'big', 'fan']","[i, like, you, big, fan]"
6904,sounds like date night around here frowning fa...,1,37,POS,"['sounds', 'like', 'date', 'night', 'around', ...","['sound', 'like', 'date', 'night', 'around', '...","[sound, like, date, night, around, here, frown..."
6905,apparently the answer was yes,1,29,NEU,"['apparently', 'answer', 'yes']","['appar', 'answer', 'ye']","[apparently, the, answer, wa, yes]"
6906,but with maaaaaassssksssss,1,26,NEU,['maaaaaassssksssss'],['maaaaaassssksssss'],"[but, with, maaaaaassssksssss]"
