#### Feature extraction: TF-IDF
#### Sentiment analysis model: SVM

In [3]:
import pandas as pd
import tkinter as tk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import ToktokTokenizer

In [4]:
import tkinter as tk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
import emoji
import re
# Load the preprocessed posts.  read_csv turns empty cells into NaN and the
# tokenizer below only accepts strings, so rows without a body are dropped at
# load time instead of crashing the lemmatization step.
df = (
    pd.read_csv('preprocessed_output.csv')
    .dropna(subset=['body'])
    .reset_index(drop=True)
)

# Tokenization and lemmatization: Toktok splits each body into tokens and the
# WordNet lemmatizer reduces every token to its lemma.
lemmatizer = WordNetLemmatizer()
tokenizer = ToktokTokenizer()
df['lemmatized_tokens'] = df['body'].apply(
    lambda body: [lemmatizer.lemmatize(token) for token in tokenizer.tokenize(body)]
)

# TfidfVectorizer expects one string per document, so glue the lemmas back together
df['text'] = df['lemmatized_tokens'].apply(' '.join)

# TF-IDF features
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split in the next cell, so the IDF weights are computed with the held-out
# documents included — confirm this is acceptable for the reported test score.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Row-normalize the TF-IDF matrix.  TfidfVectorizer already applies L2 row
# normalization by default, so this is effectively a no-op; it is kept because
# later cells refer to `tfidf_matrix_normalized`.
tfidf_matrix_normalized = normalize(tfidf_matrix)

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# identical from run to run.
y = df['predicted_sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix_normalized,
    y,
    random_state=42,
    test_size=0.2,
)

In [6]:
# Build a linear-kernel support vector classifier and fit it on the training
# split.  The fit call stays the last expression so the cell still shows the
# fitted estimator.
svm_model = SVC(
    random_state=42,
    kernel='linear',
)
svm_model.fit(X_train, y_train)

In [7]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on both partitions first, then derive every metric from those
# predictions so the model is queried exactly once per partition.
y_train_pred = svm_model.predict(X_train)
y_test_pred = svm_model.predict(X_test)

# Accuracy on seen vs. unseen data; a large gap between the two points to overfitting
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Per-class precision / recall / F1 on the held-out split
classification_rep = classification_report(y_test, y_test_pred)

# Report: configuration header, the two accuracies, then the per-class table
print("Feature Extraction: TF-IDF", "Model: SVM", sep="\n")
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)
print("\nClassification Report:\n", classification_rep)

Feature Extraction: TF-IDF
Model: SVM
Training Accuracy: 0.8736880202678249
Testing Accuracy: 0.6541244573082489

Classification Report:
               precision    recall  f1-score   support

         NEG       0.67      0.72      0.69       663
         NEU       0.63      0.67      0.65       585
         POS       0.63      0.29      0.40       134

    accuracy                           0.65      1382
   macro avg       0.65      0.56      0.58      1382
weighted avg       0.65      0.65      0.65      1382



In [8]:
# Assemble the Tk window.  Column 0 stacks the input box (row 0), the analyze
# button (row 1), the prediction (row 2) and the token read-out (row 3);
# column 1 of row 0 holds a static description of the pipeline.
root = tk.Tk()
root.title("Project Group-10- Sentiment Analysis Tool")

# Three-line, word-wrapped input box for the post to classify
post = tk.Text(root, font=('Arial', 12), height=3, width=50, wrap="word")
post.grid(column=0, row=0, padx=10, pady=10)

# Static panel naming the feature extractor and the classifier
feature_info_label = tk.Label(
    root,
    font=('Arial', 12),
    text="Feature Extractor: TF-IDF\nModel: SVM",
)
feature_info_label.grid(column=1, row=0, padx=10, pady=10)

# Token read-out; starts empty and is filled in by analyze_sentiment
tokens_label = tk.Label(root, font=('Arial', 12), text="")
tokens_label.grid(column=0, row=3, pady=10)

# Button that runs the classifier on the current contents of the input box.
# The lambda defers the lookup of analyze_sentiment until the button is
# clicked, which is why the function can be defined further down.
analyze_button = tk.Button(
    root,
    command=lambda: analyze_sentiment(svm_model, tfidf_vectorizer, post),
    font=('Arial', 12),
    text="Analyze Sentiment",
)
analyze_button.grid(column=0, row=1, pady=10)

# Prediction read-out; starts empty and is filled in by analyze_sentiment
result_label = tk.Label(root, font=('Arial', 14), text="")
result_label.grid(column=0, row=2, pady=10)

# Emoticon to emoji mapping.
# Each ASCII emoticon appears exactly once.  Repeating a key in a dict literal
# is legal but the later entry silently overwrites the earlier one, which makes
# the table hard to audit.  Keys are listed in the order in which they are
# first introduced, so iteration order is stable.
emoticon_to_emoji = {
    ':)': '😊', ':D': '😃', ':]': '😃', ':(': '😞', ':/': '😕', ':|': '😐',
    ':-)': '😊', ':-D': '😃', ';)': '😉', ':\'(': '😢', ':-/': '😕', ':-|': '😐',
    ':-P': '😜', ':-O': '😲', ':O': '😲', ':*': '😘', '<3': '❤️', ':-$': '🤑',
    ':-!': '😤', ':-(': '😞', ':-[': '😟', ':-@': '😠', ':-#': '🤐', ':-*': '😗',
    ':^)': '😊', '8-)': '😎', '>:(': '😡', ':-\\': '😕', ':-&': '😤',
    'O:-)': '😇', ':-X': '🤐', '=)': '😊', '=D': '😃',
    'XD': '😆', ':-]': '😃', ':->': '😃', ':-o': '😲',
    ';-)': '😉', '(-:': '😃', '(-_-)': '😑',
    '=]': '😃', ':3': '😺', ':c)': '😺', ':>': '😃', ':}': '😃',
    'B-)': '😎', '8-D': '😃', '>:D': '😡', 'X-D': '😆', 'x-D': '😆',
    'X)': '😆', 'x)': '😆', 'X3': '😺', 'x3': '😺', ':-Q': '😖', '=p': '😛',
    ':-j': '😒', ':-L': '😒',
    '=\\': '😕',
}

# Slang / idiom -> plain-English expansion.  Keys are lower-case phrases and
# several of them span more than one word.
sentiment_slang_dict = {
    # exclamations
    'omg': 'Oh my God',

    # positive
    'beauty, eh': 'excellent, right?',
    'all smiles': 'very happy',
    'pumped': 'excited',
    'over the moon': 'extremely happy',

    # angry
    'hot under the collar': 'angry',
    'pissed off': 'very angry',
    'bent out of shape': 'upset',
    'seeing red': 'becoming very angry',

    # down / frustrated
    'rough day, eh?': 'difficult day, right?',
    'not impressed': 'unimpressed',
    'down in the dumps': 'feeling sad',
    'going through a rough patch': 'experiencing a difficult time',
    'fed up': 'frustrated or annoyed',
}

def replace_emoticons_with_emojis(text, mapping=None):
    """Return ``text`` with every known ASCII emoticon swapped for its emoji.

    Emoticons are substituted longest first.  Several table entries contain a
    shorter entry as a substring ('>:(' contains ':(', 'O:-)' contains ':-)');
    replacing in plain table order would consume the short form first and the
    longer emoticon could then never match.

    Args:
        text: The string to rewrite.
        mapping: Optional emoticon -> replacement table.  Defaults to the
            module-level ``emoticon_to_emoji``.

    Returns:
        The rewritten string; text without emoticons is returned unchanged.
    """
    if mapping is None:
        mapping = emoticon_to_emoji
    # sorted() is stable, so emoticons of equal length keep their table order
    for emoticon in sorted(mapping, key=len, reverse=True):
        text = text.replace(emoticon, mapping[emoticon])
    return text

def tokenize_and_lemmatize(text):
    """Split ``text`` into lemmatized tokens.

    Every whitespace-separated chunk is passed through ``emoji.demojize`` and
    the result is split on whitespace again; each resulting piece is then
    lemmatized with the module-level ``lemmatizer``.

    NOTE(review): the training cell tokenizes with ToktokTokenizer while this
    function splits on whitespace — confirm the difference is intended.

    Returns:
        A ``(processed_text, lemmatized_tokens)`` pair: the lemmas joined by
        single spaces, and the same lemmas as a list.
    """
    lemmas = [
        lemmatizer.lemmatize(piece)
        for chunk in text.split()
        for piece in emoji.demojize(chunk).split()
    ]
    return ' '.join(lemmas), lemmas

def _expand_slang_phrases(text, slang_map):
    """Replace every slang phrase found in ``text`` with its expansion.

    Matching works on whole phrases rather than single tokens, so multi-word
    keys such as 'over the moon' are found.  It ignores case and tries longer
    phrases first so a short key cannot pre-empt a longer one.
    """
    if not slang_map:
        return text
    expansions = {phrase.lower(): plain for phrase, plain in slang_map.items()}
    alternatives = '|'.join(
        re.escape(phrase) for phrase in sorted(expansions, key=len, reverse=True)
    )
    # The lookarounds stop a key from matching in the middle of a longer word
    pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)', re.IGNORECASE)
    # Fall back to the matched text if case folding produced a form that is not
    # a key (possible for a few non-ASCII letters under IGNORECASE)
    return pattern.sub(
        lambda match: expansions.get(match.group(0).lower(), match.group(0)),
        text,
    )


def analyze_sentiment(model, tfidf_vectorizer, post):
    """Classify the text in the ``post`` widget and show the result in the GUI.

    Pipeline: emoticons -> emojis -> emoji names, punctuation clean-up, slang
    expansion, lemmatization, TF-IDF transform, row normalization, prediction.
    The outcome is written to the module-level ``result_label`` and
    ``tokens_label`` widgets; nothing is returned.

    Args:
        model: Fitted classifier exposing ``predict``.
        tfidf_vectorizer: Fitted vectorizer exposing ``transform``.
        post: Tk ``Text`` widget holding the user's input.
    """
    post_text = post.get("1.0", "end-1c")  # "end-1c" drops Tk's trailing newline

    # Replace emoticons with emojis, then turn every emoji into its textual name
    post_text = replace_emoticons_with_emojis(post_text)
    post_text = emoji.demojize(post_text)

    # Strip the shortcode punctuation so ':smiling_face:' becomes plain words.
    # Turning ':' into a space also separates adjacent emoji shortcodes, so no
    # extra spacing pass is needed.
    post_text = post_text.replace(':', ' ').replace('_', ' ').replace('!', '').replace('#', ' ')

    # Collapse whitespace so multi-word slang keys line up with the text, then
    # expand slang at phrase level (a per-token lookup can never match keys
    # that contain spaces)
    post_text = ' '.join(post_text.split())
    post_text = _expand_slang_phrases(post_text, sentiment_slang_dict)

    # Tokenize and lemmatize the text (including emoji names)
    processed_text, tokens = tokenize_and_lemmatize(post_text)

    # With nothing to classify the feature vector would be all zeros and the
    # predicted label meaningless, so report that instead of guessing
    if not tokens:
        result_label.config(text="Sentiment: N/A (no text to analyze)")
        tokens_label.config(text="Tokens: ")
        return

    # Same feature pipeline as training: TF-IDF transform, then row normalization
    post_tfidf = tfidf_vectorizer.transform([processed_text])
    post_tfidf_normalized = normalize(post_tfidf)

    prediction = model.predict(post_tfidf_normalized)

    result_label.config(text=f"Sentiment: {prediction[0]}")
    tokens_label.config(text=f"Tokens: {', '.join(tokens)}")

# Run the Tkinter event loop: hands control to Tk so the window built above
# is displayed and the "Analyze Sentiment" button can invoke its callback.
root.mainloop()
