In [32]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import gensim.downloader
from scipy.sparse import hstack

import pickle

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

import time

In [33]:
annotated_data = pd.read_csv('data/incomplete_annotations_data2.csv')

# annotated_data = full_data[full_data['Subjectivity'].notnull()]
# unannotated_data = full_data[full_data['Subjectivity'].isnull()]

In [34]:
annotated_data['Subjectivity'].value_counts()

Subjectivity
1.0    1560
0.0    1123
Name: count, dtype: int64

In [35]:
annotated_data['Polarity'].value_counts()

Polarity
1.0    876
0.0    684
Name: count, dtype: int64

# Inter-annotation Agreement

In [36]:
annotated_data['Source'].value_counts()

Source
Reddit                                                                                        1024
Twitter                                                                                       1016
TikTok                                                                                         206
Instagram                                                                                      203
https://www.youtube.com//watch?v=rYATRV6W4iA&pp=ygUaRm9yZXZlciAyMSBhbmltYWwgdGVzdGluZyA%3D       5
                                                                                              ... 
https://www.youtube.com//watch?v=DiuII_0KvDk&pp=ygURTWFuZ28gY29tcGFyaXNvbiA%3D                   1
https://www.youtube.com//watch?v=7CkRsMmrUcI&pp=ygUOVmVyc2FjZSB3YXN0ZSA%3D                       1
https://www.youtube.com//watch?v=jcjoIOwnxKA&pp=ygURRGlvciBjb25zdW1lcmlzbSA%3D                   1
https://www.youtube.com//watch?v=zBLSn-nLFhM&pp=ygUPTWFuZ28gb3BpbmlvbnMg                         1
htt

In [37]:
from sklearn.metrics import cohen_kappa_score

subjectivity_labels1 = annotated_data['Subjectivity']
subjectivity_labels2 = annotated_data['Subjectivity 2']

kappa_subjectivity = cohen_kappa_score(subjectivity_labels1, subjectivity_labels2)

print("Cohen's kappa between annotator 1 and annotator 2 for subjectivity labels:", kappa_subjectivity)

Cohen's kappa between annotator 1 and annotator 2 for subjectivity labels: 0.8777546356018551


In [38]:
polarity_labels1 = annotated_data['Polarity']
polarity_labels2 = annotated_data['Polarity 2']

valid_indices = (subjectivity_labels1 == 1) & (subjectivity_labels2 == 1)
filtered_original_polarity = polarity_labels1[valid_indices]
filtered_new_polarity = polarity_labels2[valid_indices]
filtered_new_polarity_numeric = np.array(filtered_new_polarity, dtype=float) 

kappa_polarity = cohen_kappa_score(filtered_new_polarity_numeric, filtered_original_polarity)

print("Cohen's kappa between annotator 1 and annotator 2 for polarity labels:", kappa_polarity)

Cohen's kappa between annotator 1 and annotator 2 for polarity labels: 0.8582205823592879


# Preprocessing Data
- Lowercasing
- Removing stopwords
- Replacing emoji 
- Replace slang/abbreviations with their text counterparts

<!-- - Mispellings -->

In [39]:
annotated_data[annotated_data['Comment'].isnull()]

Unnamed: 0,Brand,Search Term,Comment,Source,Metadata,Subjectivity,Polarity,Subjectivity 2,Polarity 2
2590,JW Anderson,JW Anderson,,Instagram,{'Likes_and_timestamp': '0 likes on 2023-11-20...,0.0,,0.0,


In [40]:
annotated_data = annotated_data.dropna(subset=['Comment'])
print(annotated_data.isnull().sum())

Brand              103
Search Term        174
Comment              0
Source               0
Metadata           234
Subjectivity         0
Polarity          1122
Subjectivity 2       0
Polarity 2        1140
dtype: int64


In [41]:
with open('abbreviations_list.pkl', 'rb') as file:
    abbreviations = pickle.load(file)

print(abbreviations)

{"ain't": 'is not', "aren't": 'are not', "can't": 'cannot', "can't've": 'cannot have', "'cause": 'because', "could've": 'could have', "couldn't": 'could not', "couldn't've": 'could not have', "didn't": 'did not', "doesn't": 'does not', "don't": 'do not', "hadn't": 'had not', "hadn't've": 'had not have', "hasn't": 'has not', "haven't": 'have not', "he'd": 'he would', "he'd've": 'he would have', "he'll": 'he will', "he'll've": 'he he will have', "he's": 'he is', "how'd": 'how did', "how'd'y": 'how do you', "how'll": 'how will', "how's": 'how is', "I'd": 'I would', "I'd've": 'I would have', "I'll": 'I will', "I'll've": 'I will have', "I'm": 'I am', "I've": 'I have', "i'd": 'i would', "i'd've": 'i would have', "i'll": 'i will', "i'll've": 'i will have', "i'm": 'i am', "i've": 'i have', "isn't": 'is not', "it'd": 'it would', "it'd've": 'it would have', "it'll": 'it will', "it'll've": 'it will have', "it's": 'it is', "let's": 'let us', "ma'am": 'madam', "mayn't": 'may not', "might've": 'migh

In [42]:
# Creating extra column for preprocessed text
annotated_data['Preprocessed Comment'] = annotated_data['Comment']

In [43]:
# Normalizing emojis
import emoji

def demojize_with_delimiters(text):
    return emoji.demojize(text, delimiters=(" ", " "))

annotated_data['Preprocessed Comment'] = annotated_data['Preprocessed Comment'].apply(lambda x: demojize_with_delimiters(x) if isinstance(x, str) else x)

In [44]:
# Example of emoji normalisation

import emoji

def demojize_with_delimiters(text):
    return emoji.demojize(text, delimiters=(" ", " "))

print(annotated_data['Comment'].iloc[2677])
print(demojize_with_delimiters(annotated_data['Comment'].iloc[2677]))

❤️❤️❤️
 red_heart  red_heart  red_heart 


In [45]:
# Lowercasing

annotated_data['Preprocessed Comment'] = annotated_data['Preprocessed Comment'].apply(lambda x: x.lower() if isinstance(x, str) else x)


In [46]:
# Removing stopwords
nltk.download('stopwords')
nltk.download('punkt')

def remove_stopwords(text):
    # Ensure the input is a string
    if isinstance(text, str):
        # Tokenize the text into words
        words = nltk.word_tokenize(text)
        
        # Get the list of stopwords
        stop_words = set(stopwords.words('english'))
        
        # Remove stopwords from the tokenized words
        filtered_words = [word for word in words if word.lower() not in stop_words]
        
        # Join the filtered words back into a single string
        filtered_text = ' '.join(filtered_words)
        
        return filtered_text
    else:  
        return text

annotated_data['Preprocessed Comment'] = annotated_data['Preprocessed Comment'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Louis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Louis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [47]:
# Function to manually tokenize text including punctuations
def custom_tokenize(text):
    # Regex pattern to match words (including contractions) and separate punctuation
    tokens = re.findall(r"[\w']+|[.,!?;]", text)
    return tokens

# Normalize slangs and abbreviations
def normalize_slangs_abbreviations_custom(text, slang_dict):
    if isinstance(text, str):
        tokens = custom_tokenize(text)
        normalized_tokens = [slang_dict.get(token.lower(), token) for token in tokens]
        # Reconstruct the text
        normalized_text = ' '.join(normalized_tokens).replace(" ,", ",").replace(" .", ".").replace(" !", "!").replace(" ?", "?")
        return normalized_text
    else:
        return text

annotated_data['Preprocessed Comment'] = annotated_data['Preprocessed Comment'].apply(lambda x: normalize_slangs_abbreviations_custom(x, abbreviations))

In [48]:
annotated_data

Unnamed: 0,Brand,Search Term,Comment,Source,Metadata,Subjectivity,Polarity,Subjectivity 2,Polarity 2,Preprocessed Comment
0,Nike,waste,Designing products with sustainability in mind...,Twitter,"{'Name': 'Angla Sicurella', 'Handle': '@AnglaS...",0.0,,0.0,,"designing products sustainability mind, like n..."
1,Nike,waste,Kirby would have been a waste of time - why ev...,Twitter,"{'Name': 'LisaKingWheless', 'Handle': '@Lisapc...",1.0,0.0,1.0,1.0,kirby would waste time even ask? plus adds coa...
2,Nike,waste,I wouldn’t spend another dollar at that theate...,Twitter,"{'Name': 'Sheila McSheilerton', 'Handle': '@sh...",1.0,0.0,1.0,0.0,spend another dollar theater. like buy nike gr...
3,Nike,waste,Call them back and tell them they’re lying bec...,Twitter,"{'Name': 'UncleChrissy', 'Handle': '@uncle_chr...",1.0,0.0,1.0,1.0,call back tell lying already. trying get real ...
4,Nike,waste,I’m really sitting here going in on myself..li...,Twitter,"{'Name': 'Jade ☥', 'Handle': '@jmerarity', 'Ti...",1.0,1.0,1.0,1.0,really sitting going.. like really going let b...
...,...,...,...,...,...,...,...,...,...,...
2678,Louis Vuitton,Louis Vuitton,❤️❤️❤️,Instagram,{'Likes_and_timestamp': '0 likes on 2024-01-17...,1.0,1.0,1.0,1.0,red_heart red_heart red_heart
2679,Tory Burch,Tory Burch,The pale pink in the 6th look is EVERYTHINGGGG...,Instagram,{'Likes_and_timestamp': '0 likes on 2023-09-16...,1.0,1.0,1.0,1.0,pale pink 6th look everythinggggg. cherry_blossom
2680,Yeezy,Yeezy,He said it himself this isn't the real Kanye s...,Instagram,{'Likes_and_timestamp': '0 likes on 2024-02-27...,0.0,,0.0,,said n't real kanye care imposter saying face_...
2681,Gucci,Gucci,😍😍😍,Instagram,{'Likes_and_timestamp': '3 likes on 2023-09-23...,1.0,1.0,1.0,1.0,smiling_face_with_heart eyes smiling_face_with...


In [49]:
print(annotated_data['Comment'].iloc[20])
print(annotated_data['Preprocessed Comment'].iloc[20])

WHY is Hermes even getting involved at the Lotus casino, seems like a damn waste of time – tho I know they're probably trying to give Luke more backstory before the finale
hermes even getting involved lotus casino, seems like damn waste time though know 're probably trying give luke backstory finale


# Random Guessing

In [55]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Define the number of records
num_records = annotated_data.shape[0]

# Generate 'subjectivity' column with random values of 0 or 1\
subjectivity_values = np.random.randint(2, size=num_records)

# Generate 'polarity' column based on 'subjectivity'
polarity_values = np.where(subjectivity_values == 0, np.nan, np.random.randint(2, size=num_records))

# Create dataframe
data = {'Subjectivity': subjectivity_values, 'Polarity': polarity_values}
random_df = pd.DataFrame(data)


In [56]:
# Convert subjectivity values to integers for comparison
ground_truth_df = annotated_data.copy()

ground_truth_df['Subjectivity'] = ground_truth_df['Subjectivity'].astype(int)
random_df['Subjectivity'] = random_df['Subjectivity'].astype(int)

# Calculate metrics
accuracy = accuracy_score(ground_truth_df['Subjectivity'], random_df['Subjectivity'])
precision = precision_score(ground_truth_df['Subjectivity'], random_df['Subjectivity'])
recall = recall_score(ground_truth_df['Subjectivity'], random_df['Subjectivity'])
f1 = f1_score(ground_truth_df['Subjectivity'], random_df['Subjectivity'])

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Drop rows with null values in the 'polarity' column for both dataframes
ground_truth_df = annotated_data.dropna(subset=['Polarity'])
random_df = random_df.dropna(subset=['Polarity'])

# Merge dataframes on the index to align rows based on non-null values in the 'polarity' column
merged_df = pd.merge(ground_truth_df, random_df, left_index=True, right_index=True, suffixes=('_gt', '_random'))

# Calculate metrics using the merged dataframe
accuracy = accuracy_score(merged_df['Polarity_gt'], merged_df['Polarity_random'])
precision = precision_score(merged_df['Polarity_gt'], merged_df['Polarity_random'])
recall = recall_score(merged_df['Polarity_gt'], merged_df['Polarity_random'])
f1 = f1_score(merged_df['Polarity_gt'], merged_df['Polarity_random'])

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.5100671140939598
Precision: 0.5930408472012103
Recall: 0.5025641025641026
F1-score: 0.5440666204024983
Accuracy: 0.49165596919127086
Precision: 0.5336658354114713
Recall: 0.5059101654846335
F1-score: 0.5194174757281553


# Benchmark Models
- SVM
- Random Forest
- Logistic Regression

Features:
- word embeddings, 
- tf-idf, 
- n-gram
- combination of all three


## Subjectivity Detection

In [18]:
# FEATURE EXTRACTION

# Load pre-trained Word2Vec model
word_embeddings = gensim.downloader.load('word2vec-google-news-300')

# Extract comments and corresponding subjectivity labels
comments = annotated_data['Preprocessed Comment'].tolist()
labels = annotated_data['Subjectivity'].tolist()

# Convert text data to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Unigrams and bigrams
tfidf_features = tfidf_vectorizer.fit_transform(comments)

# Convert text data to n-gram features
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))  # Unigrams and bigrams
ngram_features = ngram_vectorizer.fit_transform(comments)

# Convert each comment to a vector representation using word embeddings
comment_vectors = []
for comment in comments:
    words = comment.split()
    vectors = [word_embeddings[word] for word in words if word in word_embeddings]
    if vectors:
        comment_vectors.append(sum(vectors) / len(vectors))  # Average of word vectors in the comment
    else:
        comment_vectors.append([0] * 300)  # Use zero vector if no word found


In [19]:
# Combine features
combined_features = hstack((tfidf_features, ngram_features, comment_vectors))

tf-idf

In [20]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, labels, test_size=0.3, random_state=42)

# Train SVM model
svm_model = SVC(kernel="linear")
svm_start_time = time.time()
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = svm_model.predict(X_test)
svm_end_time = time.time()

# Calculate accuracy for SVM
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)
print("SVM Metrics:")
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1-score:", f1_svm)
print("Time taken to train SVM model:", svm_end_time - svm_start_time)

# Train Random Forest model
rf_model = RandomForestClassifier()
rf_start_time = time.time()
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)
rf_end_time = time.time()

# Calculate accuracy for Random Forest
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
print("\nRandom Forest Metrics:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1-score:", f1_rf)
print("Time taken to train Random Forest model:", rf_end_time - rf_start_time)

# Train logistic regression model
logistic_model = LogisticRegression()
logistic_start_time = time.time()
logistic_model.fit(X_train, y_train)

# Predict on the test set
y_pred_log = logistic_model.predict(X_test)
logistic_end_time = time.time()

# Calculate accuracy
accuracy_log = accuracy_score(y_test, y_pred_log)
precision_log = precision_score(y_test, y_pred_log)
recall_log = recall_score(y_test, y_pred_log)
f1_log = f1_score(y_test, y_pred_log)
print("\nLogistic Regression Metrics:")
print("Accuracy:", accuracy_log)
print("Precision:", precision_log)
print("Recall:", recall_log)
print("F1-score:", f1_log)
print("Time taken to train Logistic Regression model:", logistic_end_time - logistic_start_time)

ada_model = AdaBoostClassifier()
ada_start_time = time.time()
ada_model.fit(X_train, y_train)

# Predict on the test set
y_pred_ada = ada_model.predict(X_test)
ada_end_time = time.time()

# Calculate metrics for AdaBoost
accuracy_ada = accuracy_score(y_test, y_pred_ada)
precision_ada = precision_score(y_test, y_pred_ada)
recall_ada = recall_score(y_test, y_pred_ada)
f1_ada = f1_score(y_test, y_pred_ada)
print("\nAdaBoost Metrics:")
print("Accuracy:", accuracy_ada)
print("Precision:", precision_ada)
print("Recall:", recall_ada)
print("F1-score:", f1_ada)
print("Time taken to train AdaBoost model:", ada_end_time - ada_start_time)

# Train XGBoost model
xgb_model = xgb.XGBClassifier()
xgb_start_time = time.time()
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)
xgb_end_time = time.time()

# Calculate metrics for XGBoost
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
print("\nXGBoost Metrics:")
print("Accuracy:", accuracy_xgb)
print("Precision:", precision_xgb)
print("Recall:", recall_xgb)
print("F1-score:", f1_xgb)
print("Time taken to train XGBoost model:", xgb_end_time - xgb_start_time)

SVM Metrics:
Accuracy: 0.6919254658385093
Precision: 0.6885245901639344
Recall: 0.8307692307692308
F1-score: 0.7529880478087649
Time taken to train SVM model: 1.8377976417541504

Random Forest Metrics:
Accuracy: 0.6633540372670808
Precision: 0.652317880794702
Recall: 0.865934065934066
F1-score: 0.7440982058545798
Time taken to train Random Forest model: 17.83667230606079

Logistic Regression Metrics:
Accuracy: 0.6434782608695652
Precision: 0.6329113924050633
Recall: 0.8791208791208791
F1-score: 0.7359705611775529
Time taken to train Logistic Regression model: 0.2886679172515869





AdaBoost Metrics:
Accuracy: 0.662111801242236
Precision: 0.6631016042780749
Recall: 0.8175824175824176
F1-score: 0.7322834645669292
Time taken to train AdaBoost model: 21.03068208694458

XGBoost Metrics:
Accuracy: 0.6931677018633541
Precision: 0.7203389830508474
Recall: 0.7472527472527473
F1-score: 0.7335490830636462
Time taken to train XGBoost model: 4.992202043533325


n-grams

In [21]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(ngram_features, labels, test_size=0.3, random_state=42)

# Train SVM model
svm_model = SVC(kernel="linear")
svm_start_time = time.time()
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = svm_model.predict(X_test)
svm_end_time = time.time()

# Calculate accuracy for SVM
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)
print("SVM Metrics:")
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1-score:", f1_svm)
print("Time taken to train SVM model:", svm_end_time - svm_start_time)

# Train Random Forest model
rf_model = RandomForestClassifier()
rf_start_time = time.time()
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)
rf_end_time = time.time()

# Calculate accuracy for Random Forest
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
print("\nRandom Forest Metrics:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1-score:", f1_rf)
print("Time taken to train Random Forest model:", rf_end_time - rf_start_time)

# Train logistic regression model
logistic_model = LogisticRegression()
logistic_start_time = time.time()
logistic_model.fit(X_train, y_train)

# Predict on the test set
y_pred_log = logistic_model.predict(X_test)
logistic_end_time = time.time()

# Calculate accuracy
accuracy_log = accuracy_score(y_test, y_pred_log)
precision_log = precision_score(y_test, y_pred_log)
recall_log = recall_score(y_test, y_pred_log)
f1_log = f1_score(y_test, y_pred_log)
print("\nLogistic Regression Metrics:")
print("Accuracy:", accuracy_log)
print("Precision:", precision_log)
print("Recall:", recall_log)
print("F1-score:", f1_log)
print("Time taken to train Logistic Regression model:", logistic_end_time - logistic_start_time)

ada_model = AdaBoostClassifier()
ada_start_time = time.time()
ada_model.fit(X_train, y_train)

# Predict on the test set
y_pred_ada = ada_model.predict(X_test)
ada_end_time = time.time()

# Calculate metrics for AdaBoost
accuracy_ada = accuracy_score(y_test, y_pred_ada)
precision_ada = precision_score(y_test, y_pred_ada)
recall_ada = recall_score(y_test, y_pred_ada)
f1_ada = f1_score(y_test, y_pred_ada)
print("\nAdaBoost Metrics:")
print("Accuracy:", accuracy_ada)
print("Precision:", precision_ada)
print("Recall:", recall_ada)
print("F1-score:", f1_ada)
print("Time taken to train AdaBoost model:", ada_end_time - ada_start_time)

# Train XGBoost model
xgb_model = xgb.XGBClassifier()
xgb_start_time = time.time()
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)
xgb_end_time = time.time()

# Calculate metrics for XGBoost
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
print("\nXGBoost Metrics:")
print("Accuracy:", accuracy_xgb)
print("Precision:", precision_xgb)
print("Recall:", recall_xgb)
print("F1-score:", f1_xgb)
print("Time taken to train XGBoost model:", xgb_end_time - xgb_start_time)

SVM Metrics:
Accuracy: 0.6745341614906832
Precision: 0.6895874263261297
Recall: 0.7714285714285715
F1-score: 0.7282157676348547
Time taken to train SVM model: 1.6357707977294922

Random Forest Metrics:
Accuracy: 0.6720496894409937
Precision: 0.6672504378283712
Recall: 0.8373626373626374
F1-score: 0.7426900584795322
Time taken to train Random Forest model: 18.76086449623108


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression Metrics:
Accuracy: 0.6708074534161491
Precision: 0.685546875
Recall: 0.7714285714285715
F1-score: 0.7259565667011375
Time taken to train Logistic Regression model: 1.5035247802734375

AdaBoost Metrics:
Accuracy: 0.6670807453416149
Precision: 0.6709323583180987
Recall: 0.8065934065934066
F1-score: 0.7325349301397206
Time taken to train AdaBoost model: 21.1769278049469

XGBoost Metrics:
Accuracy: 0.7167701863354037
Precision: 0.715370018975332
Recall: 0.8285714285714286
F1-score: 0.7678207739307535
Time taken to train XGBoost model: 3.929631471633911


word embeddings

In [22]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(comment_vectors, labels, test_size=0.3, random_state=42)

# Train SVM model
svm_model = SVC(kernel="linear")
svm_start_time = time.time()
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = svm_model.predict(X_test)
svm_end_time = time.time()

# Calculate accuracy for SVM
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)
print("SVM Metrics:")
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1-score:", f1_svm)
print("Time taken to train SVM model:", svm_end_time - svm_start_time)

# Train Random Forest model
rf_model = RandomForestClassifier()
rf_start_time = time.time()
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)
rf_end_time = time.time()

# Calculate accuracy for Random Forest
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
print("\nRandom Forest Metrics:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1-score:", f1_rf)
print("Time taken to train Random Forest model:", rf_end_time - rf_start_time)

# Train logistic regression model
logistic_model = LogisticRegression()
logistic_start_time = time.time()
logistic_model.fit(X_train, y_train)

# Predict on the test set
y_pred_log = logistic_model.predict(X_test)
logistic_end_time = time.time()

# Calculate accuracy
accuracy_log = accuracy_score(y_test, y_pred_log)
precision_log = precision_score(y_test, y_pred_log)
recall_log = recall_score(y_test, y_pred_log)
f1_log = f1_score(y_test, y_pred_log)
print("\nLogistic Regression Metrics:")
print("Accuracy:", accuracy_log)
print("Precision:", precision_log)
print("Recall:", recall_log)
print("F1-score:", f1_log)
print("Time taken to train Logistic Regression model:", logistic_end_time - logistic_start_time)

ada_model = AdaBoostClassifier()
ada_start_time = time.time()
ada_model.fit(X_train, y_train)

# Predict on the test set
y_pred_ada = ada_model.predict(X_test)
ada_end_time = time.time()

# Calculate metrics for AdaBoost
accuracy_ada = accuracy_score(y_test, y_pred_ada)
precision_ada = precision_score(y_test, y_pred_ada)
recall_ada = recall_score(y_test, y_pred_ada)
f1_ada = f1_score(y_test, y_pred_ada)
print("\nAdaBoost Metrics:")
print("Accuracy:", accuracy_ada)
print("Precision:", precision_ada)
print("Recall:", recall_ada)
print("F1-score:", f1_ada)
print("Time taken to train AdaBoost model:", ada_end_time - ada_start_time)

# Train XGBoost model
xgb_model = xgb.XGBClassifier()
xgb_start_time = time.time()
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)
xgb_end_time = time.time()

# Calculate metrics for XGBoost
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
print("\nXGBoost Metrics:")
print("Accuracy:", accuracy_xgb)
print("Precision:", precision_xgb)
print("Recall:", recall_xgb)
print("F1-score:", f1_xgb)
print("Time taken to train XGBoost model:", xgb_end_time - xgb_start_time)

SVM Metrics:
Accuracy: 0.7006211180124223
Precision: 0.7106299212598425
Recall: 0.7934065934065934
F1-score: 0.7497403946002077
Time taken to train SVM model: 0.2684347629547119

Random Forest Metrics:
Accuracy: 0.6795031055900621
Precision: 0.6862003780718336
Recall: 0.7978021978021979
F1-score: 0.7378048780487805
Time taken to train Random Forest model: 1.7223725318908691

Logistic Regression Metrics:
Accuracy: 0.7118012422360248
Precision: 0.7173489278752436
Recall: 0.8087912087912088
F1-score: 0.7603305785123967
Time taken to train Logistic Regression model: 0.015833139419555664





AdaBoost Metrics:
Accuracy: 0.6683229813664596
Precision: 0.6902834008097166
Recall: 0.7494505494505495
F1-score: 0.7186512118018967
Time taken to train AdaBoost model: 3.066194534301758

XGBoost Metrics:
Accuracy: 0.6881987577639752
Precision: 0.7073170731707317
Recall: 0.7648351648351648
F1-score: 0.7349524815205913
Time taken to train XGBoost model: 1.2156713008880615


combined features

In [23]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, labels, test_size=0.3, random_state=42)

# Train SVM model
svm_model = SVC(kernel="linear")
svm_start_time = time.time()
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = svm_model.predict(X_test)
svm_end_time = time.time()

# Calculate accuracy for SVM
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)
print("SVM Metrics:")
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1-score:", f1_svm)
print("Time taken to train SVM model:", svm_end_time - svm_start_time)

# Train Random Forest model
rf_model = RandomForestClassifier()
rf_start_time = time.time()
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)
rf_end_time = time.time()

# Calculate accuracy for Random Forest
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
print("\nRandom Forest Metrics:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1-score:", f1_rf)
print("Time taken to train Random Forest model:", rf_end_time - rf_start_time)

# Train logistic regression model
logistic_model = LogisticRegression()
logistic_start_time = time.time()
logistic_model.fit(X_train, y_train)

# Predict on the test set
y_pred_log = logistic_model.predict(X_test)
logistic_end_time = time.time()

# Calculate accuracy
accuracy_log = accuracy_score(y_test, y_pred_log)
precision_log = precision_score(y_test, y_pred_log)
recall_log = recall_score(y_test, y_pred_log)
f1_log = f1_score(y_test, y_pred_log)
print("\nLogistic Regression Metrics:")
print("Accuracy:", accuracy_log)
print("Precision:", precision_log)
print("Recall:", recall_log)
print("F1-score:", f1_log)
print("Time taken to train Logistic Regression model:", logistic_end_time - logistic_start_time)

ada_model = AdaBoostClassifier()
ada_start_time = time.time()
ada_model.fit(X_train, y_train)

# Predict on the test set
y_pred_ada = ada_model.predict(X_test)
ada_end_time = time.time()

# Calculate metrics for AdaBoost
accuracy_ada = accuracy_score(y_test, y_pred_ada)
precision_ada = precision_score(y_test, y_pred_ada)
recall_ada = recall_score(y_test, y_pred_ada)
f1_ada = f1_score(y_test, y_pred_ada)
print("\nAdaBoost Metrics:")
print("Accuracy:", accuracy_ada)
print("Precision:", precision_ada)
print("Recall:", recall_ada)
print("F1-score:", f1_ada)
print("Time taken to train AdaBoost model:", ada_end_time - ada_start_time)

# Train XGBoost model
xgb_model = xgb.XGBClassifier()
xgb_start_time = time.time()
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)
xgb_end_time = time.time()

# Calculate metrics for XGBoost
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
print("\nXGBoost Metrics:")
print("Accuracy:", accuracy_xgb)
print("Precision:", precision_xgb)
print("Recall:", recall_xgb)
print("F1-score:", f1_xgb)
print("Time taken to train XGBoost model:", xgb_end_time - xgb_start_time)

SVM Metrics:
Accuracy: 0.7055900621118012
Precision: 0.7280334728033473
Recall: 0.7648351648351648
F1-score: 0.7459807073954984
Time taken to train SVM model: 5.089654207229614

Random Forest Metrics:
Accuracy: 0.6745341614906832
Precision: 0.6678260869565218
Recall: 0.843956043956044
F1-score: 0.7456310679611651
Time taken to train Random Forest model: 15.873722076416016


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression Metrics:
Accuracy: 0.7006211180124223
Precision: 0.7131474103585658
Recall: 0.7868131868131868
F1-score: 0.7481713688610241
Time taken to train Logistic Regression model: 2.9313454627990723

AdaBoost Metrics:
Accuracy: 0.6956521739130435
Precision: 0.7272727272727273
Recall: 0.7384615384615385
F1-score: 0.732824427480916
Time taken to train AdaBoost model: 45.2873694896698

XGBoost Metrics:
Accuracy: 0.7118012422360248
Precision: 0.7337526205450734
Recall: 0.7692307692307693
F1-score: 0.7510729613733905
Time taken to train XGBoost model: 14.095094919204712


Word embeddings consistently yield the best performance across all three models in terms of both accuracy and F1-score. This suggests that word embeddings capture the semantic meaning of words effectively, which is crucial for subjectivity detection in text data. They encode contextual information and relationships between words, potentially improving the model's ability to discern subjective content from objective content in the comments. Therefore, word embeddings seem to be the most effective feature extraction method for this task.

## Polarity Detection


In [24]:
annotated_polarity_data = annotated_data[annotated_data['Subjectivity']==1]
annotated_polarity_data.head()

Unnamed: 0,Brand,Search Term,Comment,Source,Metadata,Subjectivity,Polarity,Subjectivity 2,Polarity 2,Preprocessed Comment
1,Nike,waste,Kirby would have been a waste of time - why ev...,Twitter,"{'Name': 'LisaKingWheless', 'Handle': '@Lisapc...",1.0,0.0,1,1,kirby would waste time even ask? plus adds coa...
2,Nike,waste,I wouldn’t spend another dollar at that theate...,Twitter,"{'Name': 'Sheila McSheilerton', 'Handle': '@sh...",1.0,0.0,1,1,spend another dollar theater. like buy nike gr...
3,Nike,waste,Call them back and tell them they’re lying bec...,Twitter,"{'Name': 'UncleChrissy', 'Handle': '@uncle_chr...",1.0,0.0,1,1,call back tell lying already. trying get real ...
4,Nike,waste,I’m really sitting here going in on myself..li...,Twitter,"{'Name': 'Jade ☥', 'Handle': '@jmerarity', 'Ti...",1.0,1.0,0,0,really sitting going.. like really going let b...
5,Nike,waste,very disappointed in my new Nike Blazer Vint...,Twitter,"{'Name': 'Sailguy', 'Handle': '@NhSailguy', 'T...",1.0,0.0,0,0,disappointed new nike blazer vintage shoes. le...


In [25]:
# FEATURE EXTRACTION

# Load pre-trained Word2Vec model
word_embeddings = gensim.downloader.load('word2vec-google-news-300')

# Extract comments and corresponding subjectivity labels
comments = annotated_polarity_data['Preprocessed Comment'].tolist()
labels = annotated_polarity_data['Polarity'].tolist()

# Convert text data to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Unigrams and bigrams
tfidf_features = tfidf_vectorizer.fit_transform(comments)

# Convert text data to n-gram features
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))  # Unigrams and bigrams
ngram_features = ngram_vectorizer.fit_transform(comments)

# Convert each comment to a vector representation using word embeddings
comment_vectors = []
for comment in comments:
    words = comment.split()
    vectors = [word_embeddings[word] for word in words if word in word_embeddings]
    if vectors:
        comment_vectors.append(sum(vectors) / len(vectors))  # Average of word vectors in the comment
    else:
        comment_vectors.append([0] * 300)  # Use zero vector if no word found

# Combine features
combined_features = hstack((tfidf_features, ngram_features, comment_vectors))


In [26]:
with open("models/tfidf_vectorizer.pkl", 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

with open("models/ngram_vectorizer.pkl", 'wb') as file:
    pickle.dump(ngram_vectorizer, file)


tf-idf

In [27]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, labels, test_size=0.3, random_state=42)

# Train SVM model
svm_model = SVC(kernel="linear")
svm_start_time = time.time()
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = svm_model.predict(X_test)
svm_end_time = time.time()

# Calculate accuracy for SVM
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)
print("SVM Metrics:")
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1-score:", f1_svm)
print("Time taken to train SVM model:", svm_end_time - svm_start_time)

# Train Random Forest model
rf_model = RandomForestClassifier()
rf_start_time = time.time()
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)
rf_end_time = time.time()

# Calculate accuracy for Random Forest
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
print("\nRandom Forest Metrics:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1-score:", f1_rf)
print("Time taken to train Random Forest model:", rf_end_time - rf_start_time)

# Train logistic regression model
logistic_model = LogisticRegression()
logistic_start_time = time.time()
logistic_model.fit(X_train, y_train)

# Predict on the test set
y_pred_log = logistic_model.predict(X_test)
logistic_end_time = time.time()

# Calculate accuracy
accuracy_log = accuracy_score(y_test, y_pred_log)
precision_log = precision_score(y_test, y_pred_log)
recall_log = recall_score(y_test, y_pred_log)
f1_log = f1_score(y_test, y_pred_log)
print("\nLogistic Regression Metrics:")
print("Accuracy:", accuracy_log)
print("Precision:", precision_log)
print("Recall:", recall_log)
print("F1-score:", f1_log)
print("Time taken to train Logistic Regression model:", logistic_end_time - logistic_start_time)

ada_model = AdaBoostClassifier()
ada_start_time = time.time()
ada_model.fit(X_train, y_train)

# Predict on the test set
y_pred_ada = ada_model.predict(X_test)
ada_end_time = time.time()

# Calculate metrics for AdaBoost
accuracy_ada = accuracy_score(y_test, y_pred_ada)
precision_ada = precision_score(y_test, y_pred_ada)
recall_ada = recall_score(y_test, y_pred_ada)
f1_ada = f1_score(y_test, y_pred_ada)
print("\nAdaBoost Metrics:")
print("Accuracy:", accuracy_ada)
print("Precision:", precision_ada)
print("Recall:", recall_ada)
print("F1-score:", f1_ada)
print("Time taken to train AdaBoost model:", ada_end_time - ada_start_time)

# Train XGBoost model
xgb_model = xgb.XGBClassifier()
xgb_start_time = time.time()
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)
xgb_end_time = time.time()

# Calculate metrics for XGBoost
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
print("\nXGBoost Metrics:")
print("Accuracy:", accuracy_xgb)
print("Precision:", precision_xgb)
print("Recall:", recall_xgb)
print("F1-score:", f1_xgb)
print("Time taken to train XGBoost model:", xgb_end_time - xgb_start_time)

SVM Metrics:
Accuracy: 0.7393162393162394
Precision: 0.7359154929577465
Recall: 0.81640625
F1-score: 0.774074074074074
Time taken to train SVM model: 0.5085470676422119

Random Forest Metrics:
Accuracy: 0.688034188034188
Precision: 0.6617647058823529
Recall: 0.87890625
F1-score: 0.7550335570469798
Time taken to train Random Forest model: 6.856877326965332

Logistic Regression Metrics:
Accuracy: 0.6965811965811965
Precision: 0.6676470588235294
Recall: 0.88671875
F1-score: 0.761744966442953
Time taken to train Logistic Regression model: 0.07468438148498535





AdaBoost Metrics:
Accuracy: 0.6623931623931624
Precision: 0.6611842105263158
Recall: 0.78515625
F1-score: 0.7178571428571429
Time taken to train AdaBoost model: 8.118350982666016

XGBoost Metrics:
Accuracy: 0.7222222222222222
Precision: 0.723404255319149
Recall: 0.796875
F1-score: 0.758364312267658
Time taken to train XGBoost model: 2.1200809478759766


n-gram

In [28]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(ngram_features, labels, test_size=0.3, random_state=42)

# Train SVM model
svm_model = SVC(kernel="linear")
svm_start_time = time.time()
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = svm_model.predict(X_test)
svm_end_time = time.time()

# Calculate accuracy for SVM
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)
print("SVM Metrics:")
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1-score:", f1_svm)
print("Time taken to train SVM model:", svm_end_time - svm_start_time)

# Train Random Forest model
rf_model = RandomForestClassifier()
rf_start_time = time.time()
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)
rf_end_time = time.time()

# Calculate accuracy for Random Forest
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
print("\nRandom Forest Metrics:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1-score:", f1_rf)
print("Time taken to train Random Forest model:", rf_end_time - rf_start_time)

# Train logistic regression model
logistic_model = LogisticRegression()
logistic_start_time = time.time()
logistic_model.fit(X_train, y_train)

# Predict on the test set
y_pred_log = logistic_model.predict(X_test)
logistic_end_time = time.time()

# Calculate accuracy
accuracy_log = accuracy_score(y_test, y_pred_log)
precision_log = precision_score(y_test, y_pred_log)
recall_log = recall_score(y_test, y_pred_log)
f1_log = f1_score(y_test, y_pred_log)
print("\nLogistic Regression Metrics:")
print("Accuracy:", accuracy_log)
print("Precision:", precision_log)
print("Recall:", recall_log)
print("F1-score:", f1_log)
print("Time taken to train Logistic Regression model:", logistic_end_time - logistic_start_time)

ada_model = AdaBoostClassifier()
ada_start_time = time.time()
ada_model.fit(X_train, y_train)

# Predict on the test set
y_pred_ada = ada_model.predict(X_test)
ada_end_time = time.time()

# Calculate metrics for AdaBoost
accuracy_ada = accuracy_score(y_test, y_pred_ada)
precision_ada = precision_score(y_test, y_pred_ada)
recall_ada = recall_score(y_test, y_pred_ada)
f1_ada = f1_score(y_test, y_pred_ada)
print("\nAdaBoost Metrics:")
print("Accuracy:", accuracy_ada)
print("Precision:", precision_ada)
print("Recall:", recall_ada)
print("F1-score:", f1_ada)
print("Time taken to train AdaBoost model:", ada_end_time - ada_start_time)

# Train XGBoost model
xgb_model = xgb.XGBClassifier()
xgb_start_time = time.time()
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)
xgb_end_time = time.time()

# Calculate metrics for XGBoost
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
print("\nXGBoost Metrics:")
print("Accuracy:", accuracy_xgb)
print("Precision:", precision_xgb)
print("Recall:", recall_xgb)
print("F1-score:", f1_xgb)
print("Time taken to train XGBoost model:", xgb_end_time - xgb_start_time)

SVM Metrics:
Accuracy: 0.7008547008547008
Precision: 0.7116788321167883
Recall: 0.76171875
F1-score: 0.7358490566037735
Time taken to train SVM model: 0.49961090087890625

Random Forest Metrics:
Accuracy: 0.6837606837606838
Precision: 0.6708860759493671
Recall: 0.828125
F1-score: 0.7412587412587412
Time taken to train Random Forest model: 7.304869174957275


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression Metrics:
Accuracy: 0.7264957264957265
Precision: 0.7424242424242424
Recall: 0.765625
F1-score: 0.7538461538461538
Time taken to train Logistic Regression model: 0.7456927299499512

AdaBoost Metrics:
Accuracy: 0.6752136752136753
Precision: 0.6710526315789473
Recall: 0.796875
F1-score: 0.7285714285714285
Time taken to train AdaBoost model: 8.337119817733765

XGBoost Metrics:
Accuracy: 0.7029914529914529
Precision: 0.7024221453287197
Recall: 0.79296875
F1-score: 0.744954128440367
Time taken to train XGBoost model: 1.8524141311645508


word embeddings

In [29]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(comment_vectors, labels, test_size=0.3, random_state=42)

# Train SVM model
svm_model = SVC(kernel="linear")
svm_start_time = time.time()
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = svm_model.predict(X_test)
svm_end_time = time.time()

# Calculate accuracy for SVM
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)
print("SVM Metrics:")
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1-score:", f1_svm)
print("Time taken to train SVM model:", svm_end_time - svm_start_time)

# Train Random Forest model
rf_model = RandomForestClassifier()
rf_start_time = time.time()
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)
rf_end_time = time.time()

# Calculate accuracy for Random Forest
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
print("\nRandom Forest Metrics:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1-score:", f1_rf)
print("Time taken to train Random Forest model:", rf_end_time - rf_start_time)

# Train logistic regression model
logistic_model = LogisticRegression()
logistic_start_time = time.time()
logistic_model.fit(X_train, y_train)

# Predict on the test set
y_pred_log = logistic_model.predict(X_test)
logistic_end_time = time.time()

# Calculate accuracy
accuracy_log = accuracy_score(y_test, y_pred_log)
precision_log = precision_score(y_test, y_pred_log)
recall_log = recall_score(y_test, y_pred_log)
f1_log = f1_score(y_test, y_pred_log)
print("\nLogistic Regression Metrics:")
print("Accuracy:", accuracy_log)
print("Precision:", precision_log)
print("Recall:", recall_log)
print("F1-score:", f1_log)
print("Time taken to train Logistic Regression model:", logistic_end_time - logistic_start_time)

ada_model = AdaBoostClassifier()
ada_start_time = time.time()
ada_model.fit(X_train, y_train)

# Predict on the test set
y_pred_ada = ada_model.predict(X_test)
ada_end_time = time.time()

# Calculate metrics for AdaBoost
accuracy_ada = accuracy_score(y_test, y_pred_ada)
precision_ada = precision_score(y_test, y_pred_ada)
recall_ada = recall_score(y_test, y_pred_ada)
f1_ada = f1_score(y_test, y_pred_ada)
print("\nAdaBoost Metrics:")
print("Accuracy:", accuracy_ada)
print("Precision:", precision_ada)
print("Recall:", recall_ada)
print("F1-score:", f1_ada)
print("Time taken to train AdaBoost model:", ada_end_time - ada_start_time)

# Train XGBoost model
xgb_model = xgb.XGBClassifier()
xgb_start_time = time.time()
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)
xgb_end_time = time.time()

# Calculate metrics for XGBoost
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
print("\nXGBoost Metrics:")
print("Accuracy:", accuracy_xgb)
print("Precision:", precision_xgb)
print("Recall:", recall_xgb)
print("F1-score:", f1_xgb)
print("Time taken to train XGBoost model:", xgb_end_time - xgb_start_time)

SVM Metrics:
Accuracy: 0.7029914529914529
Precision: 0.7224334600760456
Recall: 0.7421875
F1-score: 0.7321772639691715
Time taken to train SVM model: 0.11271905899047852

Random Forest Metrics:
Accuracy: 0.7115384615384616
Precision: 0.712280701754386
Recall: 0.79296875
F1-score: 0.7504621072088724
Time taken to train Random Forest model: 1.000581979751587

Logistic Regression Metrics:
Accuracy: 0.7115384615384616
Precision: 0.7168458781362007
Recall: 0.78125
F1-score: 0.7476635514018691
Time taken to train Logistic Regression model: 0.008519411087036133





AdaBoost Metrics:
Accuracy: 0.6474358974358975
Precision: 0.6704119850187266
Recall: 0.69921875
F1-score: 0.6845124282982792
Time taken to train AdaBoost model: 1.7162683010101318

XGBoost Metrics:
Accuracy: 0.7136752136752137
Precision: 0.7420634920634921
Recall: 0.73046875
F1-score: 0.7362204724409449
Time taken to train XGBoost model: 0.8052747249603271


combined features

In [30]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, labels, test_size=0.3, random_state=42)

# Train SVM model
svm_model = SVC(kernel="linear")
svm_start_time = time.time()
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = svm_model.predict(X_test)
svm_end_time = time.time()

# Calculate accuracy for SVM
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)
print("SVM Metrics:")
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1-score:", f1_svm)
print("Time taken to train SVM model:", svm_end_time - svm_start_time)
# Train Random Forest model
rf_model = RandomForestClassifier()
rf_start_time = time.time()
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)
rf_end_time = time.time()

# Calculate accuracy for Random Forest
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
print("\nRandom Forest Metrics:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1-score:", f1_rf)
print("Time taken to train Random Forest model:", rf_end_time - rf_start_time)

# Train logistic regression model
logistic_model = LogisticRegression()
logistic_start_time = time.time()
logistic_model.fit(X_train, y_train)

# Predict on the test set
y_pred_log = logistic_model.predict(X_test)
logistic_end_time = time.time()

# Calculate accuracy
accuracy_log = accuracy_score(y_test, y_pred_log)
precision_log = precision_score(y_test, y_pred_log)
recall_log = recall_score(y_test, y_pred_log)
f1_log = f1_score(y_test, y_pred_log)
print("\nLogistic Regression Metrics:")
print("Accuracy:", accuracy_log)
print("Precision:", precision_log)
print("Recall:", recall_log)
print("F1-score:", f1_log)
print("Time taken to train Logistic Regression model:", logistic_end_time - logistic_start_time)

with open("models/logistic_regression_polarity.pkl", 'wb') as file:
    pickle.dump(logistic_model, file)

ada_model = AdaBoostClassifier()
ada_start_time = time.time()
ada_model.fit(X_train, y_train)

# Predict on the test set
y_pred_ada = ada_model.predict(X_test)
ada_end_time = time.time()

# Calculate metrics for AdaBoost
accuracy_ada = accuracy_score(y_test, y_pred_ada)
precision_ada = precision_score(y_test, y_pred_ada)
recall_ada = recall_score(y_test, y_pred_ada)
f1_ada = f1_score(y_test, y_pred_ada)
print("\nAdaBoost Metrics:")
print("Accuracy:", accuracy_ada)
print("Precision:", precision_ada)
print("Recall:", recall_ada)
print("F1-score:", f1_ada)
print("Time taken to train AdaBoost model:", ada_end_time - ada_start_time)

# Train XGBoost model
xgb_model = xgb.XGBClassifier()
xgb_start_time = time.time()
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)
xgb_end_time = time.time()

# Calculate metrics for XGBoost
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
print("\nXGBoost Metrics:")
print("Accuracy:", accuracy_xgb)
print("Precision:", precision_xgb)
print("Recall:", recall_xgb)
print("F1-score:", f1_xgb)
print("Time taken to train XGBoost model:", xgb_end_time - xgb_start_time)

SVM Metrics:
Accuracy: 0.7264957264957265
Precision: 0.7318840579710145
Recall: 0.7890625
F1-score: 0.7593984962406015
Time taken to train SVM model: 1.4758880138397217

Random Forest Metrics:
Accuracy: 0.7136752136752137
Precision: 0.7163120567375887
Recall: 0.7890625
F1-score: 0.7509293680297398
Time taken to train Random Forest model: 8.064415454864502


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression Metrics:
Accuracy: 0.7414529914529915
Precision: 0.7509293680297398
Recall: 0.7890625
F1-score: 0.7695238095238095
Time taken to train Logistic Regression model: 2.0073280334472656

AdaBoost Metrics:
Accuracy: 0.6944444444444444
Precision: 0.714828897338403
Recall: 0.734375
F1-score: 0.7244701348747592
Time taken to train AdaBoost model: 19.385746240615845

XGBoost Metrics:
Accuracy: 0.7200854700854701
Precision: 0.7470355731225297
Recall: 0.73828125
F1-score: 0.7426326129666012
Time taken to train XGBoost model: 7.483121633529663


Conclusion: 
- For subjectivity, XGBoost with n-grams performs best (f1-score of 0.768)
- For polarity, logistic regression with combined features performs best (f1-score of 0.769)

Comment:
- Models above are to be compared with other models investigated in other files.
