In [None]:
import pandas as pd
import numpy as np

# Assuming you have the file locally, replace 'path_to_your_file' with the actual file path
file_path = '/content/dataset_2000_Final.csv'
dataset = pd.read_csv(file_path)

# Display the first few rows to understand its structure
print(dataset.head())


  SERIAL NUMBER                                               BODY  \
0            S2  I did - it's not bad! (Prefer Moon X to it (I ...   
1            S4  the DLRC would be an acceptable replacement. I...   
2           S74  From my Total Wine seasonal job this past year...   
3           S83  Its getting better since 2018ish. Loads of win...   
4           S93  That’s so awesome!!! 👏🏽 I absolutely love that...   

            LABEL  
0  Sustainability  
1  Sustainability  
2  Sustainability  
3  Sustainability  
4  Sustainability  


In [None]:
# Shuffle the dataset
from sklearn.utils import shuffle
# Shuffle the dataset with a random seed for reproducibility
dataset = shuffle(dataset, random_state=42)

# Display the first few rows of the shuffled dataset to verify
print(dataset.head())

     SERIAL NUMBER                                               BODY  \
1860          B361           Nice taste. Rich. Little taste of cherry   
353          S4171  The main bit of this that confuses me is sayin...   
1333          R334  Wow your comment has rang true to me more than...   
905         P16811  I'm in the UK, we either have 26 or 29mm caps....   
1289          R290  Thank you, I really don't understand why all t...   

               LABEL  
1860     Brand Image  
353   Sustainability  
1333            Null  
905        Packaging  
1289            Null  


In [None]:
import re
import string
import inflect
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Ensure you have downloaded necessary NLTK data
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stop words, lemmatizer, and number-to-word converter
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
p = inflect.engine()

# Function to convert numbers to words, including handling decimals
def convert_number_to_words(text):
    def replace(match):
        number = match.group()
        if '.' in number:
            return p.number_to_words(float(number), decimal='point')
        else:
            return p.number_to_words(int(number))
    return re.sub(r'\d+(\.\d+)?', replace, text)

# Function to clean text
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Convert numbers to words
    text = convert_number_to_words(text)
    # Remove punctuation and non-alphabetic characters, including backslashes and newlines
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize
    words = word_tokenize(text)
    # Remove stop words and lemmatize
    cleaned_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    # Join the cleaned words back into a single string
    cleaned_text = ' '.join(cleaned_words)
    return cleaned_text

# Apply the cleaning function to the 'Body' column
dataset['Cleaned_Body'] = dataset['BODY'].apply(clean_text)

# Display the first few rows of the cleaned dataset to verify
print(dataset.head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


     SERIAL NUMBER                                               BODY  \
1860          B361           Nice taste. Rich. Little taste of cherry   
353          S4171  The main bit of this that confuses me is sayin...   
1333          R334  Wow your comment has rang true to me more than...   
905         P16811  I'm in the UK, we either have 26 or 29mm caps....   
1289          R290  Thank you, I really don't understand why all t...   

               LABEL                                       Cleaned_Body  
1860     Brand Image                nice taste rich little taste cherry  
353   Sustainability  main bit confuses saying people pick funky aro...  
1333            Null  wow comment rang true comment read reddit im p...  
905        Packaging  im uk either twentysix twentyninemm cap twenty...  
1289            Null  thank really dont understand people especially...  


In [None]:
from sklearn.model_selection import train_test_split

# Define the features (X) and the target (y)
X = dataset['Cleaned_Body']
y = dataset['LABEL']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display the size of each set to verify
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")


Training set size: 1600
Testing set size: 400


### BERT EMBEDDINGS WITH ALL MODELS

GENERATING BERT EMBEDDINGS BELOW:

In [None]:
import torch
from transformers import BertTokenizer, BertModel

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Function to extract BERT embeddings
def get_bert_embeddings(texts):
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return embeddings

# Extract embeddings for training and testing data
X_train_embeddings = get_bert_embeddings(X_train)
X_test_embeddings = get_bert_embeddings(X_test)

# Convert lists to numpy arrays
X_train_embeddings = np.array(X_train_embeddings)
X_test_embeddings = np.array(X_test_embeddings)

# Display the shape of the embeddings to verify
print(f"Training data embeddings shape: {X_train_embeddings.shape}")
print(f"Testing data embeddings shape: {X_test_embeddings.shape}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Training data embeddings shape: (1600, 768)
Testing data embeddings shape: (400, 768)


SVM ON BERT EMBEDDINGS

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np

# Initialize the SVM model
svm_model = SVC(kernel='linear', C=1, random_state=42)

# Define the K-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define scoring metrics using built-in scorer names
scoring_metrics = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']

# Perform cross-validation
cv_results_svm = {}
for metric in scoring_metrics:
    cv_results_svm[metric] = cross_val_score(svm_model, X_train_embeddings, y_train, cv=kfold, scoring=metric)

# Calculate mean and standard deviation for each metric
print("SVM Cross-Validation Results:")
for metric in cv_results_svm:
    mean_score = np.mean(cv_results_svm[metric])
    std_score = np.std(cv_results_svm[metric])
    print(f"Cross-Validation {metric.capitalize()}: Mean = {mean_score:.4f}, Std = {std_score:.4f}")

# Train the model on the full training data
svm_model.fit(X_train_embeddings, y_train)

# Predict the labels for the testing set
y_pred_svm = svm_model.predict(X_test_embeddings)

# Evaluate the model's performance on the test set
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, average='weighted', zero_division=0)
recall_svm = recall_score(y_test, y_pred_svm, average='weighted', zero_division=0)
f1_svm = f1_score(y_test, y_pred_svm, average='weighted', zero_division=0)
report_svm = classification_report(y_test, y_pred_svm, zero_division=0)

# Display the evaluation metrics for the test set
print("\nSVM Results on Test Set with BERT Embeddings:")
print(f"Accuracy: {accuracy_svm:.4f}")
print(f"Precision: {precision_svm:.4f}")
print(f"Recall: {recall_svm:.4f}")
print(f"F1 Score: {f1_svm:.4f}")
print(f"Classification Report:\n{report_svm}")


SVM Cross-Validation Results:
Cross-Validation Accuracy: Mean = 0.8450, Std = 0.0127
Cross-Validation Precision_weighted: Mean = 0.8453, Std = 0.0121
Cross-Validation Recall_weighted: Mean = 0.8450, Std = 0.0127
Cross-Validation F1_weighted: Mean = 0.8441, Std = 0.0126

SVM Results on Test Set with BERT Embeddings:
Accuracy: 0.8275
Precision: 0.8294
Recall: 0.8275
F1 Score: 0.8272
Classification Report:
                precision    recall  f1-score   support

   Brand Image       0.93      0.95      0.94       100
          Null       0.75      0.83      0.79       100
     Packaging       0.80      0.78      0.79       100
Sustainability       0.84      0.75      0.79       100

      accuracy                           0.83       400
     macro avg       0.83      0.83      0.83       400
  weighted avg       0.83      0.83      0.83       400



### BERT WITH LR AND K-FOLD

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np

# Initialize the logistic regression model
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)

# Define the K-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define scoring metrics using built-in scorer names for weighted averages
scoring_metrics = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']

# Perform cross-validation
cv_results_log_reg = {}
for metric in scoring_metrics:
    cv_results_log_reg[metric] = cross_val_score(log_reg_model, X_train_embeddings, y_train, cv=kfold, scoring=metric)

# Calculate mean and standard deviation for each metric
print("Logistic Regression Cross-Validation Results:")
for metric in cv_results_log_reg:
    mean_score = np.mean(cv_results_log_reg[metric])
    std_score = np.std(cv_results_log_reg[metric])
    print(f"Cross-Validation {metric.capitalize()}: Mean = {mean_score:.4f}, Std = {std_score:.4f}")

# Train the model on the full training data
log_reg_model.fit(X_train_embeddings, y_train)

# Predict the labels for the testing set
y_pred_log_reg = log_reg_model.predict(X_test_embeddings)

# Evaluate the model's performance on the test set
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
precision_log_reg = precision_score(y_test, y_pred_log_reg, average='weighted', zero_division=0)
recall_log_reg = recall_score(y_test, y_pred_log_reg, average='weighted', zero_division=0)
f1_log_reg = f1_score(y_test, y_pred_log_reg, average='weighted', zero_division=0)
report_log_reg = classification_report(y_test, y_pred_log_reg, zero_division=0)

# Display the evaluation metrics for the test set
print("\nLogistic Regression Results on Test Set with BERT Embeddings:")
print(f"Accuracy: {accuracy_log_reg:.4f}")
print(f"Precision: {precision_log_reg:.4f}")
print(f"Recall: {recall_log_reg:.4f}")
print(f"F1 Score: {f1_log_reg:.4f}")
print(f"Classification Report:\n{report_log_reg}")


Logistic Regression Cross-Validation Results:
Cross-Validation Accuracy: Mean = 0.8556, Std = 0.0102
Cross-Validation Precision_weighted: Mean = 0.8567, Std = 0.0107
Cross-Validation Recall_weighted: Mean = 0.8556, Std = 0.0102
Cross-Validation F1_weighted: Mean = 0.8549, Std = 0.0098

Logistic Regression Results on Test Set with BERT Embeddings:
Accuracy: 0.8625
Precision: 0.8627
Recall: 0.8625
F1 Score: 0.8624
Classification Report:
                precision    recall  f1-score   support

   Brand Image       0.95      0.96      0.96       100
          Null       0.84      0.84      0.84       100
     Packaging       0.81      0.84      0.82       100
Sustainability       0.85      0.81      0.83       100

      accuracy                           0.86       400
     macro avg       0.86      0.86      0.86       400
  weighted avg       0.86      0.86      0.86       400



### BERT WITH RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
import numpy as np

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

# Define scoring metrics for multi-class classification
scoring_metrics = {
    'accuracy': make_scorer(accuracy_score),
    'precision_weighted': make_scorer(precision_score, average='weighted'),
    'recall_weighted': make_scorer(recall_score, average='weighted'),
    'f1_weighted': make_scorer(f1_score, average='weighted')
}

# Perform K-fold cross-validation
cv_results = cross_validate(
    rf_model,
    X_train_embeddings,
    y_train,
    cv=5,
    scoring=scoring_metrics,
    return_train_score=False
)

# Calculate mean and standard deviation for each metric
accuracy_mean = np.mean(cv_results['test_accuracy'])
accuracy_std = np.std(cv_results['test_accuracy'])
precision_mean = np.mean(cv_results['test_precision_weighted'])
precision_std = np.std(cv_results['test_precision_weighted'])
recall_mean = np.mean(cv_results['test_recall_weighted'])
recall_std = np.std(cv_results['test_recall_weighted'])
f1_mean = np.mean(cv_results['test_f1_weighted'])
f1_std = np.std(cv_results['test_f1_weighted'])

# Display the cross-validation results
print(f"Cross-Validation Accuracy: Mean = {accuracy_mean:.4f}, Std = {accuracy_std:.4f}")
print(f"Cross-Validation Precision: Mean = {precision_mean:.4f}, Std = {precision_std:.4f}")
print(f"Cross-Validation Recall: Mean = {recall_mean:.4f}, Std = {recall_std:.4f}")
print(f"Cross-Validation F1: Mean = {f1_mean:.4f}, Std = {f1_std:.4f}")

# Fit the Random Forest model on the entire training data
rf_model.fit(X_train_embeddings, y_train)

# Predict the labels for the testing set
y_pred = rf_model.predict(X_test_embeddings)

# Evaluate the model's performance on the test set
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
report = classification_report(y_test, y_pred)

# Display the evaluation metrics for the test set
print("\nRandom Forest Results on Test Set with BERT Embeddings:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Classification Report:\n{report}")


Cross-Validation Accuracy: Mean = 0.7931, Std = 0.0174
Cross-Validation Precision: Mean = 0.7934, Std = 0.0191
Cross-Validation Recall: Mean = 0.7931, Std = 0.0174
Cross-Validation F1: Mean = 0.7922, Std = 0.0178

Random Forest Results on Test Set with BERT Embeddings:
Accuracy: 0.8025
Precision: 0.8031
Recall: 0.8025
F1 Score: 0.8021
Classification Report:
                precision    recall  f1-score   support

   Brand Image       0.88      0.91      0.90       100
          Null       0.73      0.79      0.76       100
     Packaging       0.78      0.73      0.76       100
Sustainability       0.81      0.78      0.80       100

      accuracy                           0.80       400
     macro avg       0.80      0.80      0.80       400
  weighted avg       0.80      0.80      0.80       400



### MODELS WITH TF-IDF as INPUTS

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer for trigrams
tfidf_vectorizer_trigrams = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))

# Fit and transform the training data to get embeddings
X_train_tfidf_trigrams = tfidf_vectorizer_trigrams.fit_transform(X_train)

# Transform the testing data
X_test_tfidf_trigrams = tfidf_vectorizer_trigrams.transform(X_test)

# Display the shape of the embeddings to verify
print(f"Training data TF-IDF shape: {X_train_tfidf_trigrams.shape}")
print(f"Testing data TF-IDF shape: {X_test_tfidf_trigrams.shape}")


Training data TF-IDF shape: (1600, 5000)
Testing data TF-IDF shape: (400, 5000)


LR TF-IDF

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np

# Initialize the logistic regression model
log_reg_tfidf_trigrams = LogisticRegression(max_iter=1000, random_state=42)

# Define the K-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define scoring metrics using built-in scorer names
scoring_metrics = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']

# Perform cross-validation
cv_results = {}
for metric in scoring_metrics:
    cv_results[metric] = cross_val_score(log_reg_tfidf_trigrams, X_train_tfidf_trigrams, y_train, cv=kfold, scoring=metric)

# Calculate mean and standard deviation for each metric
print("Logistic Regression Cross-Validation Results with TF-IDF Trigrams:")
for metric in cv_results:
    mean_score = np.mean(cv_results[metric])
    std_score = np.std(cv_results[metric])
    print(f"Cross-Validation {metric.replace('_', ' ').capitalize()}: Mean = {mean_score:.4f}, Std = {std_score:.4f}")

# Train the model on the full training data
log_reg_tfidf_trigrams.fit(X_train_tfidf_trigrams, y_train)

# Predict the labels for the testing set
y_pred_tfidf_trigrams = log_reg_tfidf_trigrams.predict(X_test_tfidf_trigrams)

# Evaluate the model's performance on the test set
print("\nTF-IDF (Trigrams) + Logistic Regression Results on Test Set:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_tfidf_trigrams):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_tfidf_trigrams, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_tfidf_trigrams, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_tfidf_trigrams, average='weighted'):.4f}")
print(f"Classification Report:\n{classification_report(y_test, y_pred_tfidf_trigrams)}")


Logistic Regression Cross-Validation Results with TF-IDF Trigrams:
Cross-Validation Accuracy: Mean = 0.9006, Std = 0.0111
Cross-Validation Precision weighted: Mean = 0.9034, Std = 0.0097
Cross-Validation Recall weighted: Mean = 0.9006, Std = 0.0111
Cross-Validation F1 weighted: Mean = 0.9005, Std = 0.0112

TF-IDF (Trigrams) + Logistic Regression Results on Test Set:
Accuracy: 0.9175
Precision: 0.9177
Recall: 0.9175
F1 Score: 0.9170
Classification Report:
                precision    recall  f1-score   support

   Brand Image       0.96      0.99      0.98       100
          Null       0.88      0.92      0.90       100
     Packaging       0.90      0.92      0.91       100
Sustainability       0.92      0.84      0.88       100

      accuracy                           0.92       400
     macro avg       0.92      0.92      0.92       400
  weighted avg       0.92      0.92      0.92       400



SVM TF-IDF

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np

# Initialize the SVM model
svm_tfidf_trigrams = SVC(kernel='linear', random_state=42)

# Define the K-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define scoring metrics using built-in scorer names
scoring_metrics = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']

# Perform cross-validation
cv_results = {}
for metric in scoring_metrics:
    cv_results[metric] = cross_val_score(svm_tfidf_trigrams, X_train_tfidf_trigrams, y_train, cv=kfold, scoring=metric)

# Calculate mean and standard deviation for each metric
print("SVM Cross-Validation Results with TF-IDF Trigrams:")
for metric in cv_results:
    mean_score = np.mean(cv_results[metric])
    std_score = np.std(cv_results[metric])
    print(f"Cross-Validation {metric.replace('_', ' ').capitalize()}: Mean = {mean_score:.4f}, Std = {std_score:.4f}")

# Train the model on the full training data
svm_tfidf_trigrams.fit(X_train_tfidf_trigrams, y_train)

# Predict the labels for the testing set
y_pred_tfidf_trigrams = svm_tfidf_trigrams.predict(X_test_tfidf_trigrams)

# Evaluate the model's performance on the test set
print("\nTF-IDF (Trigrams) + SVM Results on Test Set:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_tfidf_trigrams):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_tfidf_trigrams, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_tfidf_trigrams, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_tfidf_trigrams, average='weighted'):.4f}")
print(f"Classification Report:\n{classification_report(y_test, y_pred_tfidf_trigrams)}")


SVM Cross-Validation Results with TF-IDF Trigrams:
Cross-Validation Accuracy: Mean = 0.9006, Std = 0.0113
Cross-Validation Precision weighted: Mean = 0.9028, Std = 0.0110
Cross-Validation Recall weighted: Mean = 0.9006, Std = 0.0113
Cross-Validation F1 weighted: Mean = 0.9005, Std = 0.0113

TF-IDF (Trigrams) + SVM Results on Test Set:
Accuracy: 0.9000
Precision: 0.9010
Recall: 0.9000
F1 Score: 0.8993
Classification Report:
                precision    recall  f1-score   support

   Brand Image       0.96      0.98      0.97       100
          Null       0.85      0.93      0.89       100
     Packaging       0.90      0.89      0.89       100
Sustainability       0.90      0.80      0.85       100

      accuracy                           0.90       400
     macro avg       0.90      0.90      0.90       400
  weighted avg       0.90      0.90      0.90       400



RF with TF-IDF

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np

# Initialize the Random Forest model
rf_tfidf_trigrams = RandomForestClassifier(n_estimators=100, random_state=42)

# Define the K-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define scoring metrics using built-in scorer names
scoring_metrics = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']

# Perform cross-validation
cv_results = {}
for metric in scoring_metrics:
    cv_results[metric] = cross_val_score(rf_tfidf_trigrams, X_train_tfidf_trigrams, y_train, cv=kfold, scoring=metric)

# Calculate mean and standard deviation for each metric
print("Random Forest Cross-Validation Results with TF-IDF Trigrams:")
for metric in cv_results:
    mean_score = np.mean(cv_results[metric])
    std_score = np.std(cv_results[metric])
    print(f"Cross-Validation {metric.replace('_', ' ').capitalize()}: Mean = {mean_score:.4f}, Std = {std_score:.4f}")

# Train the model on the full training data
rf_tfidf_trigrams.fit(X_train_tfidf_trigrams, y_train)

# Predict the labels for the testing set
y_pred_tfidf_trigrams = rf_tfidf_trigrams.predict(X_test_tfidf_trigrams)

# Evaluate the model's performance on the test set
print("\nTF-IDF (Trigrams) + Random Forest Results on Test Set:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_tfidf_trigrams):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_tfidf_trigrams, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_tfidf_trigrams, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_tfidf_trigrams, average='weighted'):.4f}")
print(f"Classification Report:\n{classification_report(y_test, y_pred_tfidf_trigrams)}")


Random Forest Cross-Validation Results with TF-IDF Trigrams:
Cross-Validation Accuracy: Mean = 0.8925, Std = 0.0102
Cross-Validation Precision weighted: Mean = 0.8967, Std = 0.0107
Cross-Validation Recall weighted: Mean = 0.8925, Std = 0.0102
Cross-Validation F1 weighted: Mean = 0.8920, Std = 0.0103

TF-IDF (Trigrams) + Random Forest Results on Test Set:
Accuracy: 0.9125
Precision: 0.9150
Recall: 0.9125
F1 Score: 0.9113
Classification Report:
                precision    recall  f1-score   support

   Brand Image       0.93      1.00      0.96       100
          Null       0.89      0.88      0.88       100
     Packaging       0.88      0.96      0.92       100
Sustainability       0.96      0.81      0.88       100

      accuracy                           0.91       400
     macro avg       0.91      0.91      0.91       400
  weighted avg       0.91      0.91      0.91       400



### All models with BoW as input

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Function to vectorize text data using Count Vectorizer
def vectorize_text(train_data, test_data, max_features=5000):
    # Initialize the Count Vectorizer (Bag of Words)
    bow_vectorizer = CountVectorizer(max_features=max_features)

    # Fit and transform the training data
    X_train_bow = bow_vectorizer.fit_transform(train_data)

    # Transform the testing data
    X_test_bow = bow_vectorizer.transform(test_data)

    return X_train_bow, X_test_bow

# Vectorize the training and testing data
X_train_bow, X_test_bow = vectorize_text(X_train, X_test)


LR with BoW as input

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np

# Function to perform K-Fold cross-validation and train logistic regression model
def train_evaluate_logistic_regression(X, y, cv_splits=5):
    # Initialize the logistic regression model
    model = LogisticRegression(max_iter=1000, random_state=42)

    # Define the K-fold cross-validation
    kfold = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)

    # Perform cross-validation
    cv_accuracy = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
    cv_precision = cross_val_score(model, X, y, cv=kfold, scoring='precision_weighted')
    cv_recall = cross_val_score(model, X, y, cv=kfold, scoring='recall_weighted')
    cv_f1 = cross_val_score(model, X, y, cv=kfold, scoring='f1_weighted')

    print("Logistic Regression Cross-Validation Results:")
    print(f"Cross-Validation Accuracy: Mean = {np.mean(cv_accuracy):.4f}, Std = {np.std(cv_accuracy):.4f}")
    print(f"Cross-Validation Precision: Mean = {np.mean(cv_precision):.4f}, Std = {np.std(cv_precision):.4f}")
    print(f"Cross-Validation Recall: Mean = {np.mean(cv_recall):.4f}, Std = {np.std(cv_recall):.4f}")
    print(f"Cross-Validation F1: Mean = {np.mean(cv_f1):.4f}, Std = {np.std(cv_f1):.4f}")

    # Train the model on the full training data
    model.fit(X, y)

    return model

# Train and evaluate the logistic regression model with K-Fold cross-validation
log_reg_bow = train_evaluate_logistic_regression(X_train_bow, y_train)

# Evaluate the model on the test set
y_pred_bow = log_reg_bow.predict(X_test_bow)
print("\nBoW + Logistic Regression Results on Test Set:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_bow):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_bow, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_bow, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_bow, average='weighted'):.4f}")
print(f"Classification Report:\n{classification_report(y_test, y_pred_bow)}\n")


Logistic Regression Cross-Validation Results:
Cross-Validation Accuracy: Mean = 0.8950, Std = 0.0129
Cross-Validation Precision: Mean = 0.8964, Std = 0.0134
Cross-Validation Recall: Mean = 0.8950, Std = 0.0129
Cross-Validation F1: Mean = 0.8946, Std = 0.0130

BoW + Logistic Regression Results on Test Set:
Accuracy: 0.8975
Precision: 0.9005
Recall: 0.8975
F1 Score: 0.8967
Classification Report:
                precision    recall  f1-score   support

   Brand Image       0.96      1.00      0.98       100
          Null       0.82      0.93      0.87       100
     Packaging       0.91      0.87      0.89       100
Sustainability       0.92      0.79      0.85       100

      accuracy                           0.90       400
     macro avg       0.90      0.90      0.90       400
  weighted avg       0.90      0.90      0.90       400




SVC with BoW as input

In [None]:
# Function to perform K-Fold cross-validation and train SVM model
def train_evaluate_svm(X, y, cv_splits=5):
    # Initialize the SVM model with a linear kernel
    model = SVC(kernel='linear', random_state=42)

    # Define the Stratified K-Fold cross-validation
    kfold = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)

    # Perform cross-validation
    cv_accuracy = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
    cv_precision = cross_val_score(model, X, y, cv=kfold, scoring='precision_weighted')
    cv_recall = cross_val_score(model, X, y, cv=kfold, scoring='recall_weighted')
    cv_f1 = cross_val_score(model, X, y, cv=kfold, scoring='f1_weighted')

    print("SVM Cross-Validation Results:")
    print(f"Cross-Validation Accuracy: Mean = {np.mean(cv_accuracy):.4f}, Std = {np.std(cv_accuracy):.4f}")
    print(f"Cross-Validation Precision: Mean = {np.mean(cv_precision):.4f}, Std = {np.std(cv_precision):.4f}")
    print(f"Cross-Validation Recall: Mean = {np.mean(cv_recall):.4f}, Std = {np.std(cv_recall):.4f}")
    print(f"Cross-Validation F1: Mean = {np.mean(cv_f1):.4f}, Std = {np.std(cv_f1):.4f}")

    # Train the model on the full training data
    model.fit(X, y)

    return model

# Train and evaluate the SVM model with K-Fold cross-validation
svm_bow = train_evaluate_svm(X_train_bow, y_train)

# Evaluate the model on the test set
y_pred_svm = svm_bow.predict(X_test_bow)
print("\nBoW + SVM Results on Test Set:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_svm, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_svm, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_svm, average='weighted'):.4f}")
print(f"Classification Report:\n{classification_report(y_test, y_pred_svm)}\n")


SVM Cross-Validation Results:
Cross-Validation Accuracy: Mean = 0.8869, Std = 0.0143
Cross-Validation Precision: Mean = 0.8879, Std = 0.0147
Cross-Validation Recall: Mean = 0.8869, Std = 0.0143
Cross-Validation F1: Mean = 0.8865, Std = 0.0145

BoW + SVM Results on Test Set:
Accuracy: 0.8975
Precision: 0.9018
Recall: 0.8975
F1 Score: 0.8971
Classification Report:
                precision    recall  f1-score   support

   Brand Image       0.93      1.00      0.97       100
          Null       0.81      0.92      0.86       100
     Packaging       0.94      0.85      0.89       100
Sustainability       0.92      0.82      0.87       100

      accuracy                           0.90       400
     macro avg       0.90      0.90      0.90       400
  weighted avg       0.90      0.90      0.90       400




Random Forest with BoW as input

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np

# Function to perform K-Fold cross-validation and train Random Forest model
def train_evaluate_random_forest(X, y, cv_splits=5):
    # Initialize the Random Forest model
    model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Define the Stratified K-Fold cross-validation
    kfold = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)

    # Perform cross-validation
    cv_accuracy = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
    cv_precision = cross_val_score(model, X, y, cv=kfold, scoring='precision_weighted')
    cv_recall = cross_val_score(model, X, y, cv=kfold, scoring='recall_weighted')
    cv_f1 = cross_val_score(model, X, y, cv=kfold, scoring='f1_weighted')

    print("Random Forest Cross-Validation Results:")
    print(f"Cross-Validation Accuracy: Mean = {np.mean(cv_accuracy):.4f}, Std = {np.std(cv_accuracy):.4f}")
    print(f"Cross-Validation Precision: Mean = {np.mean(cv_precision):.4f}, Std = {np.std(cv_precision):.4f}")
    print(f"Cross-Validation Recall: Mean = {np.mean(cv_recall):.4f}, Std = {np.std(cv_recall):.4f}")
    print(f"Cross-Validation F1: Mean = {np.mean(cv_f1):.4f}, Std = {np.std(cv_f1):.4f}")

    # Train the model on the full training data
    model.fit(X, y)

    return model

# Train and evaluate the Random Forest model with K-Fold cross-validation
rf_bow = train_evaluate_random_forest(X_train_bow, y_train)

# Evaluate the model on the test set
y_pred_rf = rf_bow.predict(X_test_bow)
print("\nBoW + Random Forest Results on Test Set:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_rf, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_rf, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_rf, average='weighted'):.4f}")
print(f"Classification Report:\n{classification_report(y_test, y_pred_rf)}\n")


Random Forest Cross-Validation Results:
Cross-Validation Accuracy: Mean = 0.8838, Std = 0.0096
Cross-Validation Precision: Mean = 0.8864, Std = 0.0098
Cross-Validation Recall: Mean = 0.8838, Std = 0.0096
Cross-Validation F1: Mean = 0.8825, Std = 0.0097

BoW + Random Forest Results on Test Set:
Accuracy: 0.9050
Precision: 0.9077
Recall: 0.9050
F1 Score: 0.9039
Classification Report:
                precision    recall  f1-score   support

   Brand Image       0.91      1.00      0.95       100
          Null       0.88      0.87      0.87       100
     Packaging       0.88      0.94      0.91       100
Sustainability       0.96      0.81      0.88       100

      accuracy                           0.91       400
     macro avg       0.91      0.91      0.90       400
  weighted avg       0.91      0.91      0.90       400


