In [None]:
import torch
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint name handed to both `from_pretrained` calls below.
model_name = "vinai/bertweet-base"

# Resolve the tokenizer first, then the sequence-classification model, from
# that same checkpoint name.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSequenceClassification.from_pretrained(model_name),
)

In [None]:
# Load in other necessary packages
import emoji
import pandas as pd
import numpy as np
import re
import string
import nltk
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
import nltk
import sklearn
from sklearn.datasets import make_classification
from emoji import UNICODE_EMOJI

In [None]:
# Read the CSV and cast the `age` column to int in a single chained expression,
# then print the resulting frame.
dataset = pd.read_csv('filepath').astype({'age': int})
print(dataset)

In [None]:
def _encode_clean_text(text):
    """Encode one entry of the `clean` column with the notebook's tokenizer."""
    return tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        padding=True,
    )


# Store the tokenizer output for every row in a new `encoded_text` column.
dataset['encoded_text'] = dataset['clean'].apply(_encode_clean_text)


In [None]:
# Build fixed-length inputs from the per-row `encoded_text` lists.
#
# NOTE: `encoded_text` holds the integer token ids produced by
# `tokenizer.encode`, not embedding vectors. The `padded_embeddings` name is
# kept only because later cells refer to it.
# Padding makes every row the same length so the rows can be stacked into one
# 2-D tensor.

from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence

# One 1-D integer tensor per row of the 'encoded_text' column.
encoded_tensors = [
    torch.tensor(encoded, dtype=torch.long) for encoded in dataset['encoded_text']
]

# Right-pad every sequence to the longest one. Use the tokenizer's own padding
# id: pad_sequence defaults to 0, which is a real token id in the vocabulary
# (presumably `<s>` for a RoBERTa-style vocabulary such as BERTweet's -- verify),
# so padded positions would be indistinguishable from that token.
padded_embeddings = pad_sequence(
    encoded_tensors,
    batch_first=True,
    padding_value=tokenizer.pad_token_id,
)

# Age labels, in the same row order as `padded_embeddings`.
age_labels = torch.tensor(dataset['age'].tolist())

# Pair inputs with labels. This is bound to a new name so the `dataset`
# DataFrame is not overwritten and the cell can be re-run safely.
tensor_dataset = TensorDataset(padded_embeddings, age_labels)

# Shuffled mini-batches over the (token ids, age) pairs.
batch_size = 64  # Adjust as needed
dataloader = DataLoader(tensor_dataset, batch_size=batch_size, shuffle=True)


In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Features: the padded token-id matrix, converted to NumPy for XGBoost.
X = padded_embeddings.numpy()

# Labels: read from the TensorDataset wrapped by `dataloader`, i.e. the very
# tensor that was paired row-by-row with `padded_embeddings`. This keeps X and
# y aligned and avoids relying on a frame that is not defined in this notebook.
# The `- 1` shift is kept from the original code.
# NOTE(review): presumably ages are coded from 1 and this makes them 0-based -- confirm.
y = dataloader.dataset.tensors[1].numpy() - 1

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# XGBoost classifier configuration; change the hyperparameters as needed.
# NOTE: this rebinds `model`, replacing the BERTweet model loaded in the first cell.
xgb_params = dict(
    objective='multi:softmax',            # Multi-class classification
    num_class=len(np.unique(y_train)),    # Number of distinct labels in the training split
    max_depth=6,                          # Maximum depth of each tree
    n_estimators=500,                     # Number of boosting rounds
    learning_rate=0.01,                   # Learning rate (step size shrinkage)
    subsample=0.8,                        # Fraction of samples used for training each tree
    colsample_bytree=0.8,                 # Fraction of features used for training each tree
    random_state=42,                      # Seed for reproducibility
)
model = xgb.XGBClassifier(**xgb_params)


In [None]:
# Fit the classifier on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out split and report the plain accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Overall (weighted-average) precision, recall, and F1 scores
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_score, recall_score, f1_score

# The results are bound to `weighted_*` names so the imported `f1_score`
# function is not overwritten by a number; a later cell may still call it.
# `pos_label` is not passed: it only applies to average='binary' and is
# ignored for a weighted multi-class average.
weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='weighted'
)

print('Precision:', weighted_precision)
print('Recall:', weighted_recall)
print('F1 Score:', weighted_f1)

In [None]:
# Per-class precision, recall, and F1 score.
# The class ids are taken from the labels actually present in y_test / y_pred
# rather than a hard-coded count, so the loop cannot index past the arrays or
# skip classes.
class_ids = np.union1d(y_test, y_pred)

# One call yields all three per-class arrays, ordered like `class_ids`.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, labels=class_ids, average=None
)

for class_label, class_precision, class_recall, class_f1 in zip(class_ids, precision, recall, f1):
    print(f"Class {class_label}:")
    print(f"Precision: {class_precision}")
    print(f"Recall: {class_recall}")
    print(f"F1 Score: {class_f1}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Bar charts of the per-class scores: one stacked panel per metric.
# Tick labels ("Class 0", "Class 1", ...), one per entry in `precision`.
class_labels = [f'Class {i}' for i in range(len(precision))]

# Per-class values for each metric
precision_values = precision
recall_values = recall
f1_values = f1

# (metric name, per-class values, bar colour) for each panel, top to bottom.
panels = [
    ('Precision', precision_values, 'b'),
    ('Recall', recall_values, 'g'),
    ('F1 Score', f1_values, 'r'),
]

# Three rows, one column: a panel for each metric.
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(8, 12))

for ax, (metric_name, metric_values, bar_color) in zip(axes, panels):
    ax.bar(class_labels, metric_values, color=bar_color, alpha=0.7)
    ax.set_ylabel(metric_name)
    ax.set_title(f'{metric_name} for Each Class')

# Keep titles and axis labels of neighbouring panels from overlapping.
plt.tight_layout()

# Render the figure.
plt.show()