<a href="https://colab.research.google.com/github/ChiccoSy/BERT_Based_Multiclass_Text_Classification/blob/main/roBERTa_Experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas
!pip install numpy
!pip install torch
!pip install nltk
!pip install scikit-learn
!pip install transformers
!pip install beautifulsoup4
!pip install contractions



In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted1.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.5804325342178345, Average Training Loss: 1.5804325342178345, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.6142877340316772, Average Training Loss: 1.5973601341247559, Training Accuracy: 0.125
Epoch 1/1, Batch Loss: 1.6518917083740234, Average Training Loss: 1.6155373255411785, Training Accuracy: 0.14583333333333334
Epoch 1/1, Batch Loss: 1.572295904159546, Average Training Loss: 1.6047269701957703, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.667406439781189, Average Training Loss: 1.617262864112854, Training Accuracy: 0.225
Epoch 1/1, Batch Loss: 1.6565629243850708, Average Training Loss: 1.6238128741582234, Training Accuracy: 0.19791666666666666
Epoch 1/1, Batch Loss: 1.596098780632019, Average Training Loss: 1.6198537179401942, Training Accuracy: 0.19642857142857142
Epoch 1/1, Batch Loss: 1.648276925086975, Average Training Loss: 1.6234066188335419, Training Accuracy: 0.1796875
Epoch 1/1, Batch Loss: 1.5919280052185059, Average Training Loss: 1.

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.5939849624060151
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.67      0.38      0.48       132
                Educational Opportunity       0.43      0.57      0.49       138
                         Family Support       0.82      0.79      0.80       133
                      Financial Support       0.61      0.75      0.67       130
                 Program Implementation       0.54      0.49      0.51       132

                               accuracy                           0.59       665
                              macro avg       0.61      0.59      0.59       665
                           weighted avg       0.61      0.59      0.59       665

Test Confusion Matrix:
[[ 50  34   3  14  31]
 [ 16  78   6  24  14]
 [  1   9 105  14   4]
 [  2  17   7  97   7]
 [  6  45   7   9  65]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted2.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6104053258895874, Average Training Loss: 1.6104053258895874, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.595860481262207, Average Training Loss: 1.6031329035758972, Training Accuracy: 0.28125
Epoch 1/1, Batch Loss: 1.616713047027588, Average Training Loss: 1.6076596180597942, Training Accuracy: 0.2708333333333333
Epoch 1/1, Batch Loss: 1.6089473962783813, Average Training Loss: 1.607981562614441, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.6061122417449951, Average Training Loss: 1.6076076984405518, Training Accuracy: 0.2375
Epoch 1/1, Batch Loss: 1.6073731184005737, Average Training Loss: 1.6075686017672222, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.6311417818069458, Average Training Loss: 1.610936198915754, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.6222739219665527, Average Training Loss: 1.6123534142971039, Training Accuracy: 0.2421875
Epoch 1/1, Batch Loss: 1.6038198471069336, Average Training Loss: 1.6114052401648626, Training Accur

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6105263157894737
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.53      0.48      0.50       132
                Educational Opportunity       0.44      0.54      0.49       138
                         Family Support       0.79      0.92      0.85       133
                      Financial Support       0.67      0.68      0.68       130
                 Program Implementation       0.64      0.42      0.51       132

                               accuracy                           0.61       665
                              macro avg       0.61      0.61      0.61       665
                           weighted avg       0.61      0.61      0.60       665

Test Confusion Matrix:
[[ 63  30   8  13  18]
 [ 28  75   4  21  10]
 [  3   4 123   1   2]
 [  7  17  15  89   2]
 [ 18  44   6   8  56]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted3.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6109570264816284, Average Training Loss: 1.6109570264816284, Training Accuracy: 0.09375
Epoch 1/1, Batch Loss: 1.5908795595169067, Average Training Loss: 1.6009182929992676, Training Accuracy: 0.203125
Epoch 1/1, Batch Loss: 1.6104825735092163, Average Training Loss: 1.6041063865025837, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.606114149093628, Average Training Loss: 1.6046083271503448, Training Accuracy: 0.1796875
Epoch 1/1, Batch Loss: 1.6165845394134521, Average Training Loss: 1.6070035696029663, Training Accuracy: 0.18125
Epoch 1/1, Batch Loss: 1.6011797189712524, Average Training Loss: 1.606032927831014, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.5863646268844604, Average Training Loss: 1.603223170552935, Training Accuracy: 0.19196428571428573
Epoch 1/1, Batch Loss: 1.6757478713989258, Average Training Loss: 1.6122887581586838, Training Accuracy: 0.18359375
Epoch 1/1, Batch Loss: 1.6546460390090942, Average Training Loss: 1.616995122697618

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  text = BeautifulSoup(text, 'html.parser').get_text()


Epoch 1/1, Validation Loss: 26.646324515342712, Validation Accuracy: 0.325187969924812
Validation Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       1.00      0.03      0.05       112
                Educational Opportunity       0.30      0.84      0.44       102
                         Family Support       0.33      0.70      0.45       110
                      Financial Support       0.00      0.00      0.00       106
                 Program Implementation       0.88      0.07      0.13       102

                               accuracy                           0.33       532
                              macro avg       0.50      0.33      0.21       532
                           weighted avg       0.50      0.33      0.21       532

Validation Confusion Matrix:
[[ 3 86 23  0  0]
 [ 0 86 15  0  1]
 [ 0 33 77  0  0]
 [ 0 35 71  0  0]
 [ 0 46 49  0  7]]
Test Accuracy: 0.3323308270

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted4.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.5809260606765747, Average Training Loss: 1.5809260606765747, Training Accuracy: 0.34375
Epoch 1/1, Batch Loss: 1.6074559688568115, Average Training Loss: 1.5941910147666931, Training Accuracy: 0.296875
Epoch 1/1, Batch Loss: 1.641166090965271, Average Training Loss: 1.6098493734995525, Training Accuracy: 0.23958333333333334
Epoch 1/1, Batch Loss: 1.6016418933868408, Average Training Loss: 1.6077975034713745, Training Accuracy: 0.2421875
Epoch 1/1, Batch Loss: 1.6161024570465088, Average Training Loss: 1.6094584941864014, Training Accuracy: 0.225
Epoch 1/1, Batch Loss: 1.5912108421325684, Average Training Loss: 1.606417218844096, Training Accuracy: 0.23958333333333334
Epoch 1/1, Batch Loss: 1.575859785079956, Average Training Loss: 1.6020518711635046, Training Accuracy: 0.24107142857142858
Epoch 1/1, Batch Loss: 1.6328277587890625, Average Training Loss: 1.6058988571166992, Training Accuracy: 0.23828125
Epoch 1/1, Batch Loss: 1.5981264114379883, Average Training

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.5263157894736842
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.48      0.61      0.54       132
                Educational Opportunity       0.39      0.41      0.40       138
                         Family Support       0.58      0.90      0.71       133
                      Financial Support       0.67      0.35      0.46       130
                 Program Implementation       0.60      0.36      0.45       132

                               accuracy                           0.53       665
                              macro avg       0.55      0.53      0.51       665
                           weighted avg       0.54      0.53      0.51       665

Test Confusion Matrix:
[[ 81  22  10   5  14]
 [ 48  56  17   9   8]
 [  4   4 120   2   3]
 [ 10  17  51  45   7]
 [ 26  43   9   6  48]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted5.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.5672554969787598, Average Training Loss: 1.5672554969787598, Training Accuracy: 0.125
Epoch 1/1, Batch Loss: 1.6052643060684204, Average Training Loss: 1.58625990152359, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.6280006170272827, Average Training Loss: 1.6001734733581543, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.6083532571792603, Average Training Loss: 1.6022184193134308, Training Accuracy: 0.234375
Epoch 1/1, Batch Loss: 1.623795747756958, Average Training Loss: 1.6065338850021362, Training Accuracy: 0.2125
Epoch 1/1, Batch Loss: 1.6640318632125854, Average Training Loss: 1.6161168813705444, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.6090353727340698, Average Training Loss: 1.6151052372796195, Training Accuracy: 0.17857142857142858
Epoch 1/1, Batch Loss: 1.5695958137512207, Average Training Loss: 1.6094165593385696, Training Accuracy: 0.203125
Epoch 1/1, Batch Loss: 1.5749106407165527, Average Training Loss: 1.6055825683805678, Training 

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6736842105263158
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.74      0.51      0.60       132
                Educational Opportunity       0.45      0.62      0.52       138
                         Family Support       0.94      0.98      0.96       133
                      Financial Support       0.67      0.82      0.74       130
                 Program Implementation       0.67      0.44      0.53       132

                               accuracy                           0.67       665
                              macro avg       0.70      0.67      0.67       665
                           weighted avg       0.69      0.67      0.67       665

Test Confusion Matrix:
[[ 67  35   2  15  13]
 [ 15  86   5  22  10]
 [  0   0 130   3   0]
 [  0  17   1 107   5]
 [  9  53   0  12  58]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted6.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6070674657821655, Average Training Loss: 1.6070674657821655, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.6123425960540771, Average Training Loss: 1.6097050309181213, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.5848644971847534, Average Training Loss: 1.6014248530069988, Training Accuracy: 0.3125
Epoch 1/1, Batch Loss: 1.5938575267791748, Average Training Loss: 1.5995330214500427, Training Accuracy: 0.328125
Epoch 1/1, Batch Loss: 1.569204568862915, Average Training Loss: 1.5934673309326173, Training Accuracy: 0.3375
Epoch 1/1, Batch Loss: 1.590314269065857, Average Training Loss: 1.5929418206214905, Training Accuracy: 0.3333333333333333
Epoch 1/1, Batch Loss: 1.5801208019256592, Average Training Loss: 1.591110246522086, Training Accuracy: 0.33035714285714285
Epoch 1/1, Batch Loss: 1.6058239936828613, Average Training Loss: 1.592949464917183, Training Accuracy: 0.3203125
Epoch 1/1, Batch Loss: 1.5811197757720947, Average Training Loss: 1.59163505501217

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6917293233082706
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.72      0.57      0.64       132
                Educational Opportunity       0.49      0.56      0.52       138
                         Family Support       0.93      0.96      0.94       133
                      Financial Support       0.68      0.81      0.74       130
                 Program Implementation       0.66      0.57      0.61       132

                               accuracy                           0.69       665
                              macro avg       0.70      0.69      0.69       665
                           weighted avg       0.70      0.69      0.69       665

Test Confusion Matrix:
[[ 75  25   3  11  18]
 [ 22  77   5  20  14]
 [  0   2 128   3   0]
 [  3  14   2 105   6]
 [  4  38   0  15  75]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted7.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6084710359573364, Average Training Loss: 1.6084710359573364, Training Accuracy: 0.09375
Epoch 1/1, Batch Loss: 1.5939891338348389, Average Training Loss: 1.6012300848960876, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.6178382635116577, Average Training Loss: 1.606766144434611, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.5987690687179565, Average Training Loss: 1.6047668755054474, Training Accuracy: 0.2109375
Epoch 1/1, Batch Loss: 1.6086698770523071, Average Training Loss: 1.6055474758148194, Training Accuracy: 0.225
Epoch 1/1, Batch Loss: 1.5959422588348389, Average Training Loss: 1.603946606318156, Training Accuracy: 0.234375
Epoch 1/1, Batch Loss: 1.5977205038070679, Average Training Loss: 1.6030571631022863, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.6037483215332031, Average Training Loss: 1.6031435579061508, Training Accuracy: 0.25390625
Epoch 1/1, Batch Loss: 1.5928425788879395, Average Training Loss: 1.601999004681905, Training Acc

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.643609022556391
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.58      0.46      0.51       132
                Educational Opportunity       0.46      0.63      0.53       138
                         Family Support       0.93      0.95      0.94       133
                      Financial Support       0.70      0.73      0.71       130
                 Program Implementation       0.58      0.45      0.51       132

                               accuracy                           0.64       665
                              macro avg       0.65      0.64      0.64       665
                           weighted avg       0.65      0.64      0.64       665

Test Confusion Matrix:
[[ 61  34   2  11  24]
 [ 17  87   5  17  12]
 [  1   4 126   2   0]
 [  9  18   2  95   6]
 [ 17  45   0  11  59]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/roB

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted8.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6003516912460327, Average Training Loss: 1.6003516912460327, Training Accuracy: 0.28125
Epoch 1/1, Batch Loss: 1.6254518032073975, Average Training Loss: 1.612901747226715, Training Accuracy: 0.203125
Epoch 1/1, Batch Loss: 1.6453078985214233, Average Training Loss: 1.6237037976582844, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.627667784690857, Average Training Loss: 1.6246947944164276, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.5920274257659912, Average Training Loss: 1.6181613206863403, Training Accuracy: 0.2125
Epoch 1/1, Batch Loss: 1.6116113662719727, Average Training Loss: 1.617069661617279, Training Accuracy: 0.20833333333333334
Epoch 1/1, Batch Loss: 1.582037091255188, Average Training Loss: 1.612065008708409, Training Accuracy: 0.23214285714285715
Epoch 1/1, Batch Loss: 1.61283540725708, Average Training Loss: 1.6121613085269928, Training Accuracy: 0.234375
Epoch 1/1, Batch Loss: 1.5823105573654175, Average Training Loss: 1.608844558397

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6345864661654136
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.67      0.49      0.57       132
                Educational Opportunity       0.41      0.67      0.51       138
                         Family Support       0.88      0.98      0.93       133
                      Financial Support       0.73      0.65      0.69       130
                 Program Implementation       0.64      0.37      0.47       132

                               accuracy                           0.63       665
                              macro avg       0.67      0.63      0.63       665
                           weighted avg       0.66      0.63      0.63       665

Test Confusion Matrix:
[[ 65  41   3   9  14]
 [ 18  93   5  11  11]
 [  0   1 130   2   0]
 [  2  31   9  85   3]
 [ 12  61   0  10  49]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted9.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.594150424003601, Average Training Loss: 1.594150424003601, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.6313831806182861, Average Training Loss: 1.6127668023109436, Training Accuracy: 0.203125
Epoch 1/1, Batch Loss: 1.649061679840088, Average Training Loss: 1.6248650948206584, Training Accuracy: 0.19791666666666666
Epoch 1/1, Batch Loss: 1.6358848810195923, Average Training Loss: 1.6276200413703918, Training Accuracy: 0.1796875
Epoch 1/1, Batch Loss: 1.6230710744857788, Average Training Loss: 1.6267102479934692, Training Accuracy: 0.16875
Epoch 1/1, Batch Loss: 1.6078752279281616, Average Training Loss: 1.6235710779825847, Training Accuracy: 0.16666666666666666
Epoch 1/1, Batch Loss: 1.6102824211120605, Average Training Loss: 1.6216726984296526, Training Accuracy: 0.16071428571428573
Epoch 1/1, Batch Loss: 1.5816913843154907, Average Training Loss: 1.6166750341653824, Training Accuracy: 0.1796875
Epoch 1/1, Batch Loss: 1.603232502937317, Average Training Lo

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6586466165413534
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.55      0.59      0.57       132
                Educational Opportunity       0.49      0.43      0.46       138
                         Family Support       0.94      0.98      0.96       133
                      Financial Support       0.64      0.84      0.73       130
                 Program Implementation       0.64      0.46      0.54       132

                               accuracy                           0.66       665
                              macro avg       0.65      0.66      0.65       665
                           weighted avg       0.65      0.66      0.65       665

Test Confusion Matrix:
[[ 78  18   2  15  19]
 [ 35  59   5  27  12]
 [  0   0 131   2   0]
 [  7   9   1 109   4]
 [ 21  34   0  16  61]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted10.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6160144805908203, Average Training Loss: 1.6160144805908203, Training Accuracy: 0.125
Epoch 1/1, Batch Loss: 1.6316399574279785, Average Training Loss: 1.6238272190093994, Training Accuracy: 0.15625
Epoch 1/1, Batch Loss: 1.6288142204284668, Average Training Loss: 1.6254895528157551, Training Accuracy: 0.15625
Epoch 1/1, Batch Loss: 1.6114614009857178, Average Training Loss: 1.6219825148582458, Training Accuracy: 0.1640625
Epoch 1/1, Batch Loss: 1.5999025106430054, Average Training Loss: 1.6175665140151978, Training Accuracy: 0.175
Epoch 1/1, Batch Loss: 1.5995910167694092, Average Training Loss: 1.6145705978075664, Training Accuracy: 0.19270833333333334
Epoch 1/1, Batch Loss: 1.5958341360092163, Average Training Loss: 1.611893960407802, Training Accuracy: 0.19642857142857142
Epoch 1/1, Batch Loss: 1.5885621309280396, Average Training Loss: 1.6089774817228317, Training Accuracy: 0.203125
Epoch 1/1, Batch Loss: 1.6046643257141113, Average Training Loss: 1.608498

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6857142857142857
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.69      0.55      0.61       132
                Educational Opportunity       0.49      0.57      0.53       138
                         Family Support       0.94      0.98      0.96       133
                      Financial Support       0.75      0.71      0.73       130
                 Program Implementation       0.59      0.62      0.61       132

                               accuracy                           0.69       665
                              macro avg       0.69      0.69      0.69       665
                           weighted avg       0.69      0.69      0.69       665

Test Confusion Matrix:
[[ 73  23   3   7  26]
 [ 24  78   5  12  19]
 [  0   0 131   2   0]
 [  4  22   1  92  11]
 [  5  36   0   9  82]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=4e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted11.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.577794075012207, Average Training Loss: 1.577794075012207, Training Accuracy: 0.34375
Epoch 1/1, Batch Loss: 1.599288821220398, Average Training Loss: 1.5885414481163025, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.5997827053070068, Average Training Loss: 1.5922885338465373, Training Accuracy: 0.23958333333333334
Epoch 1/1, Batch Loss: 1.6596125364303589, Average Training Loss: 1.6091195344924927, Training Accuracy: 0.2109375
Epoch 1/1, Batch Loss: 1.6302276849746704, Average Training Loss: 1.6133411645889282, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.6556000709533691, Average Training Loss: 1.6203843156496684, Training Accuracy: 0.17708333333333334
Epoch 1/1, Batch Loss: 1.5819849967956543, Average Training Loss: 1.6148986986705236, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.6517183780670166, Average Training Loss: 1.6195011585950851, Training Accuracy: 0.18359375
Epoch 1/1, Batch Loss: 1.6867098808288574, Average Training Loss: 1.6269687

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6721804511278195
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.73      0.48      0.58       132
                Educational Opportunity       0.45      0.67      0.53       138
                         Family Support       0.93      0.98      0.95       133
                      Financial Support       0.72      0.78      0.75       130
                 Program Implementation       0.66      0.45      0.54       132

                               accuracy                           0.67       665
                              macro avg       0.70      0.67      0.67       665
                           weighted avg       0.69      0.67      0.67       665

Test Confusion Matrix:
[[ 64  37   3  11  17]
 [ 17  92   5  17   7]
 [  0   1 130   2   0]
 [  2  18   2 101   7]
 [  5  58   0   9  60]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=4e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted12.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6211448907852173, Average Training Loss: 1.6211448907852173, Training Accuracy: 0.28125
Epoch 1/1, Batch Loss: 1.566463589668274, Average Training Loss: 1.5938042402267456, Training Accuracy: 0.3125
Epoch 1/1, Batch Loss: 1.5809415578842163, Average Training Loss: 1.5895166794459026, Training Accuracy: 0.34375
Epoch 1/1, Batch Loss: 1.58073890209198, Average Training Loss: 1.5873222351074219, Training Accuracy: 0.3203125
Epoch 1/1, Batch Loss: 1.624135136604309, Average Training Loss: 1.5946848154067994, Training Accuracy: 0.2875
Epoch 1/1, Batch Loss: 1.5879201889038086, Average Training Loss: 1.5935573776563008, Training Accuracy: 0.2708333333333333
Epoch 1/1, Batch Loss: 1.677852749824524, Average Training Loss: 1.6055995736803328, Training Accuracy: 0.25892857142857145
Epoch 1/1, Batch Loss: 1.626306176185608, Average Training Loss: 1.6081878989934921, Training Accuracy: 0.25390625
Epoch 1/1, Batch Loss: 1.6539422273635864, Average Training Loss: 1.61327171

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6541353383458647
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.60      0.50      0.55       132
                Educational Opportunity       0.45      0.51      0.48       138
                         Family Support       0.93      0.98      0.95       133
                      Financial Support       0.66      0.81      0.73       130
                 Program Implementation       0.64      0.48      0.55       132

                               accuracy                           0.65       665
                              macro avg       0.66      0.66      0.65       665
                           weighted avg       0.65      0.65      0.65       665

Test Confusion Matrix:
[[ 66  28   3  14  21]
 [ 28  71   5  23  11]
 [  1   0 130   2   0]
 [  6  14   2 105   3]
 [  9  46   0  14  63]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted13.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.626603603363037, Average Training Loss: 1.626603603363037, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.634104609489441, Average Training Loss: 1.630354106426239, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.625759243965149, Average Training Loss: 1.6288224856058757, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.5976922512054443, Average Training Loss: 1.6210399270057678, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.580712914466858, Average Training Loss: 1.6129745244979858, Training Accuracy: 0.175
Epoch 1/1, Batch Loss: 1.6065125465393066, Average Training Loss: 1.6118975281715393, Training Accuracy: 0.20833333333333334
Epoch 1/1, Batch Loss: 1.6158901453018188, Average Training Loss: 1.6124679020472936, Training Accuracy: 0.19642857142857142
Epoch 1/1, Batch Loss: 1.6518419981002808, Average Training Loss: 1.617389664053917, Training Accuracy: 0.1953125
Epoch 1/1, Batch Loss: 1.5532878637313843, Average Training Loss: 1.6102672417958577

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7022556390977444
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.65      0.64      0.65       132
                Educational Opportunity       0.50      0.55      0.53       138
                         Family Support       0.93      0.99      0.96       133
                      Financial Support       0.71      0.83      0.76       130
                 Program Implementation       0.74      0.50      0.60       132

                               accuracy                           0.70       665
                              macro avg       0.71      0.70      0.70       665
                           weighted avg       0.71      0.70      0.70       665

Test Confusion Matrix:
[[ 85  24   3  10  10]
 [ 27  76   6  19  10]
 [  0   0 132   1   0]
 [  6  12   1 108   3]
 [ 12  39   0  15  66]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted14.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6195844411849976, Average Training Loss: 1.6195844411849976, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.612261176109314, Average Training Loss: 1.6159228086471558, Training Accuracy: 0.28125
Epoch 1/1, Batch Loss: 1.5976306200027466, Average Training Loss: 1.6098254124323528, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.619935154914856, Average Training Loss: 1.6123528480529785, Training Accuracy: 0.25
Epoch 1/1, Batch Loss: 1.5577141046524048, Average Training Loss: 1.6014250993728638, Training Accuracy: 0.3
Epoch 1/1, Batch Loss: 1.5985292196273804, Average Training Loss: 1.6009424527486165, Training Accuracy: 0.28125
Epoch 1/1, Batch Loss: 1.6468724012374878, Average Training Loss: 1.6075038739613123, Training Accuracy: 0.25892857142857145
Epoch 1/1, Batch Loss: 1.5557491779327393, Average Training Loss: 1.6010345369577408, Training Accuracy: 0.265625
Epoch 1/1, Batch Loss: 1.566765546798706, Average Training Loss: 1.5972268713845148, Training Accu

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6917293233082706
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.73      0.65      0.69       132
                Educational Opportunity       0.45      0.70      0.55       138
                         Family Support       0.94      0.98      0.96       133
                      Financial Support       0.75      0.75      0.75       130
                 Program Implementation       0.76      0.39      0.51       132

                               accuracy                           0.69       665
                              macro avg       0.73      0.69      0.69       665
                           weighted avg       0.72      0.69      0.69       665

Test Confusion Matrix:
[[ 86  34   3   5   4]
 [ 18  96   4  14   6]
 [  1   1 130   1   0]
 [  7  19   1  97   6]
 [  6  62   0  13  51]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted15.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.5881491899490356, Average Training Loss: 1.5881491899490356, Training Accuracy: 0.28125
Epoch 1/1, Batch Loss: 1.5733705759048462, Average Training Loss: 1.580759882926941, Training Accuracy: 0.265625
Epoch 1/1, Batch Loss: 1.6108579635620117, Average Training Loss: 1.5907925764719646, Training Accuracy: 0.2604166666666667
Epoch 1/1, Batch Loss: 1.6356772184371948, Average Training Loss: 1.602013736963272, Training Accuracy: 0.2421875
Epoch 1/1, Batch Loss: 1.6453213691711426, Average Training Loss: 1.6106752634048462, Training Accuracy: 0.2375
Epoch 1/1, Batch Loss: 1.6423978805541992, Average Training Loss: 1.6159623662630718, Training Accuracy: 0.22916666666666666
Epoch 1/1, Batch Loss: 1.6407169103622437, Average Training Loss: 1.6194987297058105, Training Accuracy: 0.22767857142857142
Epoch 1/1, Batch Loss: 1.617627501487732, Average Training Loss: 1.6192648261785507, Training Accuracy: 0.234375
Epoch 1/1, Batch Loss: 1.6081550121307373, Average Training L

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6421052631578947
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.63      0.52      0.57       132
                Educational Opportunity       0.40      0.58      0.47       138
                         Family Support       0.93      0.96      0.95       133
                      Financial Support       0.70      0.72      0.71       130
                 Program Implementation       0.68      0.42      0.52       132

                               accuracy                           0.64       665
                              macro avg       0.67      0.64      0.64       665
                           weighted avg       0.67      0.64      0.64       665

Test Confusion Matrix:
[[ 69  38   3  10  12]
 [ 25  80   5  15  13]
 [  1   1 128   3   0]
 [  2  32   1  94   1]
 [ 13  50   0  13  56]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted16.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Batch Loss: 1.6249359846115112, Average Training Loss: 1.6249359846115112, Training Accuracy: 0.1875
Epoch 1/1, Batch Loss: 1.589375615119934, Average Training Loss: 1.6071557998657227, Training Accuracy: 0.234375
Epoch 1/1, Batch Loss: 1.619322657585144, Average Training Loss: 1.6112114191055298, Training Accuracy: 0.21875
Epoch 1/1, Batch Loss: 1.6122303009033203, Average Training Loss: 1.6114661395549774, Training Accuracy: 0.2265625
Epoch 1/1, Batch Loss: 1.656581163406372, Average Training Loss: 1.6204891443252563, Training Accuracy: 0.2125
Epoch 1/1, Batch Loss: 1.6808727979660034, Average Training Loss: 1.6305530865987141, Training Accuracy: 0.20833333333333334
Epoch 1/1, Batch Loss: 1.5755586624145508, Average Training Loss: 1.6226967402866908, Training Accuracy: 0.20535714285714285
Epoch 1/1, Batch Loss: 1.6356115341186523, Average Training Loss: 1.624311089515686, Training Accuracy: 0.2109375
Epoch 1/1, Batch Loss: 1.6598323583602905, Average Training Loss: 1.62825

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6721804511278195
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.71      0.55      0.62       132
                Educational Opportunity       0.43      0.62      0.51       138
                         Family Support       0.95      0.98      0.97       133
                      Financial Support       0.72      0.78      0.75       130
                 Program Implementation       0.68      0.43      0.53       132

                               accuracy                           0.67       665
                              macro avg       0.70      0.67      0.67       665
                           weighted avg       0.69      0.67      0.67       665

Test Confusion Matrix:
[[ 72  37   2   8  13]
 [ 18  86   4  18  12]
 [  0   0 131   2   0]
 [  2  24   1 101   2]
 [ 10  54   0  11  57]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted17.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.6570161581039429, Average Training Loss: 1.6570161581039429, Training Accuracy: 0.125
Epoch 1/3, Batch Loss: 1.609397530555725, Average Training Loss: 1.633206844329834, Training Accuracy: 0.09375
Epoch 1/3, Batch Loss: 1.6207283735275269, Average Training Loss: 1.6290473540623982, Training Accuracy: 0.125
Epoch 1/3, Batch Loss: 1.5915488004684448, Average Training Loss: 1.61967271566391, Training Accuracy: 0.171875
Epoch 1/3, Batch Loss: 1.6227022409439087, Average Training Loss: 1.6202786207199096, Training Accuracy: 0.1625
Epoch 1/3, Batch Loss: 1.5877214670181274, Average Training Loss: 1.6148524284362793, Training Accuracy: 0.17708333333333334
Epoch 1/3, Batch Loss: 1.5969524383544922, Average Training Loss: 1.612295286996024, Training Accuracy: 0.1875
Epoch 1/3, Batch Loss: 1.6035126447677612, Average Training Loss: 1.6111974567174911, Training Accuracy: 0.1875
Epoch 1/3, Batch Loss: 1.6039525270462036, Average Training Loss: 1.6103924645317926, Training 

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7368421052631579
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.71      0.76      0.74       132
                Educational Opportunity       0.53      0.59      0.56       138
                         Family Support       0.95      0.98      0.97       133
                      Financial Support       0.75      0.83      0.79       130
                 Program Implementation       0.79      0.52      0.63       132

                               accuracy                           0.74       665
                              macro avg       0.75      0.74      0.74       665
                           weighted avg       0.74      0.74      0.73       665

Test Confusion Matrix:
[[100  17   3   6   6]
 [ 27  82   4  17   8]
 [  1   0 131   1   0]
 [  6  12   0 108   4]
 [  6  45   0  12  69]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted18.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.6282110214233398, Average Training Loss: 1.6282110214233398, Training Accuracy: 0.125
Epoch 1/3, Batch Loss: 1.6193804740905762, Average Training Loss: 1.623795747756958, Training Accuracy: 0.125
Epoch 1/3, Batch Loss: 1.599700927734375, Average Training Loss: 1.6157641410827637, Training Accuracy: 0.16666666666666666
Epoch 1/3, Batch Loss: 1.657067894935608, Average Training Loss: 1.6260900795459747, Training Accuracy: 0.171875
Epoch 1/3, Batch Loss: 1.5206645727157593, Average Training Loss: 1.6050049781799316, Training Accuracy: 0.2125
Epoch 1/3, Batch Loss: 1.5768462419509888, Average Training Loss: 1.600311855475108, Training Accuracy: 0.20833333333333334
Epoch 1/3, Batch Loss: 1.7170653343200684, Average Training Loss: 1.6169909238815308, Training Accuracy: 0.1875
Epoch 1/3, Batch Loss: 1.576645851135254, Average Training Loss: 1.6119477897882462, Training Accuracy: 0.1875
Epoch 1/3, Batch Loss: 1.5854713916778564, Average Training Loss: 1.609005967775980

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7142857142857143
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.67      0.70      0.69       132
                Educational Opportunity       0.51      0.57      0.54       138
                         Family Support       0.95      0.97      0.96       133
                      Financial Support       0.73      0.82      0.77       130
                 Program Implementation       0.74      0.52      0.61       132

                               accuracy                           0.71       665
                              macro avg       0.72      0.72      0.71       665
                           weighted avg       0.72      0.71      0.71       665

Test Confusion Matrix:
[[ 93  19   2   8  10]
 [ 30  78   4  18   8]
 [  0   3 129   1   0]
 [  3  13   1 107   6]
 [ 12  39   0  13  68]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted19.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.642566442489624, Average Training Loss: 1.642566442489624, Training Accuracy: 0.15625
Epoch 1/3, Batch Loss: 1.6079468727111816, Average Training Loss: 1.6252566576004028, Training Accuracy: 0.171875
Epoch 1/3, Batch Loss: 1.5938184261322021, Average Training Loss: 1.6147772471110027, Training Accuracy: 0.1875
Epoch 1/3, Batch Loss: 1.594473123550415, Average Training Loss: 1.6097012162208557, Training Accuracy: 0.1953125
Epoch 1/3, Batch Loss: 1.630443811416626, Average Training Loss: 1.6138497352600099, Training Accuracy: 0.19375
Epoch 1/3, Batch Loss: 1.6368049383163452, Average Training Loss: 1.6176756024360657, Training Accuracy: 0.19270833333333334
Epoch 1/3, Batch Loss: 1.6744343042373657, Average Training Loss: 1.62578398840768, Training Accuracy: 0.20089285714285715
Epoch 1/3, Batch Loss: 1.596729040145874, Average Training Loss: 1.6221521198749542, Training Accuracy: 0.19921875
Epoch 1/3, Batch Loss: 1.6347728967666626, Average Training Loss: 1.623554

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7383458646616541
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.74      0.76      0.75       132
                Educational Opportunity       0.55      0.62      0.58       138
                         Family Support       0.93      0.97      0.95       133
                      Financial Support       0.74      0.78      0.76       130
                 Program Implementation       0.75      0.58      0.65       132

                               accuracy                           0.74       665
                              macro avg       0.74      0.74      0.74       665
                           weighted avg       0.74      0.74      0.74       665

Test Confusion Matrix:
[[100  14   3   6   9]
 [ 25  85   5  15   8]
 [  0   2 129   2   0]
 [  4  16   1 101   8]
 [  7  37   0  12  76]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted20.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.6259440183639526, Average Training Loss: 1.6259440183639526, Training Accuracy: 0.21875
Epoch 1/3, Batch Loss: 1.6020478010177612, Average Training Loss: 1.613995909690857, Training Accuracy: 0.21875
Epoch 1/3, Batch Loss: 1.5916681289672852, Average Training Loss: 1.606553316116333, Training Accuracy: 0.22916666666666666
Epoch 1/3, Batch Loss: 1.6154310703277588, Average Training Loss: 1.6087727546691895, Training Accuracy: 0.234375
Epoch 1/3, Batch Loss: 1.5943890810012817, Average Training Loss: 1.6058960199356078, Training Accuracy: 0.25
Epoch 1/3, Batch Loss: 1.6703968048095703, Average Training Loss: 1.616646150747935, Training Accuracy: 0.22916666666666666
Epoch 1/3, Batch Loss: 1.517544150352478, Average Training Loss: 1.6024887221200126, Training Accuracy: 0.23660714285714285
Epoch 1/3, Batch Loss: 1.5460405349731445, Average Training Loss: 1.595432698726654, Training Accuracy: 0.23828125
Epoch 1/3, Batch Loss: 1.5860705375671387, Average Training Loss

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7293233082706767
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.69      0.77      0.73       132
                Educational Opportunity       0.54      0.61      0.57       138
                         Family Support       0.93      0.95      0.94       133
                      Financial Support       0.74      0.78      0.76       130
                 Program Implementation       0.80      0.53      0.64       132

                               accuracy                           0.73       665
                              macro avg       0.74      0.73      0.73       665
                           weighted avg       0.74      0.73      0.73       665

Test Confusion Matrix:
[[102  15   3   6   6]
 [ 27  84   5  15   7]
 [  0   3 127   3   0]
 [  5  17   2 102   4]
 [ 13  38   0  11  70]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted21.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.5856359004974365, Average Training Loss: 1.5856359004974365, Training Accuracy: 0.1875
Epoch 1/3, Batch Loss: 1.613193392753601, Average Training Loss: 1.5994146466255188, Training Accuracy: 0.25
Epoch 1/3, Batch Loss: 1.618704080581665, Average Training Loss: 1.6058444579442341, Training Accuracy: 0.25
Epoch 1/3, Batch Loss: 1.6243730783462524, Average Training Loss: 1.6104766130447388, Training Accuracy: 0.234375
Epoch 1/3, Batch Loss: 1.6768686771392822, Average Training Loss: 1.6237550258636475, Training Accuracy: 0.225
Epoch 1/3, Batch Loss: 1.613843321800232, Average Training Loss: 1.6221030751864116, Training Accuracy: 0.23958333333333334
Epoch 1/3, Batch Loss: 1.6561936140060425, Average Training Loss: 1.6269731521606445, Training Accuracy: 0.21428571428571427
Epoch 1/3, Batch Loss: 1.6469957828521729, Average Training Loss: 1.6294759809970856, Training Accuracy: 0.21875
Epoch 1/3, Batch Loss: 1.6318553686141968, Average Training Loss: 1.629740357398986

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7172932330827068
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.70      0.67      0.68       132
                Educational Opportunity       0.51      0.64      0.57       138
                         Family Support       0.95      0.97      0.96       133
                      Financial Support       0.70      0.82      0.76       130
                 Program Implementation       0.81      0.49      0.61       132

                               accuracy                           0.72       665
                              macro avg       0.74      0.72      0.72       665
                           weighted avg       0.73      0.72      0.72       665

Test Confusion Matrix:
[[ 88  27   2   7   8]
 [ 22  88   4  20   4]
 [  0   1 129   3   0]
 [  4  15   1 107   3]
 [ 11  40   0  16  65]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted22.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.6094247102737427, Average Training Loss: 1.6094247102737427, Training Accuracy: 0.3125
Epoch 1/3, Batch Loss: 1.6118377447128296, Average Training Loss: 1.6106312274932861, Training Accuracy: 0.25
Epoch 1/3, Batch Loss: 1.550365686416626, Average Training Loss: 1.5905427138010662, Training Accuracy: 0.2708333333333333
Epoch 1/3, Batch Loss: 1.600555181503296, Average Training Loss: 1.5930458307266235, Training Accuracy: 0.25
Epoch 1/3, Batch Loss: 1.6047080755233765, Average Training Loss: 1.5953782796859741, Training Accuracy: 0.25
Epoch 1/3, Batch Loss: 1.5362164974212646, Average Training Loss: 1.585517982641856, Training Accuracy: 0.25
Epoch 1/3, Batch Loss: 1.5206923484802246, Average Training Loss: 1.5762571777616228, Training Accuracy: 0.2767857142857143
Epoch 1/3, Batch Loss: 1.513662338256836, Average Training Loss: 1.5684328228235245, Training Accuracy: 0.2890625
Epoch 1/3, Batch Loss: 1.7029988765716553, Average Training Loss: 1.5833846065733168, Tra

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7233082706766917
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.68      0.73      0.70       132
                Educational Opportunity       0.53      0.57      0.55       138
                         Family Support       0.93      0.97      0.95       133
                      Financial Support       0.74      0.82      0.77       130
                 Program Implementation       0.77      0.54      0.63       132

                               accuracy                           0.72       665
                              macro avg       0.73      0.72      0.72       665
                           weighted avg       0.73      0.72      0.72       665

Test Confusion Matrix:
[[ 96  18   3   7   8]
 [ 29  79   4  18   8]
 [  0   3 129   1   0]
 [  6  11   2 106   5]
 [ 11  38   0  12  71]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted23.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.6308635473251343, Average Training Loss: 1.6308635473251343, Training Accuracy: 0.21875
Epoch 1/3, Batch Loss: 1.597252368927002, Average Training Loss: 1.6140579581260681, Training Accuracy: 0.203125
Epoch 1/3, Batch Loss: 1.5883383750915527, Average Training Loss: 1.6054847637812297, Training Accuracy: 0.19791666666666666
Epoch 1/3, Batch Loss: 1.6354916095733643, Average Training Loss: 1.6129864752292633, Training Accuracy: 0.1953125
Epoch 1/3, Batch Loss: 1.617932677268982, Average Training Loss: 1.613975715637207, Training Accuracy: 0.19375
Epoch 1/3, Batch Loss: 1.6549928188323975, Average Training Loss: 1.620811899503072, Training Accuracy: 0.1875
Epoch 1/3, Batch Loss: 1.6215876340866089, Average Training Loss: 1.6209227187292916, Training Accuracy: 0.18303571428571427
Epoch 1/3, Batch Loss: 1.5781702995300293, Average Training Loss: 1.6155786663293839, Training Accuracy: 0.1796875
Epoch 1/3, Batch Loss: 1.6077533960342407, Average Training Loss: 1.6147

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7263157894736842
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.72      0.74      0.73       132
                Educational Opportunity       0.52      0.59      0.55       138
                         Family Support       0.93      0.98      0.96       133
                      Financial Support       0.73      0.79      0.76       130
                 Program Implementation       0.78      0.53      0.63       132

                               accuracy                           0.73       665
                              macro avg       0.73      0.73      0.73       665
                           weighted avg       0.73      0.73      0.72       665

Test Confusion Matrix:
[[ 98  18   3   6   7]
 [ 28  81   5  17   7]
 [  0   0 131   2   0]
 [  3  16   2 103   6]
 [  8  41   0  13  70]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted24.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.5795568227767944, Average Training Loss: 1.5795568227767944, Training Accuracy: 0.28125
Epoch 1/3, Batch Loss: 1.5969889163970947, Average Training Loss: 1.5882728695869446, Training Accuracy: 0.328125
Epoch 1/3, Batch Loss: 1.6296427249908447, Average Training Loss: 1.6020628213882446, Training Accuracy: 0.2708333333333333
Epoch 1/3, Batch Loss: 1.630944848060608, Average Training Loss: 1.6092833280563354, Training Accuracy: 0.25
Epoch 1/3, Batch Loss: 1.6183459758758545, Average Training Loss: 1.6110958576202392, Training Accuracy: 0.2375
Epoch 1/3, Batch Loss: 1.5901316404342651, Average Training Loss: 1.607601821422577, Training Accuracy: 0.22395833333333334
Epoch 1/3, Batch Loss: 1.6231938600540161, Average Training Loss: 1.6098292555127824, Training Accuracy: 0.20982142857142858
Epoch 1/3, Batch Loss: 1.6269474029541016, Average Training Loss: 1.6119690239429474, Training Accuracy: 0.2109375
Epoch 1/3, Batch Loss: 1.6147394180297852, Average Training Loss

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7172932330827068
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.72      0.61      0.66       132
                Educational Opportunity       0.50      0.65      0.57       138
                         Family Support       0.95      0.97      0.96       133
                      Financial Support       0.75      0.80      0.78       130
                 Program Implementation       0.74      0.55      0.63       132

                               accuracy                           0.72       665
                              macro avg       0.73      0.72      0.72       665
                           weighted avg       0.73      0.72      0.72       665

Test Confusion Matrix:
[[ 81  29   2   7  13]
 [ 22  90   4  14   8]
 [  0   2 129   2   0]
 [  4  17   1 104   4]
 [  6  42   0  11  73]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted25.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.635002851486206, Average Training Loss: 1.635002851486206, Training Accuracy: 0.0625
Epoch 1/3, Batch Loss: 1.6348408460617065, Average Training Loss: 1.6349218487739563, Training Accuracy: 0.09375
Epoch 1/3, Batch Loss: 1.6293243169784546, Average Training Loss: 1.6330560048421223, Training Accuracy: 0.16666666666666666
Epoch 1/3, Batch Loss: 1.6771860122680664, Average Training Loss: 1.6440885066986084, Training Accuracy: 0.171875
Epoch 1/3, Batch Loss: 1.619742751121521, Average Training Loss: 1.6392193555831909, Training Accuracy: 0.175
Epoch 1/3, Batch Loss: 1.5798568725585938, Average Training Loss: 1.6293256084124248, Training Accuracy: 0.19791666666666666
Epoch 1/3, Batch Loss: 1.5723830461502075, Average Training Loss: 1.6211909566606795, Training Accuracy: 0.21428571428571427
Epoch 1/3, Batch Loss: 1.5879602432250977, Average Training Loss: 1.6170371174812317, Training Accuracy: 0.203125
Epoch 1/3, Batch Loss: 1.599117636680603, Average Training Loss:

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7172932330827068
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.67      0.66      0.67       132
                Educational Opportunity       0.51      0.62      0.56       138
                         Family Support       0.96      0.98      0.97       133
                      Financial Support       0.75      0.78      0.77       130
                 Program Implementation       0.74      0.55      0.63       132

                               accuracy                           0.72       665
                              macro avg       0.73      0.72      0.72       665
                           weighted avg       0.73      0.72      0.72       665

Test Confusion Matrix:
[[ 87  24   2   7  12]
 [ 28  85   4  14   7]
 [  0   0 131   2   0]
 [  5  17   0 102   6]
 [  9  40   0  11  72]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted26.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.601636528968811, Average Training Loss: 1.601636528968811, Training Accuracy: 0.125
Epoch 1/3, Batch Loss: 1.5948681831359863, Average Training Loss: 1.5982523560523987, Training Accuracy: 0.1875
Epoch 1/3, Batch Loss: 1.640891432762146, Average Training Loss: 1.6124653816223145, Training Accuracy: 0.16666666666666666
Epoch 1/3, Batch Loss: 1.6161798238754272, Average Training Loss: 1.6133939921855927, Training Accuracy: 0.171875
Epoch 1/3, Batch Loss: 1.672215223312378, Average Training Loss: 1.6251582384109498, Training Accuracy: 0.1625
Epoch 1/3, Batch Loss: 1.6050691604614258, Average Training Loss: 1.6218100587526958, Training Accuracy: 0.16666666666666666
Epoch 1/3, Batch Loss: 1.598861813545227, Average Training Loss: 1.6185317380087716, Training Accuracy: 0.16964285714285715
Epoch 1/3, Batch Loss: 1.6188796758651733, Average Training Loss: 1.6185752302408218, Training Accuracy: 0.15625
Epoch 1/3, Batch Loss: 1.613337755203247, Average Training Loss: 1.6

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7127819548872181
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.65      0.68      0.67       132
                Educational Opportunity       0.53      0.59      0.56       138
                         Family Support       0.94      0.97      0.96       133
                      Financial Support       0.73      0.82      0.77       130
                 Program Implementation       0.76      0.51      0.61       132

                               accuracy                           0.71       665
                              macro avg       0.72      0.71      0.71       665
                           weighted avg       0.72      0.71      0.71       665

Test Confusion Matrix:
[[ 90  21   2   8  11]
 [ 28  82   5  16   7]
 [  1   1 129   2   0]
 [  4  16   1 106   3]
 [ 15  36   0  14  67]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted27.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.6495100259780884, Average Training Loss: 1.6495100259780884, Training Accuracy: 0.09375
Epoch 1/3, Batch Loss: 1.641110897064209, Average Training Loss: 1.6453104615211487, Training Accuracy: 0.15625
Epoch 1/3, Batch Loss: 1.6141377687454224, Average Training Loss: 1.63491956392924, Training Accuracy: 0.13541666666666666
Epoch 1/3, Batch Loss: 1.5771797895431519, Average Training Loss: 1.620484620332718, Training Accuracy: 0.1484375
Epoch 1/3, Batch Loss: 1.5813910961151123, Average Training Loss: 1.6126659154891967, Training Accuracy: 0.16875
Epoch 1/3, Batch Loss: 1.606321930885315, Average Training Loss: 1.611608584721883, Training Accuracy: 0.18229166666666666
Epoch 1/3, Batch Loss: 1.6802923679351807, Average Training Loss: 1.6214205537523543, Training Accuracy: 0.17857142857142858
Epoch 1/3, Batch Loss: 1.6389449834823608, Average Training Loss: 1.623611107468605, Training Accuracy: 0.17578125
Epoch 1/3, Batch Loss: 1.6323614120483398, Average Training Lo

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.6827067669172933
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.67      0.59      0.63       132
                Educational Opportunity       0.45      0.63      0.53       138
                         Family Support       0.93      0.98      0.95       133
                      Financial Support       0.74      0.74      0.74       130
                 Program Implementation       0.73      0.48      0.58       132

                               accuracy                           0.68       665
                              macro avg       0.70      0.68      0.68       665
                           weighted avg       0.70      0.68      0.68       665

Test Confusion Matrix:
[[ 78  36   3   7   8]
 [ 25  87   5  13   8]
 [  0   1 130   2   0]
 [  4  21   2  96   7]
 [ 10  48   0  11  63]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted28.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 1.6180466413497925, Average Training Loss: 1.6180466413497925, Training Accuracy: 0.21875
Epoch 1/3, Batch Loss: 1.590173602104187, Average Training Loss: 1.6041101217269897, Training Accuracy: 0.21875
Epoch 1/3, Batch Loss: 1.6048444509506226, Average Training Loss: 1.6043548981348674, Training Accuracy: 0.20833333333333334
Epoch 1/3, Batch Loss: 1.574686050415039, Average Training Loss: 1.5969376862049103, Training Accuracy: 0.2265625
Epoch 1/3, Batch Loss: 1.5806584358215332, Average Training Loss: 1.5936818361282348, Training Accuracy: 0.24375
Epoch 1/3, Batch Loss: 1.6004023551940918, Average Training Loss: 1.594801922639211, Training Accuracy: 0.22916666666666666
Epoch 1/3, Batch Loss: 1.6051608324050903, Average Training Loss: 1.5962817668914795, Training Accuracy: 0.23214285714285715
Epoch 1/3, Batch Loss: 1.6179901361465454, Average Training Loss: 1.5989953130483627, Training Accuracy: 0.234375
Epoch 1/3, Batch Loss: 1.606629490852356, Average Training L

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7097744360902256
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.68      0.67      0.67       132
                Educational Opportunity       0.51      0.59      0.54       138
                         Family Support       0.94      0.98      0.96       133
                      Financial Support       0.72      0.80      0.76       130
                 Program Implementation       0.73      0.52      0.61       132

                               accuracy                           0.71       665
                              macro avg       0.72      0.71      0.71       665
                           weighted avg       0.72      0.71      0.71       665

Test Confusion Matrix:
[[ 88  21   2  10  11]
 [ 25  81   5  17  10]
 [  0   1 130   2   0]
 [  5  16   1 104   4]
 [ 11  41   0  11  69]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted5e.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6353920698165894, Average Training Loss: 1.6353920698165894, Training Accuracy: 0.125
Epoch 1/5, Batch Loss: 1.6205384731292725, Average Training Loss: 1.627965271472931, Training Accuracy: 0.15625
Epoch 1/5, Batch Loss: 1.574173927307129, Average Training Loss: 1.6100348234176636, Training Accuracy: 0.22916666666666666
Epoch 1/5, Batch Loss: 1.6044992208480835, Average Training Loss: 1.6086509227752686, Training Accuracy: 0.203125
Epoch 1/5, Batch Loss: 1.6366826295852661, Average Training Loss: 1.614257264137268, Training Accuracy: 0.2
Epoch 1/5, Batch Loss: 1.6120905876159668, Average Training Loss: 1.6138961513837178, Training Accuracy: 0.21875
Epoch 1/5, Batch Loss: 1.6367210149765015, Average Training Loss: 1.6171568461826868, Training Accuracy: 0.20535714285714285
Epoch 1/5, Batch Loss: 1.6168023347854614, Average Training Loss: 1.6171125322580338, Training Accuracy: 0.1953125
Epoch 1/5, Batch Loss: 1.6063843965530396, Average Training Loss: 1.6159205171

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.724812030075188
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.71      0.71      0.71       132
                Educational Opportunity       0.52      0.60      0.56       138
                         Family Support       0.94      0.96      0.95       133
                      Financial Support       0.73      0.82      0.77       130
                 Program Implementation       0.79      0.54      0.64       132

                               accuracy                           0.72       665
                              macro avg       0.74      0.73      0.73       665
                           weighted avg       0.74      0.72      0.72       665

Test Confusion Matrix:
[[ 94  20   3   8   7]
 [ 26  83   5  17   7]
 [  0   2 128   3   0]
 [  5  14   0 106   5]
 [  7  42   0  12  71]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/roB

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted30.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.5820727348327637, Average Training Loss: 1.5820727348327637, Training Accuracy: 0.375
Epoch 1/5, Batch Loss: 1.6360141038894653, Average Training Loss: 1.6090434193611145, Training Accuracy: 0.28125
Epoch 1/5, Batch Loss: 1.616735816001892, Average Training Loss: 1.611607551574707, Training Accuracy: 0.22916666666666666
Epoch 1/5, Batch Loss: 1.548797845840454, Average Training Loss: 1.5959051251411438, Training Accuracy: 0.28125
Epoch 1/5, Batch Loss: 1.636478304862976, Average Training Loss: 1.6040197610855103, Training Accuracy: 0.2625
Epoch 1/5, Batch Loss: 1.6099003553390503, Average Training Loss: 1.6049998601277669, Training Accuracy: 0.2708333333333333
Epoch 1/5, Batch Loss: 1.5980877876281738, Average Training Loss: 1.6040124211992537, Training Accuracy: 0.26785714285714285
Epoch 1/5, Batch Loss: 1.6411020755767822, Average Training Loss: 1.6086486279964447, Training Accuracy: 0.25
Epoch 1/5, Batch Loss: 1.6402037143707275, Average Training Loss: 1.612

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.718796992481203
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.70      0.70      0.70       132
                Educational Opportunity       0.51      0.62      0.56       138
                         Family Support       0.94      0.98      0.96       133
                      Financial Support       0.75      0.78      0.76       130
                 Program Implementation       0.75      0.52      0.62       132

                               accuracy                           0.72       665
                              macro avg       0.73      0.72      0.72       665
                           weighted avg       0.73      0.72      0.72       665

Test Confusion Matrix:
[[ 93  22   3   6   8]
 [ 24  85   5  15   9]
 [  0   1 130   2   0]
 [  4  18   1 101   6]
 [ 12  40   0  11  69]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/roB

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted31.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6300774812698364, Average Training Loss: 1.6300774812698364, Training Accuracy: 0.09375
Epoch 1/5, Batch Loss: 1.5924720764160156, Average Training Loss: 1.611274778842926, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.61381196975708, Average Training Loss: 1.612120509147644, Training Accuracy: 0.16666666666666666
Epoch 1/5, Batch Loss: 1.6257092952728271, Average Training Loss: 1.6155177056789398, Training Accuracy: 0.203125
Epoch 1/5, Batch Loss: 1.6255329847335815, Average Training Loss: 1.6175207614898681, Training Accuracy: 0.19375
Epoch 1/5, Batch Loss: 1.598692536354065, Average Training Loss: 1.6143827239672344, Training Accuracy: 0.20833333333333334
Epoch 1/5, Batch Loss: 1.619109869003296, Average Training Loss: 1.6150580304009574, Training Accuracy: 0.21428571428571427
Epoch 1/5, Batch Loss: 1.5918906927108765, Average Training Loss: 1.6121621131896973, Training Accuracy: 0.23046875
Epoch 1/5, Batch Loss: 1.6008870601654053, Average Training Los

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7007518796992481
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.67      0.63      0.65       132
                Educational Opportunity       0.47      0.62      0.54       138
                         Family Support       0.95      0.98      0.96       133
                      Financial Support       0.77      0.75      0.76       130
                 Program Implementation       0.73      0.52      0.61       132

                               accuracy                           0.70       665
                              macro avg       0.72      0.70      0.70       665
                           weighted avg       0.72      0.70      0.70       665

Test Confusion Matrix:
[[ 83  30   2   7  10]
 [ 28  86   4  11   9]
 [  1   1 130   1   0]
 [  3  22   1  98   6]
 [  8  44   0  11  69]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted32.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6043076515197754, Average Training Loss: 1.6043076515197754, Training Accuracy: 0.25
Epoch 1/5, Batch Loss: 1.614378809928894, Average Training Loss: 1.6093432307243347, Training Accuracy: 0.203125
Epoch 1/5, Batch Loss: 1.6344209909439087, Average Training Loss: 1.6177024841308594, Training Accuracy: 0.19791666666666666
Epoch 1/5, Batch Loss: 1.5915584564208984, Average Training Loss: 1.6111664772033691, Training Accuracy: 0.1796875
Epoch 1/5, Batch Loss: 1.6175187826156616, Average Training Loss: 1.6124369382858277, Training Accuracy: 0.19375
Epoch 1/5, Batch Loss: 1.6490650177001953, Average Training Loss: 1.6185416181882222, Training Accuracy: 0.18229166666666666
Epoch 1/5, Batch Loss: 1.6286643743515015, Average Training Loss: 1.6199877262115479, Training Accuracy: 0.16517857142857142
Epoch 1/5, Batch Loss: 1.6060888767242432, Average Training Loss: 1.6182503700256348, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.6286450624465942, Average Training Lo

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.718796992481203
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.68      0.70      0.69       132
                Educational Opportunity       0.52      0.62      0.57       138
                         Family Support       0.93      0.97      0.95       133
                      Financial Support       0.74      0.75      0.75       130
                 Program Implementation       0.78      0.55      0.64       132

                               accuracy                           0.72       665
                              macro avg       0.73      0.72      0.72       665
                           weighted avg       0.73      0.72      0.72       665

Test Confusion Matrix:
[[ 93  22   3   6   8]
 [ 25  86   5  15   7]
 [  0   1 129   3   0]
 [  6  19   2  98   5]
 [ 13  36   0  11  72]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/roB

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted33.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.622367262840271, Average Training Loss: 1.622367262840271, Training Accuracy: 0.125
Epoch 1/5, Batch Loss: 1.6109362840652466, Average Training Loss: 1.6166517734527588, Training Accuracy: 0.15625
Epoch 1/5, Batch Loss: 1.577920913696289, Average Training Loss: 1.6037414868672688, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.569050908088684, Average Training Loss: 1.5950688421726227, Training Accuracy: 0.21875
Epoch 1/5, Batch Loss: 1.626930594444275, Average Training Loss: 1.6014411926269532, Training Accuracy: 0.225
Epoch 1/5, Batch Loss: 1.582282304763794, Average Training Loss: 1.59824804464976, Training Accuracy: 0.2604166666666667
Epoch 1/5, Batch Loss: 1.6283079385757446, Average Training Loss: 1.6025423152106149, Training Accuracy: 0.25
Epoch 1/5, Batch Loss: 1.6717119216918945, Average Training Loss: 1.6111885160207748, Training Accuracy: 0.2265625
Epoch 1/5, Batch Loss: 1.655959129333496, Average Training Loss: 1.6161630286110773, Training Accur

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7398496240601504
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.71      0.73      0.72       132
                Educational Opportunity       0.55      0.57      0.56       138
                         Family Support       0.94      0.98      0.96       133
                      Financial Support       0.75      0.85      0.79       130
                 Program Implementation       0.75      0.58      0.65       132

                               accuracy                           0.74       665
                              macro avg       0.74      0.74      0.74       665
                           weighted avg       0.74      0.74      0.74       665

Test Confusion Matrix:
[[ 97  18   2   6   9]
 [ 26  79   5  18  10]
 [  1   1 130   1   0]
 [  3  10   1 110   6]
 [  9  35   0  12  76]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted34.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.623993158340454, Average Training Loss: 1.623993158340454, Training Accuracy: 0.125
Epoch 1/5, Batch Loss: 1.5864171981811523, Average Training Loss: 1.6052051782608032, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.6388429403305054, Average Training Loss: 1.6164177656173706, Training Accuracy: 0.16666666666666666
Epoch 1/5, Batch Loss: 1.5374290943145752, Average Training Loss: 1.5966705977916718, Training Accuracy: 0.21875
Epoch 1/5, Batch Loss: 1.677470326423645, Average Training Loss: 1.6128305435180663, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.631104588508606, Average Training Loss: 1.6158762176831563, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.5988798141479492, Average Training Loss: 1.6134481600352697, Training Accuracy: 0.19642857142857142
Epoch 1/5, Batch Loss: 1.6069949865341187, Average Training Loss: 1.6126415133476257, Training Accuracy: 0.1796875
Epoch 1/5, Batch Loss: 1.5583237409591675, Average Training Loss: 1.60660620530

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7203007518796992
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.68      0.75      0.71       132
                Educational Opportunity       0.54      0.59      0.56       138
                         Family Support       0.94      0.97      0.96       133
                      Financial Support       0.72      0.75      0.73       130
                 Program Implementation       0.74      0.55      0.63       132

                               accuracy                           0.72       665
                              macro avg       0.73      0.72      0.72       665
                           weighted avg       0.72      0.72      0.72       665

Test Confusion Matrix:
[[ 99  17   3   6   7]
 [ 29  81   3  16   9]
 [  1   1 129   2   0]
 [  6  16   2  97   9]
 [ 10  36   0  13  73]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted35.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6111150979995728, Average Training Loss: 1.6111150979995728, Training Accuracy: 0.15625
Epoch 1/5, Batch Loss: 1.6314647197723389, Average Training Loss: 1.6212899088859558, Training Accuracy: 0.125
Epoch 1/5, Batch Loss: 1.5960980653762817, Average Training Loss: 1.6128926277160645, Training Accuracy: 0.15625
Epoch 1/5, Batch Loss: 1.601826548576355, Average Training Loss: 1.610126107931137, Training Accuracy: 0.140625
Epoch 1/5, Batch Loss: 1.6279802322387695, Average Training Loss: 1.6136969327926636, Training Accuracy: 0.14375
Epoch 1/5, Batch Loss: 1.592655062675476, Average Training Loss: 1.610189954439799, Training Accuracy: 0.15625
Epoch 1/5, Batch Loss: 1.6020276546478271, Average Training Loss: 1.6090239116123743, Training Accuracy: 0.16964285714285715
Epoch 1/5, Batch Loss: 1.5783030986785889, Average Training Loss: 1.6051838099956512, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.5709354877471924, Average Training Loss: 1.6013784408569336, Trai

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7203007518796992
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.70      0.71      0.71       132
                Educational Opportunity       0.51      0.60      0.55       138
                         Family Support       0.95      0.95      0.95       133
                      Financial Support       0.71      0.79      0.75       130
                 Program Implementation       0.81      0.55      0.65       132

                               accuracy                           0.72       665
                              macro avg       0.74      0.72      0.72       665
                           weighted avg       0.73      0.72      0.72       665

Test Confusion Matrix:
[[ 94  18   2  11   7]
 [ 27  83   3  18   7]
 [  0   3 127   3   0]
 [  6  17   1 103   3]
 [  7  42   0  11  72]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted36.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6353713274002075, Average Training Loss: 1.6353713274002075, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.5981881618499756, Average Training Loss: 1.6167797446250916, Training Accuracy: 0.203125
Epoch 1/5, Batch Loss: 1.612347960472107, Average Training Loss: 1.6153024832407634, Training Accuracy: 0.20833333333333334
Epoch 1/5, Batch Loss: 1.6134154796600342, Average Training Loss: 1.614830732345581, Training Accuracy: 0.1953125
Epoch 1/5, Batch Loss: 1.5853869915008545, Average Training Loss: 1.6089419841766357, Training Accuracy: 0.2125
Epoch 1/5, Batch Loss: 1.5939133167266846, Average Training Loss: 1.6064372062683105, Training Accuracy: 0.22395833333333334
Epoch 1/5, Batch Loss: 1.593114972114563, Average Training Loss: 1.6045340299606323, Training Accuracy: 0.22767857142857142
Epoch 1/5, Batch Loss: 1.6274830102920532, Average Training Loss: 1.60740265250206, Training Accuracy: 0.21484375
Epoch 1/5, Batch Loss: 1.5663745403289795, Average Training L

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7308270676691729
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.72      0.73      0.73       132
                Educational Opportunity       0.54      0.61      0.57       138
                         Family Support       0.93      0.97      0.95       133
                      Financial Support       0.73      0.79      0.76       130
                 Program Implementation       0.77      0.55      0.64       132

                               accuracy                           0.73       665
                              macro avg       0.74      0.73      0.73       665
                           weighted avg       0.74      0.73      0.73       665

Test Confusion Matrix:
[[ 97  17   2   8   8]
 [ 25  84   5  16   8]
 [  0   2 129   2   0]
 [  3  16   2 103   6]
 [  9  37   0  13  73]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted37.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.5946651697158813, Average Training Loss: 1.5946651697158813, Training Accuracy: 0.25
Epoch 1/5, Batch Loss: 1.5742368698120117, Average Training Loss: 1.5844510197639465, Training Accuracy: 0.3125
Epoch 1/5, Batch Loss: 1.6204313039779663, Average Training Loss: 1.5964444478352864, Training Accuracy: 0.2708333333333333
Epoch 1/5, Batch Loss: 1.6742076873779297, Average Training Loss: 1.6158852577209473, Training Accuracy: 0.234375
Epoch 1/5, Batch Loss: 1.6886385679244995, Average Training Loss: 1.6304359197616578, Training Accuracy: 0.2125
Epoch 1/5, Batch Loss: 1.5875781774520874, Average Training Loss: 1.6232929627100627, Training Accuracy: 0.20833333333333334
Epoch 1/5, Batch Loss: 1.6409482955932617, Average Training Loss: 1.6258151531219482, Training Accuracy: 0.19642857142857142
Epoch 1/5, Batch Loss: 1.5629857778549194, Average Training Loss: 1.6179614812135696, Training Accuracy: 0.2109375
Epoch 1/5, Batch Loss: 1.6273703575134277, Average Training Los

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7203007518796992
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.68      0.75      0.71       132
                Educational Opportunity       0.53      0.56      0.54       138
                         Family Support       0.95      0.95      0.95       133
                      Financial Support       0.73      0.80      0.76       130
                 Program Implementation       0.73      0.55      0.63       132

                               accuracy                           0.72       665
                              macro avg       0.72      0.72      0.72       665
                           weighted avg       0.72      0.72      0.72       665

Test Confusion Matrix:
[[ 99  14   2   6  11]
 [ 32  77   4  18   7]
 [  2   3 127   1   0]
 [  4  13   1 104   8]
 [  9  38   0  13  72]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted38.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6574302911758423, Average Training Loss: 1.6574302911758423, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.5605179071426392, Average Training Loss: 1.6089740991592407, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.625536561012268, Average Training Loss: 1.6144949197769165, Training Accuracy: 0.14583333333333334
Epoch 1/5, Batch Loss: 1.727829933166504, Average Training Loss: 1.6428286731243134, Training Accuracy: 0.140625
Epoch 1/5, Batch Loss: 1.5947781801223755, Average Training Loss: 1.6332185745239258, Training Accuracy: 0.1625
Epoch 1/5, Batch Loss: 1.6126015186309814, Average Training Loss: 1.6297823985417683, Training Accuracy: 0.16666666666666666
Epoch 1/5, Batch Loss: 1.636072039604187, Average Training Loss: 1.6306809186935425, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.645829677581787, Average Training Loss: 1.632574513554573, Training Accuracy: 0.1796875
Epoch 1/5, Batch Loss: 1.6260509490966797, Average Training Loss: 1.6318496730

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7263157894736842
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.67      0.73      0.70       132
                Educational Opportunity       0.55      0.53      0.54       138
                         Family Support       0.93      0.96      0.95       133
                      Financial Support       0.72      0.82      0.77       130
                 Program Implementation       0.75      0.60      0.67       132

                               accuracy                           0.73       665
                              macro avg       0.73      0.73      0.72       665
                           weighted avg       0.73      0.73      0.72       665

Test Confusion Matrix:
[[ 96  17   3   8   8]
 [ 30  73   4  17  14]
 [  1   2 128   2   0]
 [  5  12   2 107   4]
 [ 11  28   0  14  79]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted39.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6321228742599487, Average Training Loss: 1.6321228742599487, Training Accuracy: 0.125
Epoch 1/5, Batch Loss: 1.596470594406128, Average Training Loss: 1.6142967343330383, Training Accuracy: 0.171875
Epoch 1/5, Batch Loss: 1.6557081937789917, Average Training Loss: 1.6281005541483562, Training Accuracy: 0.15625
Epoch 1/5, Batch Loss: 1.615779995918274, Average Training Loss: 1.6250204145908356, Training Accuracy: 0.171875
Epoch 1/5, Batch Loss: 1.6221915483474731, Average Training Loss: 1.624454641342163, Training Accuracy: 0.16875
Epoch 1/5, Batch Loss: 1.612198829650879, Average Training Loss: 1.6224120060602825, Training Accuracy: 0.18229166666666666
Epoch 1/5, Batch Loss: 1.5728060007095337, Average Training Loss: 1.6153254338673182, Training Accuracy: 0.20089285714285715
Epoch 1/5, Batch Loss: 1.5979640483856201, Average Training Loss: 1.613155260682106, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.6443536281585693, Average Training Loss: 1.6166217459

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7293233082706767
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.71      0.73      0.72       132
                Educational Opportunity       0.54      0.58      0.56       138
                         Family Support       0.95      0.96      0.96       133
                      Financial Support       0.73      0.80      0.76       130
                 Program Implementation       0.74      0.58      0.65       132

                               accuracy                           0.73       665
                              macro avg       0.73      0.73      0.73       665
                           weighted avg       0.73      0.73      0.73       665

Test Confusion Matrix:
[[ 96  17   3   8   8]
 [ 27  80   4  15  12]
 [  1   2 128   2   0]
 [  4  15   0 104   7]
 [  8  34   0  13  77]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted40.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Batch Loss: 1.6395702362060547, Average Training Loss: 1.6395702362060547, Training Accuracy: 0.125
Epoch 1/5, Batch Loss: 1.605627417564392, Average Training Loss: 1.6225988268852234, Training Accuracy: 0.1875
Epoch 1/5, Batch Loss: 1.5896273851394653, Average Training Loss: 1.611608346303304, Training Accuracy: 0.2604166666666667
Epoch 1/5, Batch Loss: 1.6224020719528198, Average Training Loss: 1.614306777715683, Training Accuracy: 0.25
Epoch 1/5, Batch Loss: 1.6375564336776733, Average Training Loss: 1.618956708908081, Training Accuracy: 0.225
Epoch 1/5, Batch Loss: 1.63625967502594, Average Training Loss: 1.6218405365943909, Training Accuracy: 0.21354166666666666
Epoch 1/5, Batch Loss: 1.590860366821289, Average Training Loss: 1.6174147980553764, Training Accuracy: 0.22321428571428573
Epoch 1/5, Batch Loss: 1.652547836303711, Average Training Loss: 1.6218064278364182, Training Accuracy: 0.21484375
Epoch 1/5, Batch Loss: 1.5876390933990479, Average Training Loss: 1.618010

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7308270676691729
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.69      0.73      0.71       132
                Educational Opportunity       0.53      0.63      0.58       138
                         Family Support       0.95      0.96      0.96       133
                      Financial Support       0.75      0.82      0.78       130
                 Program Implementation       0.80      0.52      0.63       132

                               accuracy                           0.73       665
                              macro avg       0.74      0.73      0.73       665
                           weighted avg       0.74      0.73      0.73       665

Test Confusion Matrix:
[[ 97  20   3   6   6]
 [ 27  87   3  15   6]
 [  0   2 128   3   0]
 [  4  14   1 106   5]
 [ 12  41   0  11  68]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 8
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted41.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.5948295593261719, Average Training Loss: 1.5948295593261719, Training Accuracy: 0.25
Epoch 1/8, Batch Loss: 1.6673213243484497, Average Training Loss: 1.6310754418373108, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.6060786247253418, Average Training Loss: 1.6227431694666545, Training Accuracy: 0.25
Epoch 1/8, Batch Loss: 1.642799735069275, Average Training Loss: 1.6277573108673096, Training Accuracy: 0.234375
Epoch 1/8, Batch Loss: 1.6157082319259644, Average Training Loss: 1.6253474950790405, Training Accuracy: 0.2375
Epoch 1/8, Batch Loss: 1.629920244216919, Average Training Loss: 1.6261096199353535, Training Accuracy: 0.21875
Epoch 1/8, Batch Loss: 1.6093757152557373, Average Training Loss: 1.6237190621239799, Training Accuracy: 0.21428571428571427
Epoch 1/8, Batch Loss: 1.6138991117477417, Average Training Loss: 1.62249156832695, Training Accuracy: 0.2265625
Epoch 1/8, Batch Loss: 1.6678998470306396, Average Training Loss: 1.62753693262736, Training 

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7218045112781954
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.70      0.73      0.72       132
                Educational Opportunity       0.51      0.58      0.54       138
                         Family Support       0.94      0.98      0.96       133
                      Financial Support       0.71      0.76      0.74       130
                 Program Implementation       0.80      0.55      0.65       132

                               accuracy                           0.72       665
                              macro avg       0.73      0.72      0.72       665
                           weighted avg       0.73      0.72      0.72       665

Test Confusion Matrix:
[[ 97  19   2   7   7]
 [ 24  80   5  22   7]
 [  0   1 131   1   0]
 [  5  20   2  99   4]
 [ 12  37   0  10  73]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 8
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted42.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.5810890197753906, Average Training Loss: 1.5810890197753906, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.6694945096969604, Average Training Loss: 1.6252917647361755, Training Accuracy: 0.125
Epoch 1/8, Batch Loss: 1.6217432022094727, Average Training Loss: 1.624108910560608, Training Accuracy: 0.16666666666666666
Epoch 1/8, Batch Loss: 1.5623209476470947, Average Training Loss: 1.6086619198322296, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.5663557052612305, Average Training Loss: 1.6002006769180297, Training Accuracy: 0.225
Epoch 1/8, Batch Loss: 1.604193925857544, Average Training Loss: 1.6008662184079487, Training Accuracy: 0.20833333333333334
Epoch 1/8, Batch Loss: 1.6163042783737183, Average Training Loss: 1.603071655545916, Training Accuracy: 0.17857142857142858
Epoch 1/8, Batch Loss: 1.5958360433578491, Average Training Loss: 1.6021672040224075, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.6101233959197998, Average Training Loss: 1.60

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7368421052631579
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.70      0.75      0.72       132
                Educational Opportunity       0.54      0.61      0.57       138
                         Family Support       0.94      0.97      0.96       133
                      Financial Support       0.75      0.80      0.78       130
                 Program Implementation       0.80      0.56      0.66       132

                               accuracy                           0.74       665
                              macro avg       0.75      0.74      0.74       665
                           weighted avg       0.75      0.74      0.74       665

Test Confusion Matrix:
[[ 99  15   2   9   7]
 [ 30  84   5  13   6]
 [  0   1 129   3   0]
 [  5  15   1 104   5]
 [  8  41   0   9  74]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 8
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted43.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.6135728359222412, Average Training Loss: 1.6135728359222412, Training Accuracy: 0.25
Epoch 1/8, Batch Loss: 1.6286824941635132, Average Training Loss: 1.6211276650428772, Training Accuracy: 0.203125
Epoch 1/8, Batch Loss: 1.6154335737228394, Average Training Loss: 1.6192296346028645, Training Accuracy: 0.19791666666666666
Epoch 1/8, Batch Loss: 1.5682213306427002, Average Training Loss: 1.6064775586128235, Training Accuracy: 0.21875
Epoch 1/8, Batch Loss: 1.5936599969863892, Average Training Loss: 1.6039140462875365, Training Accuracy: 0.2
Epoch 1/8, Batch Loss: 1.6188315153121948, Average Training Loss: 1.6064002911249797, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.6376852989196777, Average Training Loss: 1.6108695779527937, Training Accuracy: 0.17857142857142858
Epoch 1/8, Batch Loss: 1.624111294746399, Average Training Loss: 1.6125247925519943, Training Accuracy: 0.171875
Epoch 1/8, Batch Loss: 1.6232856512069702, Average Training Loss: 1.61372044351

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7112781954887218
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.70      0.72      0.71       132
                Educational Opportunity       0.50      0.56      0.53       138
                         Family Support       0.94      0.97      0.96       133
                      Financial Support       0.72      0.78      0.75       130
                 Program Implementation       0.72      0.54      0.62       132

                               accuracy                           0.71       665
                              macro avg       0.72      0.71      0.71       665
                           weighted avg       0.72      0.71      0.71       665

Test Confusion Matrix:
[[ 95  17   2   9   9]
 [ 28  77   5  16  12]
 [  0   1 129   3   0]
 [  4  18   1 101   6]
 [  9  40   0  12  71]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 8
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted44.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.619557499885559, Average Training Loss: 1.619557499885559, Training Accuracy: 0.21875
Epoch 1/8, Batch Loss: 1.5853413343429565, Average Training Loss: 1.6024494171142578, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.6045371294021606, Average Training Loss: 1.6031453212102253, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.6247378587722778, Average Training Loss: 1.6085434556007385, Training Accuracy: 0.15625
Epoch 1/8, Batch Loss: 1.6024596691131592, Average Training Loss: 1.6073266983032226, Training Accuracy: 0.1625
Epoch 1/8, Batch Loss: 1.6114484071731567, Average Training Loss: 1.608013649781545, Training Accuracy: 0.15625
Epoch 1/8, Batch Loss: 1.6332733631134033, Average Training Loss: 1.6116221802575248, Training Accuracy: 0.16517857142857142
Epoch 1/8, Batch Loss: 1.5861552953720093, Average Training Loss: 1.6084388196468353, Training Accuracy: 0.171875
Epoch 1/8, Batch Loss: 1.6026078462600708, Average Training Loss: 1.6077909337149725, Tra

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7383458646616541
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.73      0.72      0.72       132
                Educational Opportunity       0.54      0.60      0.57       138
                         Family Support       0.96      0.97      0.96       133
                      Financial Support       0.73      0.85      0.79       130
                 Program Implementation       0.79      0.56      0.65       132

                               accuracy                           0.74       665
                              macro avg       0.75      0.74      0.74       665
                           weighted avg       0.75      0.74      0.74       665

Test Confusion Matrix:
[[ 95  21   2   7   7]
 [ 24  83   4  18   9]
 [  0   1 129   3   0]
 [  2  14   0 110   4]
 [ 10  36   0  12  74]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 8
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted45.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.6194027662277222, Average Training Loss: 1.6194027662277222, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.59132719039917, Average Training Loss: 1.605364978313446, Training Accuracy: 0.21875
Epoch 1/8, Batch Loss: 1.6088076829910278, Average Training Loss: 1.6065125465393066, Training Accuracy: 0.22916666666666666
Epoch 1/8, Batch Loss: 1.5884920358657837, Average Training Loss: 1.602007418870926, Training Accuracy: 0.234375
Epoch 1/8, Batch Loss: 1.6026240587234497, Average Training Loss: 1.6021307468414308, Training Accuracy: 0.25
Epoch 1/8, Batch Loss: 1.5611572265625, Average Training Loss: 1.5953018267949421, Training Accuracy: 0.25
Epoch 1/8, Batch Loss: 1.6187705993652344, Average Training Loss: 1.5986545085906982, Training Accuracy: 0.23214285714285715
Epoch 1/8, Batch Loss: 1.5844178199768066, Average Training Loss: 1.5968749225139618, Training Accuracy: 0.25
Epoch 1/8, Batch Loss: 1.6250114440917969, Average Training Loss: 1.6000012026892767, Tr

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7353383458646616
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.71      0.74      0.72       132
                Educational Opportunity       0.55      0.60      0.57       138
                         Family Support       0.96      0.98      0.97       133
                      Financial Support       0.76      0.81      0.78       130
                 Program Implementation       0.73      0.55      0.63       132

                               accuracy                           0.74       665
                              macro avg       0.74      0.74      0.74       665
                           weighted avg       0.74      0.74      0.73       665

Test Confusion Matrix:
[[ 98  16   2   5  11]
 [ 24  83   3  18  10]
 [  2   1 130   0   0]
 [  5  14   0 105   6]
 [ 10  38   0  11  73]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 8
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted46.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.6085189580917358, Average Training Loss: 1.6085189580917358, Training Accuracy: 0.125
Epoch 1/8, Batch Loss: 1.664041519165039, Average Training Loss: 1.6362802386283875, Training Accuracy: 0.125
Epoch 1/8, Batch Loss: 1.5546677112579346, Average Training Loss: 1.6090760628382366, Training Accuracy: 0.20833333333333334
Epoch 1/8, Batch Loss: 1.5991826057434082, Average Training Loss: 1.6066026985645294, Training Accuracy: 0.15625
Epoch 1/8, Batch Loss: 1.602702260017395, Average Training Loss: 1.6058226108551026, Training Accuracy: 0.175
Epoch 1/8, Batch Loss: 1.5656778812408447, Average Training Loss: 1.5991318225860596, Training Accuracy: 0.19791666666666666
Epoch 1/8, Batch Loss: 1.5510149002075195, Average Training Loss: 1.5922579765319824, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.6190853118896484, Average Training Loss: 1.5956113934516907, Training Accuracy: 0.1953125
Epoch 1/8, Batch Loss: 1.6095384359359741, Average Training Loss: 1.59715884261

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.724812030075188
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.71      0.71      0.71       132
                Educational Opportunity       0.53      0.59      0.56       138
                         Family Support       0.95      0.97      0.96       133
                      Financial Support       0.72      0.81      0.76       130
                 Program Implementation       0.76      0.55      0.63       132

                               accuracy                           0.72       665
                              macro avg       0.73      0.73      0.72       665
                           weighted avg       0.73      0.72      0.72       665

Test Confusion Matrix:
[[ 94  17   3   8  10]
 [ 26  82   4  19   7]
 [  1   1 129   2   0]
 [  4  15   0 105   6]
 [  7  41   0  12  72]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/roB

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 8
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted47.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.6346349716186523, Average Training Loss: 1.6346349716186523, Training Accuracy: 0.0625
Epoch 1/8, Batch Loss: 1.6191697120666504, Average Training Loss: 1.6269023418426514, Training Accuracy: 0.125
Epoch 1/8, Batch Loss: 1.6150940656661987, Average Training Loss: 1.6229662497838337, Training Accuracy: 0.14583333333333334
Epoch 1/8, Batch Loss: 1.6190412044525146, Average Training Loss: 1.621984988451004, Training Accuracy: 0.15625
Epoch 1/8, Batch Loss: 1.5883229970932007, Average Training Loss: 1.6152525901794434, Training Accuracy: 0.1625
Epoch 1/8, Batch Loss: 1.6203334331512451, Average Training Loss: 1.6160993973414104, Training Accuracy: 0.15625
Epoch 1/8, Batch Loss: 1.612521767616272, Average Training Loss: 1.6155883073806763, Training Accuracy: 0.16071428571428573
Epoch 1/8, Batch Loss: 1.5712432861328125, Average Training Loss: 1.6100451797246933, Training Accuracy: 0.171875
Epoch 1/8, Batch Loss: 1.6052824258804321, Average Training Loss: 1.609515984

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7203007518796992
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.72      0.75      0.73       132
                Educational Opportunity       0.50      0.57      0.54       138
                         Family Support       0.94      0.95      0.95       133
                      Financial Support       0.72      0.79      0.75       130
                 Program Implementation       0.77      0.54      0.63       132

                               accuracy                           0.72       665
                              macro avg       0.73      0.72      0.72       665
                           weighted avg       0.73      0.72      0.72       665

Test Confusion Matrix:
[[ 99  17   2   6   8]
 [ 29  79   5  18   7]
 [  0   3 127   3   0]
 [  5  15   1 103   6]
 [  5  43   0  13  71]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 8
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted48.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.6259713172912598, Average Training Loss: 1.6259713172912598, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.6129624843597412, Average Training Loss: 1.6194669008255005, Training Accuracy: 0.21875
Epoch 1/8, Batch Loss: 1.6033564805984497, Average Training Loss: 1.614096760749817, Training Accuracy: 0.23958333333333334
Epoch 1/8, Batch Loss: 1.6268389225006104, Average Training Loss: 1.6172823011875153, Training Accuracy: 0.2265625
Epoch 1/8, Batch Loss: 1.6499080657958984, Average Training Loss: 1.6238074541091918, Training Accuracy: 0.2125
Epoch 1/8, Batch Loss: 1.6186858415603638, Average Training Loss: 1.6229538520177205, Training Accuracy: 0.203125
Epoch 1/8, Batch Loss: 1.5828932523727417, Average Training Loss: 1.617230909211295, Training Accuracy: 0.20982142857142858
Epoch 1/8, Batch Loss: 1.610110878944397, Average Training Loss: 1.6163409054279327, Training Accuracy: 0.20703125
Epoch 1/8, Batch Loss: 1.6110239028930664, Average Training Loss: 1.615

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7413533834586467
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.69      0.73      0.71       132
                Educational Opportunity       0.57      0.60      0.58       138
                         Family Support       0.96      0.98      0.97       133
                      Financial Support       0.74      0.84      0.79       130
                 Program Implementation       0.77      0.56      0.65       132

                               accuracy                           0.74       665
                              macro avg       0.75      0.74      0.74       665
                           weighted avg       0.74      0.74      0.74       665

Test Confusion Matrix:
[[ 96  17   3   6  10]
 [ 27  83   2  19   7]
 [  1   1 131   0   0]
 [  5  11   0 109   5]
 [ 11  34   0  13  74]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 8
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted49.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.6229732036590576, Average Training Loss: 1.6229732036590576, Training Accuracy: 0.125
Epoch 1/8, Batch Loss: 1.5952569246292114, Average Training Loss: 1.6091150641441345, Training Accuracy: 0.15625
Epoch 1/8, Batch Loss: 1.5701473951339722, Average Training Loss: 1.596125841140747, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.6044578552246094, Average Training Loss: 1.5982088446617126, Training Accuracy: 0.21875
Epoch 1/8, Batch Loss: 1.6086195707321167, Average Training Loss: 1.6002909898757935, Training Accuracy: 0.2
Epoch 1/8, Batch Loss: 1.5738030672073364, Average Training Loss: 1.5958763360977173, Training Accuracy: 0.1875
Epoch 1/8, Batch Loss: 1.5541927814483643, Average Training Loss: 1.589921542576381, Training Accuracy: 0.19642857142857142
Epoch 1/8, Batch Loss: 1.610220193862915, Average Training Loss: 1.5924588739871979, Training Accuracy: 0.1953125
Epoch 1/8, Batch Loss: 1.5862643718719482, Average Training Loss: 1.5917705959743924, Trainin

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7203007518796992
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.71      0.73      0.72       132
                Educational Opportunity       0.53      0.57      0.55       138
                         Family Support       0.95      0.96      0.96       133
                      Financial Support       0.70      0.79      0.74       130
                 Program Implementation       0.74      0.55      0.63       132

                               accuracy                           0.72       665
                              macro avg       0.73      0.72      0.72       665
                           weighted avg       0.72      0.72      0.72       665

Test Confusion Matrix:
[[ 97  18   2   7   8]
 [ 25  78   4  21  10]
 [  1   2 128   2   0]
 [  3  16   1 103   7]
 [ 10  34   0  15  73]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 8
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted50.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  text = BeautifulSoup(text, 'html.parser').get_text()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.6432632207870483, Average Training Loss: 1.6432632207870483, Training Accuracy: 0.25
Epoch 1/8, Batch Loss: 1.622047781944275, Average Training Loss: 1.6326555013656616, Training Accuracy: 0.21875
Epoch 1/8, Batch Loss: 1.5961418151855469, Average Training Loss: 1.6204842726389568, Training Accuracy: 0.25
Epoch 1/8, Batch Loss: 1.5926542282104492, Average Training Loss: 1.6135267615318298, Training Accuracy: 0.234375
Epoch 1/8, Batch Loss: 1.6576085090637207, Average Training Loss: 1.622343111038208, Training Accuracy: 0.2375
Epoch 1/8, Batch Loss: 1.6138781309127808, Average Training Loss: 1.6209322810173035, Training Accuracy: 0.22916666666666666
Epoch 1/8, Batch Loss: 1.5996159315109253, Average Training Loss: 1.617887088230678, Training Accuracy: 0.22321428571428573
Epoch 1/8, Batch Loss: 1.6256799697875977, Average Training Loss: 1.618861198425293, Training Accuracy: 0.234375
Epoch 1/8, Batch Loss: 1.7464454174041748, Average Training Loss: 1.6330372227562

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7533834586466165
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.70      0.77      0.73       132
                Educational Opportunity       0.59      0.59      0.59       138
                         Family Support       0.95      0.98      0.97       133
                      Financial Support       0.76      0.85      0.80       130
                 Program Implementation       0.79      0.58      0.67       132

                               accuracy                           0.75       665
                              macro avg       0.76      0.75      0.75       665
                           weighted avg       0.76      0.75      0.75       665

Test Confusion Matrix:
[[101  14   3   5   9]
 [ 30  82   3  16   7]
 [  1   1 131   0   0]
 [  4  11   1 110   4]
 [  9  32   0  14  77]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 8
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted51.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.5948985815048218, Average Training Loss: 1.5948985815048218, Training Accuracy: 0.28125
Epoch 1/8, Batch Loss: 1.6375396251678467, Average Training Loss: 1.6162191033363342, Training Accuracy: 0.203125
Epoch 1/8, Batch Loss: 1.5906034708023071, Average Training Loss: 1.6076805591583252, Training Accuracy: 0.23958333333333334
Epoch 1/8, Batch Loss: 1.606805682182312, Average Training Loss: 1.607461839914322, Training Accuracy: 0.2578125
Epoch 1/8, Batch Loss: 1.6050140857696533, Average Training Loss: 1.6069722890853881, Training Accuracy: 0.23125
Epoch 1/8, Batch Loss: 1.5972760915756226, Average Training Loss: 1.605356256167094, Training Accuracy: 0.22916666666666666
Epoch 1/8, Batch Loss: 1.6096603870391846, Average Training Loss: 1.605971132005964, Training Accuracy: 0.21428571428571427
Epoch 1/8, Batch Loss: 1.600416660308838, Average Training Loss: 1.6052768230438232, Training Accuracy: 0.21875
Epoch 1/8, Batch Loss: 1.5515071153640747, Average Training Lo

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7278195488721805
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.71      0.77      0.74       132
                Educational Opportunity       0.55      0.57      0.56       138
                         Family Support       0.95      0.96      0.96       133
                      Financial Support       0.68      0.79      0.73       130
                 Program Implementation       0.78      0.56      0.65       132

                               accuracy                           0.73       665
                              macro avg       0.73      0.73      0.73       665
                           weighted avg       0.73      0.73      0.73       665

Test Confusion Matrix:
[[101  13   1   9   8]
 [ 27  78   4  22   7]
 [  0   1 128   4   0]
 [  4  15   2 103   6]
 [ 10  34   0  14  74]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from bs4 import BeautifulSoup
from contractions import contractions_dict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
file_path = "/content/drive/MyDrive/Dissertation_UC/UAQTE_Experience_Multi_Class_TC_Datasets.csv"
df = pd.read_csv(file_path)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Expand contractions
    text = ' '.join([contractions_dict.get(word, word) for word in text.split()])

    # Remove irrelevant characters, symbols, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization and lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

df['Processed_Response'] = df['Responses'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_text(df, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['Processed_Response']:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_inputs, train_masks = tokenize_text(train_df)
val_inputs, val_masks = tokenize_text(val_df)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_df['Encoded_Label'].values)
val_labels = torch.tensor(val_df['Encoded_Label'].values)

# Create DataLoader for training and validation sets
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Fine-tune RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# Add optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 8
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    all_preds_train = []
    all_labels_train = []

    for batch in train_dataloader:
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        logits = outputs.logits
        preds = np.argmax(logits.detach().numpy(), axis=1)

        correct_predictions += np.sum(preds == labels.numpy())
        total_samples += len(labels)

        all_preds_train.extend(preds.tolist())
        all_labels_train.extend(labels.numpy().tolist())

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print average training loss and accuracy for the batch
        avg_train_loss = total_loss / (total_samples / batch_size)
        train_accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Calculate average training loss and accuracy for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')

    # Training classification report
    train_classification_report = classification_report(all_labels_train, all_preds_train, target_names=label_encoder.classes_)
    print('Training Classification Report:')
    print(train_classification_report)

    # Training confusion matrix
    train_conf_matrix = confusion_matrix(all_labels_train, all_preds_train)
    print('Training Confusion Matrix:')
    print(train_conf_matrix)

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()

        preds = np.argmax(logits.detach().numpy(), axis=1)
        predictions.extend(preds)
        true_labels.extend(labels.numpy())

    # Calculate validation accuracy and other metrics
    val_accuracy = accuracy_score(true_labels, predictions)
    val_classification_report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print('Validation Classification Report:')
    print(val_classification_report)

    # Validation confusion matrix
    val_conf_matrix = confusion_matrix(true_labels, predictions)
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

# Load and preprocess the test data
test_file_path = "/content/drive/MyDrive/Dissertation_UC/Testing-Dataset-Experience.csv"
test_df = pd.read_csv(test_file_path)
test_df['Processed_Response'] = test_df['Responses'].apply(preprocess_text)

# Tokenize test data
test_inputs, test_masks = tokenize_text(test_df)

# Make predictions on test data
model.eval()
test_predictions = []

for batch in DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size):
    inputs, masks = batch
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    logits = outputs.logits
    preds = np.argmax(logits.detach().numpy(), axis=1)
    test_predictions.extend(preds)

# Map predictions back to labels
test_df['Predicted Labels'] = label_encoder.inverse_transform(test_predictions)

# Compute test accuracy
test_labels = label_encoder.transform(test_df['True Labels'])
test_accuracy = accuracy_score(test_labels, test_predictions)

# Display test accuracy
print(f'Test Accuracy: {test_accuracy}')

# Calculate other metrics (precision, recall, F1 score, confusion matrix)
test_classification_report = classification_report(test_labels, test_predictions, target_names=label_encoder.classes_)
test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

# Display other metrics
print('Test Classification Report:')
print(test_classification_report)
print('Test Confusion Matrix:')
print(test_confusion_matrix)

# Save predictions to a new CSV file
predicted_csv_path = "/content/drive/MyDrive/Dissertation_UC/roBERTaPredicted52.csv"
test_df.to_csv(predicted_csv_path, index=False)

# Display a message indicating that the file has been saved
print(f'Predicted labels saved to {predicted_csv_path}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Batch Loss: 1.5816115140914917, Average Training Loss: 1.5816115140914917, Training Accuracy: 0.3125
Epoch 1/8, Batch Loss: 1.6121684312820435, Average Training Loss: 1.5968899726867676, Training Accuracy: 0.296875
Epoch 1/8, Batch Loss: 1.5989787578582764, Average Training Loss: 1.5975862344106038, Training Accuracy: 0.23958333333333334
Epoch 1/8, Batch Loss: 1.6863021850585938, Average Training Loss: 1.6197652220726013, Training Accuracy: 0.21875
Epoch 1/8, Batch Loss: 1.5734717845916748, Average Training Loss: 1.610506534576416, Training Accuracy: 0.24375
Epoch 1/8, Batch Loss: 1.6033724546432495, Average Training Loss: 1.6093175212542217, Training Accuracy: 0.234375
Epoch 1/8, Batch Loss: 1.5716906785964966, Average Training Loss: 1.6039422580174036, Training Accuracy: 0.23660714285714285
Epoch 1/8, Batch Loss: 1.5953502655029297, Average Training Loss: 1.6028682589530945, Training Accuracy: 0.23046875
Epoch 1/8, Batch Loss: 1.6009290218353271, Average Training Loss: 1.6

  text = BeautifulSoup(text, 'html.parser').get_text()


Test Accuracy: 0.7172932330827068
Test Classification Report:
                                         precision    recall  f1-score   support

Academic Focus and Personal Development       0.68      0.73      0.70       132
                Educational Opportunity       0.53      0.49      0.51       138
                         Family Support       0.95      0.97      0.96       133
                      Financial Support       0.74      0.82      0.78       130
                 Program Implementation       0.68      0.58      0.63       132

                               accuracy                           0.72       665
                              macro avg       0.71      0.72      0.72       665
                           weighted avg       0.71      0.72      0.71       665

Test Confusion Matrix:
[[ 96  16   2   7  11]
 [ 31  68   5  17  17]
 [  1   2 129   1   0]
 [  5  10   0 107   8]
 [  9  33   0  13  77]]
Predicted labels saved to /content/drive/MyDrive/Dissertation_UC/ro